maxframe 0.1.0b4__cp310-cp310-win_amd64.whl → 1.0.0rc1__cp310-cp310-win_amd64.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (81)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp310-win_amd64.pyd +0 -0
  3. maxframe/codegen.py +56 -3
  4. maxframe/config/config.py +15 -1
  5. maxframe/core/__init__.py +0 -3
  6. maxframe/core/entity/__init__.py +1 -8
  7. maxframe/core/entity/objects.py +3 -45
  8. maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
  9. maxframe/core/graph/core.pyx +4 -4
  10. maxframe/dataframe/__init__.py +1 -0
  11. maxframe/dataframe/core.py +30 -8
  12. maxframe/dataframe/datasource/read_odps_query.py +3 -1
  13. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  14. maxframe/dataframe/datastore/tests/__init__.py +13 -0
  15. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  16. maxframe/dataframe/datastore/to_odps.py +21 -0
  17. maxframe/dataframe/indexing/align.py +1 -1
  18. maxframe/dataframe/misc/__init__.py +4 -0
  19. maxframe/dataframe/misc/apply.py +3 -1
  20. maxframe/dataframe/misc/case_when.py +141 -0
  21. maxframe/dataframe/misc/memory_usage.py +2 -2
  22. maxframe/dataframe/misc/pivot_table.py +262 -0
  23. maxframe/dataframe/misc/tests/test_misc.py +84 -0
  24. maxframe/dataframe/plotting/core.py +2 -2
  25. maxframe/dataframe/reduction/core.py +2 -1
  26. maxframe/dataframe/statistics/corr.py +3 -3
  27. maxframe/dataframe/utils.py +7 -0
  28. maxframe/errors.py +13 -0
  29. maxframe/extension.py +12 -0
  30. maxframe/learn/contrib/utils.py +52 -0
  31. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  32. maxframe/learn/contrib/xgboost/classifier.py +86 -0
  33. maxframe/learn/contrib/xgboost/core.py +156 -0
  34. maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
  35. maxframe/learn/contrib/xgboost/predict.py +138 -0
  36. maxframe/learn/contrib/xgboost/regressor.py +78 -0
  37. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  38. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  39. maxframe/learn/contrib/xgboost/train.py +121 -0
  40. maxframe/learn/utils/__init__.py +15 -0
  41. maxframe/learn/utils/core.py +29 -0
  42. maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
  43. maxframe/lib/mmh3.pyi +43 -0
  44. maxframe/lib/wrapped_pickle.py +2 -1
  45. maxframe/odpsio/arrow.py +2 -3
  46. maxframe/odpsio/tableio.py +22 -0
  47. maxframe/odpsio/tests/test_schema.py +16 -11
  48. maxframe/opcodes.py +3 -0
  49. maxframe/protocol.py +108 -10
  50. maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
  51. maxframe/serialization/core.pxd +3 -0
  52. maxframe/serialization/core.pyi +64 -0
  53. maxframe/serialization/core.pyx +54 -25
  54. maxframe/serialization/exception.py +1 -1
  55. maxframe/serialization/pandas.py +7 -2
  56. maxframe/serialization/serializables/core.py +119 -12
  57. maxframe/serialization/serializables/tests/test_serializable.py +46 -4
  58. maxframe/session.py +28 -0
  59. maxframe/tensor/__init__.py +1 -1
  60. maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
  61. maxframe/tensor/base/__init__.py +2 -0
  62. maxframe/tensor/base/atleast_1d.py +74 -0
  63. maxframe/tensor/base/unique.py +205 -0
  64. maxframe/tensor/datasource/array.py +4 -2
  65. maxframe/tensor/datasource/scalar.py +1 -1
  66. maxframe/tensor/reduction/count_nonzero.py +1 -1
  67. maxframe/tests/test_protocol.py +34 -0
  68. maxframe/tests/test_utils.py +0 -12
  69. maxframe/tests/utils.py +2 -2
  70. maxframe/udf.py +63 -3
  71. maxframe/utils.py +22 -13
  72. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/METADATA +3 -3
  73. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/RECORD +80 -61
  74. maxframe_client/__init__.py +0 -1
  75. maxframe_client/fetcher.py +65 -3
  76. maxframe_client/session/odps.py +74 -5
  77. maxframe_client/session/task.py +65 -71
  78. maxframe_client/tests/test_session.py +64 -1
  79. maxframe_client/clients/spe.py +0 -104
  80. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/WHEEL +0 -0
  81. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/top_level.txt +0 -0
maxframe/__init__.py CHANGED
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from . import dataframe, learn, remote, tensor
16
+ from .config import options
16
17
  from .session import execute, fetch, new_session, stop_server
17
18
 
18
19
 
maxframe/_utils.cp310-win_amd64.pyd CHANGED
Binary file
maxframe/codegen.py CHANGED
@@ -16,6 +16,7 @@ import abc
16
16
  import base64
17
17
  import dataclasses
18
18
  import logging
19
+ from collections import defaultdict
19
20
  from enum import Enum
20
21
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
21
22
 
@@ -32,7 +33,7 @@ from .protocol import DataFrameTableMeta, ResultInfo
32
33
  from .serialization import PickleContainer
33
34
  from .serialization.serializables import Serializable, StringField
34
35
  from .typing_ import PandasObjectTypes
35
- from .udf import MarkedFunction
36
+ from .udf import MarkedFunction, PythonPackOptions
36
37
 
37
38
  if TYPE_CHECKING:
38
39
  from odpsctx import ODPSSessionContext
@@ -75,8 +76,18 @@ class AbstractUDF(Serializable):
75
76
  def unregister(self, odps: "ODPSSessionContext"):
76
77
  raise NotImplementedError
77
78
 
79
+ @abc.abstractmethod
80
+ def collect_pythonpack(self) -> List[PythonPackOptions]:
81
+ raise NotImplementedError
82
+
83
+ @abc.abstractmethod
84
+ def load_pythonpack_resources(self, odps_ctx: "ODPSSessionContext") -> None:
85
+ raise NotImplementedError
86
+
78
87
 
79
88
  class UserCodeMixin:
89
+ __slots__ = ()
90
+
80
91
  @classmethod
81
92
  def obj_to_python_expr(cls, obj: Any = None) -> str:
82
93
  """
@@ -194,8 +205,12 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
194
205
  return self._session_id
195
206
 
196
207
  def register_udf(self, udf: AbstractUDF):
208
+ from maxframe_framedriver.services.session import SessionManager
209
+
197
210
  udf.session_id = self._session_id
198
211
  self._udfs[udf.name] = udf
212
+ if self._session_id and SessionManager.initialized():
213
+ SessionManager.instance().register_udf(self._session_id, udf)
199
214
 
200
215
  def get_udfs(self) -> List[AbstractUDF]:
201
216
  return list(self._udfs.values())
@@ -335,6 +350,8 @@ def register_engine_codegen(type_: Type["BigDagCodeGenerator"]):
335
350
  BUILTIN_ENGINE_SPE = "SPE"
336
351
  BUILTIN_ENGINE_MCSQL = "MCSQL"
337
352
 
353
+ FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
354
+
338
355
 
339
356
  class BigDagCodeGenerator(metaclass=abc.ABCMeta):
340
357
  _context: BigDagCodeContext
@@ -469,14 +486,50 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
469
486
  output_key_to_result_infos=self._context.get_tileable_result_infos(),
470
487
  )
471
488
 
489
+ def run_pythonpacks(
490
+ self,
491
+ odps_ctx: "ODPSSessionContext",
492
+ python_tag: str,
493
+ is_production: bool = False,
494
+ schedule_id: Optional[str] = None,
495
+ hints: Optional[dict] = None,
496
+ priority: Optional[int] = None,
497
+ ) -> Dict[str, PythonPackOptions]:
498
+ key_to_packs = defaultdict(list)
499
+ for udf in self._context.get_udfs():
500
+ for pack in udf.collect_pythonpack():
501
+ key_to_packs[pack.key].append(pack)
502
+ distinct_packs = []
503
+ for packs in key_to_packs.values():
504
+ distinct_packs.append(packs[0])
505
+
506
+ inst_id_to_req = {}
507
+ for pack in distinct_packs:
508
+ inst = odps_ctx.run_pythonpack(
509
+ requirements=pack.requirements,
510
+ prefer_binary=pack.prefer_binary,
511
+ pre_release=pack.pre_release,
512
+ force_rebuild=pack.force_rebuild,
513
+ python_tag=python_tag,
514
+ is_production=is_production,
515
+ schedule_id=schedule_id,
516
+ hints=hints,
517
+ priority=priority,
518
+ )
519
+ # fulfill instance id of pythonpacks with same keys
520
+ for same_pack in key_to_packs[pack.key]:
521
+ same_pack.pack_instance_id = inst.id
522
+ inst_id_to_req[inst.id] = pack
523
+ return inst_id_to_req
524
+
472
525
  def register_udfs(self, odps_ctx: "ODPSSessionContext"):
473
526
  for udf in self._context.get_udfs():
474
- logger.info("[Session %s] Registering UDF %s", self._session_id, udf.name)
527
+ logger.info("[Session=%s] Registering UDF %s", self._session_id, udf.name)
475
528
  udf.register(odps_ctx, True)
476
529
 
477
530
  def unregister_udfs(self, odps_ctx: "ODPSSessionContext"):
478
531
  for udf in self._context.get_udfs():
479
- logger.info("[Session %s] Unregistering UDF %s", self._session_id, udf.name)
532
+ logger.info("[Session=%s] Unregistering UDF %s", self._session_id, udf.name)
480
533
  udf.unregister(odps_ctx)
481
534
 
482
535
  def get_udfs(self) -> List[AbstractUDF]:
maxframe/config/config.py CHANGED
@@ -40,6 +40,8 @@ _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
40
40
  _DEFAULT_UPLOAD_BATCH_SIZE = 4096
41
41
  _DEFAULT_TEMP_LIFECYCLE = 1
42
42
  _DEFAULT_TASK_START_TIMEOUT = 60
43
+ _DEFAULT_TASK_RESTART_TIMEOUT = 300
44
+ _DEFAULT_LOGVIEW_HOURS = 24 * 60
43
45
 
44
46
 
45
47
  class OptionError(Exception):
@@ -296,22 +298,30 @@ class Config:
296
298
 
297
299
 
298
300
  default_options = Config()
299
-
300
301
  default_options.register_option(
301
302
  "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
302
303
  )
303
304
  default_options.register_option(
304
305
  "python_tag", get_python_tag(), validator=is_string, remote=True
305
306
  )
307
+ default_options.register_option(
308
+ "session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
309
+ )
306
310
  default_options.register_option(
307
311
  "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
308
312
  )
313
+ default_options.register_option(
314
+ "client.task_restart_timeout", _DEFAULT_TASK_RESTART_TIMEOUT, validator=is_integer
315
+ )
309
316
  default_options.register_option("sql.enable_mcqa", True, validator=is_bool, remote=True)
310
317
  default_options.register_option(
311
318
  "sql.generate_comments", True, validator=is_bool, remote=True
312
319
  )
313
320
  default_options.register_option("sql.settings", {}, validator=is_dict, remote=True)
314
321
 
322
+ default_options.register_option("is_production", False, validator=is_bool, remote=True)
323
+ default_options.register_option("schedule_id", "", validator=is_string, remote=True)
324
+
315
325
  default_options.register_option(
316
326
  "session.max_alive_seconds",
317
327
  _DEFAULT_MAX_ALIVE_SECONDS,
@@ -376,6 +386,10 @@ default_options.register_option(
376
386
  "spe.task.settings", dict(), validator=is_dict, remote=True
377
387
  )
378
388
 
389
+ default_options.register_option(
390
+ "pythonpack.task.settings", {}, validator=is_dict, remote=True
391
+ )
392
+
379
393
  _options_ctx_var = contextvars.ContextVar("_options_ctx_var")
380
394
 
381
395
 
maxframe/core/__init__.py CHANGED
@@ -19,7 +19,6 @@ from .entity import (
19
19
  CHUNK_TYPE,
20
20
  ENTITY_TYPE,
21
21
  FUSE_CHUNK_TYPE,
22
- OBJECT_CHUNK_TYPE,
23
22
  OBJECT_TYPE,
24
23
  TILEABLE_TYPE,
25
24
  Chunk,
@@ -33,8 +32,6 @@ from .entity import (
33
32
  HasShapeTileableData,
34
33
  NotSupportTile,
35
34
  Object,
36
- ObjectChunk,
37
- ObjectChunkData,
38
35
  ObjectData,
39
36
  OutputType,
40
37
  Tileable,
maxframe/core/entity/__init__.py CHANGED
@@ -16,14 +16,7 @@ from .chunks import CHUNK_TYPE, Chunk, ChunkData
16
16
  from .core import ENTITY_TYPE, Entity, EntityData
17
17
  from .executable import ExecutableTuple, _ExecuteAndFetchMixin
18
18
  from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData
19
- from .objects import (
20
- OBJECT_CHUNK_TYPE,
21
- OBJECT_TYPE,
22
- Object,
23
- ObjectChunk,
24
- ObjectChunkData,
25
- ObjectData,
26
- )
19
+ from .objects import OBJECT_TYPE, Object, ObjectData
27
20
  from .output_types import (
28
21
  OutputType,
29
22
  get_fetch_class,
maxframe/core/entity/objects.py CHANGED
@@ -14,58 +14,17 @@
14
14
 
15
15
  from typing import Any, Dict
16
16
 
17
- from ...serialization.serializables import FieldTypes, ListField
18
- from ...utils import skip_na_call
19
- from .chunks import Chunk, ChunkData
20
17
  from .core import Entity
21
18
  from .executable import _ToObjectMixin
22
19
  from .tileables import TileableData
23
20
 
24
21
 
25
- class ObjectChunkData(ChunkData):
26
- # chunk whose data could be any serializable
27
- __slots__ = ()
28
- type_name = "Object"
29
-
30
- def __init__(self, op=None, index=None, **kw):
31
- super().__init__(_op=op, _index=index, **kw)
32
-
33
- @property
34
- def params(self) -> Dict[str, Any]:
35
- # params return the properties which useful to rebuild a new chunk
36
- return {
37
- "index": self.index,
38
- }
39
-
40
- @params.setter
41
- def params(self, new_params: Dict[str, Any]):
42
- params = new_params.copy()
43
- params.pop("index", None) # index not needed to update
44
- if params: # pragma: no cover
45
- raise TypeError(f"Unknown params: {list(params)}")
46
-
47
- @classmethod
48
- def get_params_from_data(cls, data: Any) -> Dict[str, Any]:
49
- return dict()
50
-
51
-
52
- class ObjectChunk(Chunk):
53
- __slots__ = ()
54
- _allow_data_type_ = (ObjectChunkData,)
55
- type_name = "Object"
56
-
57
-
58
22
  class ObjectData(TileableData, _ToObjectMixin):
59
23
  __slots__ = ()
60
24
  type_name = "Object"
61
-
62
- # optional fields
63
- _chunks = ListField(
64
- "chunks",
65
- FieldTypes.reference(ObjectChunkData),
66
- on_serialize=skip_na_call(lambda x: [it.data for it in x]),
67
- on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
68
- )
25
+ # workaround for removed field since v0.1.0b5
26
+ # todo remove this when all versions below v0.1.0b5 is eliminated
27
+ _legacy_deprecated_non_primitives = ["_chunks"]
69
28
 
70
29
  def __init__(self, op=None, nsplits=None, **kw):
71
30
  super().__init__(_op=op, _nsplits=nsplits, **kw)
@@ -97,4 +56,3 @@ class Object(Entity, _ToObjectMixin):
97
56
 
98
57
 
99
58
  OBJECT_TYPE = (Object, ObjectData)
100
- OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData)
maxframe/core/graph/core.pyx CHANGED
@@ -354,10 +354,10 @@ cdef class DirectedGraph:
354
354
  sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n')
355
355
  visited.add(input_chunk.key)
356
356
  if op.key not in visited:
357
- sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
357
+ sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
358
358
  visited.add(op.key)
359
359
  sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" -> '
360
- f'"{op_name}:{op.key[:trunc_key]}"\n')
360
+ f'"{op_name}:{op.key[:trunc_key]}_{id(op)}"\n')
361
361
 
362
362
  for output_chunk in (op.outputs or []):
363
363
  if output_chunk.key not in visited:
@@ -367,9 +367,9 @@ cdef class DirectedGraph:
367
367
  sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n')
368
368
  visited.add(output_chunk.key)
369
369
  if op.key not in visited:
370
- sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
370
+ sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
371
371
  visited.add(op.key)
372
- sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> '
372
+ sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" -> '
373
373
  f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"')
374
374
  if show_columns:
375
375
  sio.write(f' [ label={get_col_names(output_chunk)} ]')
maxframe/dataframe/__init__.py CHANGED
@@ -46,6 +46,7 @@ from .misc.cut import cut
46
46
  from .misc.eval import maxframe_eval as eval # pylint: disable=redefined-builtin
47
47
  from .misc.get_dummies import get_dummies
48
48
  from .misc.melt import melt
49
+ from .misc.pivot_table import pivot_table
49
50
  from .misc.qcut import qcut
50
51
  from .misc.to_numeric import to_numeric
51
52
  from .missing import isna, isnull, notna, notnull
maxframe/dataframe/core.py CHANGED
@@ -35,6 +35,7 @@ from ..core import (
35
35
  register_output_types,
36
36
  )
37
37
  from ..core.entity.utils import refresh_tileable_shape
38
+ from ..protocol import DataFrameTableMeta
38
39
  from ..serialization.serializables import (
39
40
  AnyField,
40
41
  BoolField,
@@ -59,7 +60,13 @@ from ..utils import (
59
60
  on_serialize_numpy_type,
60
61
  tokenize,
61
62
  )
62
- from .utils import ReprSeries, fetch_corner_data, merge_index_value, parse_index
63
+ from .utils import (
64
+ ReprSeries,
65
+ apply_if_callable,
66
+ fetch_corner_data,
67
+ merge_index_value,
68
+ parse_index,
69
+ )
63
70
 
64
71
 
65
72
  class IndexValue(Serializable):
@@ -616,6 +623,9 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
616
623
  if self._name is None:
617
624
  self._name = self.chunks[0].name
618
625
 
626
+ def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
627
+ pass
628
+
619
629
  def _to_str(self, representation=False):
620
630
  if is_build_mode() or len(self._executed_sessions) == 0:
621
631
  # in build mode, or not executed, just return representation
@@ -945,6 +955,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
945
955
  if self._name is None:
946
956
  self._name = self.chunks[0].name
947
957
 
958
+ def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
959
+ pass
960
+
948
961
  def _to_str(self, representation=False):
949
962
  if is_build_mode() or len(self._executed_sessions) == 0:
950
963
  # in build mode, or not executed, just return representation
@@ -978,7 +991,7 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
978
991
  return self._to_str(representation=False)
979
992
 
980
993
  def __repr__(self):
981
- return self._to_str(representation=False)
994
+ return self._to_str(representation=True)
982
995
 
983
996
  @property
984
997
  def dtype(self):
@@ -1501,6 +1514,15 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
1501
1514
  refresh_index_value(self)
1502
1515
  refresh_dtypes(self)
1503
1516
 
1517
+ def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
1518
+ dtypes = table_meta.pd_column_dtypes
1519
+ self._dtypes = dtypes
1520
+ self._columns_value = parse_index(dtypes.index, store_data=True)
1521
+ self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
1522
+ new_shape = list(self._shape)
1523
+ new_shape[0] = len(dtypes)
1524
+ self._shape = tuple(new_shape)
1525
+
1504
1526
  @property
1505
1527
  def dtypes(self):
1506
1528
  dt = getattr(self, "_dtypes", None)
@@ -1997,12 +2019,6 @@ class DataFrame(HasShapeTileable, _ToPandasMixin):
1997
2019
  Berkeley 25.0 77.0 298.15
1998
2020
  """
1999
2021
 
2000
- def apply_if_callable(maybe_callable, obj, **kwargs):
2001
- if callable(maybe_callable):
2002
- return maybe_callable(obj, **kwargs)
2003
-
2004
- return maybe_callable
2005
-
2006
2022
  data = self.copy()
2007
2023
 
2008
2024
  for k, v in kwargs.items():
@@ -2197,6 +2213,9 @@ class CategoricalData(HasShapeTileableData, _ToPandasMixin):
2197
2213
  pd.Categorical(categories).categories, store_data=True
2198
2214
  )
2199
2215
 
2216
+ def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
2217
+ pass
2218
+
2200
2219
  def _to_str(self, representation=False):
2201
2220
  if is_build_mode() or len(self._executed_sessions) == 0:
2202
2221
  # in build mode, or not executed, just return representation
@@ -2347,6 +2366,9 @@ class DataFrameOrSeriesData(HasShapeTileableData, _ToPandasMixin):
2347
2366
  data_params["name"] = self.chunks[0].name
2348
2367
  self._data_params.update(data_params)
2349
2368
 
2369
+ def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
2370
+ pass
2371
+
2350
2372
  def ensure_data(self):
2351
2373
  from .fetch.core import DataFrameFetch
2352
2374
 
maxframe/dataframe/datasource/read_odps_query.py CHANGED
@@ -216,7 +216,9 @@ class DataFrameReadODPSQuery(
216
216
  index_value = parse_index(pd.RangeIndex(0))
217
217
  elif len(self.index_columns) == 1:
218
218
  index_value = parse_index(
219
- pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
219
+ pd.Index([], name=self.index_columns[0]).astype(
220
+ self.index_dtypes.iloc[0]
221
+ )
220
222
  )
221
223
  else:
222
224
  idx = pd.MultiIndex.from_frame(
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -82,7 +82,9 @@ class DataFrameReadODPSTable(
82
82
  index_value = parse_index(pd.RangeIndex(shape[0]))
83
83
  elif len(self.index_columns) == 1:
84
84
  index_value = parse_index(
85
- pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
85
+ pd.Index([], name=self.index_columns[0]).astype(
86
+ self.index_dtypes.iloc[0]
87
+ )
86
88
  )
87
89
  else:
88
90
  idx = pd.MultiIndex.from_frame(
maxframe/dataframe/datastore/tests/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
maxframe/dataframe/datastore/tests/test_to_odps.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pytest
16
+
17
+ from ... import DataFrame
18
+ from ..to_odps import to_odps_table
19
+
20
+
21
+ @pytest.fixture
22
+ def df():
23
+ return DataFrame({"A": [1, 2], "B": [3, 4]})
24
+
25
+
26
+ @pytest.mark.parametrize(
27
+ "kwargs",
28
+ [
29
+ {"partition_col": ["A", "C"]},
30
+ {"partition_col": "C"},
31
+ {"partition": "a=1,C=2"},
32
+ ],
33
+ )
34
+ def test_to_odps_table_validation(df, kwargs):
35
+ with pytest.raises(ValueError):
36
+ to_odps_table(df, "test_table", **kwargs)
37
+
38
+
39
+ @pytest.mark.parametrize(
40
+ "kwargs",
41
+ [
42
+ {"partition_col": ["a", "B"]},
43
+ {"partition_col": "a"},
44
+ {"partition": "C=1,d=2"},
45
+ ],
46
+ )
47
+ def test_to_odps_table_vaild(df, kwargs):
48
+ to_odps_table(df, "test_table", **kwargs)
maxframe/dataframe/datastore/to_odps.py CHANGED
@@ -18,10 +18,12 @@ import logging
18
18
  from typing import List, Optional, Union
19
19
 
20
20
  from odps.models import Table as ODPSTable
21
+ from odps.types import PartitionSpec
21
22
 
22
23
  from ... import opcodes
23
24
  from ...config import options
24
25
  from ...core import OutputType
26
+ from ...odpsio import build_dataframe_table_meta
25
27
  from ...serialization.serializables import (
26
28
  BoolField,
27
29
  FieldTypes,
@@ -147,6 +149,25 @@ def to_odps_table(
147
149
  f"index_label needs {len(df.index.nlevels)} labels "
148
150
  f"but it only have {len(index_label)}"
149
151
  )
152
+ table_cols = set(build_dataframe_table_meta(df).table_column_names)
153
+ if partition:
154
+ partition_intersect = (
155
+ set(x.lower() for x in PartitionSpec(partition).keys()) & table_cols
156
+ )
157
+ if partition_intersect:
158
+ raise ValueError(
159
+ f"Data column(s) {partition_intersect} in the dataframe"
160
+ " cannot be used in parameter 'partition'."
161
+ " Use 'partition_col' instead."
162
+ )
163
+
164
+ if partition_col:
165
+ partition_diff = set(x.lower() for x in partition_col) - table_cols
166
+ if partition_diff:
167
+ raise ValueError(
168
+ f"Partition column(s) {partition_diff}"
169
+ " is not the data column(s) of the input dataframe."
170
+ )
150
171
 
151
172
  op = DataFrameToODPSTable(
152
173
  dtypes=df.dtypes,
maxframe/dataframe/indexing/align.py CHANGED
@@ -138,7 +138,7 @@ class DataFrameAlign(DataFrameOperator, DataFrameOperatorMixin):
138
138
  series_index = rhs.index_value.to_pandas()
139
139
  dtypes = lhs.dtypes.reindex(
140
140
  lhs.dtypes.index.join(series_index, how=self.join)
141
- ).fillna(np.dtype(np.float_))
141
+ ).fillna(np.dtype(float))
142
142
  l_shape[1] = r_size = len(dtypes)
143
143
  col_val = r_idx_val = parse_index(dtypes.index, store_data=True)
144
144
 
maxframe/dataframe/misc/__init__.py CHANGED
@@ -14,6 +14,7 @@
14
14
 
15
15
  from .apply import df_apply, series_apply
16
16
  from .astype import astype, index_astype
17
+ from .case_when import case_when
17
18
  from .check_monotonic import (
18
19
  check_monotonic,
19
20
  is_monotonic,
@@ -37,6 +38,7 @@ from .map import index_map, series_map
37
38
  from .melt import melt
38
39
  from .memory_usage import df_memory_usage, index_memory_usage, series_memory_usage
39
40
  from .pct_change import pct_change
41
+ from .pivot_table import pivot_table
40
42
  from .qcut import qcut
41
43
  from .select_dtypes import select_dtypes
42
44
  from .shift import shift, tshift
@@ -69,6 +71,7 @@ def _install():
69
71
  setattr(t, "melt", melt)
70
72
  setattr(t, "memory_usage", df_memory_usage)
71
73
  setattr(t, "pct_change", pct_change)
74
+ setattr(t, "pivot_table", pivot_table)
72
75
  setattr(t, "pop", df_pop)
73
76
  setattr(t, "query", df_query)
74
77
  setattr(t, "select_dtypes", select_dtypes)
@@ -81,6 +84,7 @@ def _install():
81
84
  for t in SERIES_TYPE:
82
85
  setattr(t, "apply", series_apply)
83
86
  setattr(t, "astype", astype)
87
+ setattr(t, "case_when", case_when)
84
88
  setattr(t, "check_monotonic", check_monotonic)
85
89
  setattr(t, "describe", describe)
86
90
  setattr(t, "diff", series_diff)
maxframe/dataframe/misc/apply.py CHANGED
@@ -170,6 +170,8 @@ class ApplyOperator(
170
170
  elif self.output_types[0] == OutputType.dataframe:
171
171
  shape = [np.nan, np.nan]
172
172
  shape[1 - self.axis] = df.shape[1 - self.axis]
173
+ if self.axis == 1:
174
+ shape[1] = len(dtypes)
173
175
  shape = tuple(shape)
174
176
  else:
175
177
  shape = (df.shape[1 - self.axis],)
@@ -225,7 +227,7 @@ class ApplyOperator(
225
227
  else: # pragma: no cover
226
228
  index_value = parse_index(infer_series.index)
227
229
  else:
228
- index_value = parse_index(None, series)
230
+ index_value = parse_index(series.index_value)
229
231
 
230
232
  if output_type == OutputType.dataframe:
231
233
  if dtypes is None: