maxframe 0.1.0b4__cp311-cp311-macosx_10_9_x86_64.whl → 1.0.0rc2__cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (112)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cpython-311-darwin.so +0 -0
  3. maxframe/codegen.py +52 -3
  4. maxframe/config/config.py +48 -2
  5. maxframe/config/validators.py +1 -0
  6. maxframe/conftest.py +2 -0
  7. maxframe/core/__init__.py +0 -3
  8. maxframe/core/entity/__init__.py +1 -8
  9. maxframe/core/entity/objects.py +3 -45
  10. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  11. maxframe/core/graph/core.pyx +4 -4
  12. maxframe/dataframe/__init__.py +2 -1
  13. maxframe/dataframe/arithmetic/around.py +5 -17
  14. maxframe/dataframe/arithmetic/core.py +15 -7
  15. maxframe/dataframe/arithmetic/docstring.py +5 -55
  16. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  17. maxframe/dataframe/core.py +34 -12
  18. maxframe/dataframe/datasource/date_range.py +2 -2
  19. maxframe/dataframe/datasource/read_odps_query.py +9 -1
  20. maxframe/dataframe/datasource/read_odps_table.py +5 -2
  21. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  22. maxframe/dataframe/datastore/tests/__init__.py +13 -0
  23. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  24. maxframe/dataframe/datastore/to_odps.py +21 -0
  25. maxframe/dataframe/groupby/cum.py +0 -1
  26. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  27. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  28. maxframe/dataframe/indexing/align.py +1 -1
  29. maxframe/dataframe/indexing/rename.py +3 -37
  30. maxframe/dataframe/indexing/sample.py +0 -1
  31. maxframe/dataframe/indexing/set_index.py +68 -1
  32. maxframe/dataframe/merge/merge.py +236 -2
  33. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  34. maxframe/dataframe/misc/__init__.py +4 -0
  35. maxframe/dataframe/misc/apply.py +6 -11
  36. maxframe/dataframe/misc/case_when.py +141 -0
  37. maxframe/dataframe/misc/describe.py +2 -2
  38. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  39. maxframe/dataframe/misc/eval.py +4 -0
  40. maxframe/dataframe/misc/memory_usage.py +2 -2
  41. maxframe/dataframe/misc/pct_change.py +1 -83
  42. maxframe/dataframe/misc/pivot_table.py +262 -0
  43. maxframe/dataframe/misc/tests/test_misc.py +84 -0
  44. maxframe/dataframe/misc/transform.py +1 -30
  45. maxframe/dataframe/misc/value_counts.py +4 -17
  46. maxframe/dataframe/missing/dropna.py +1 -1
  47. maxframe/dataframe/missing/fillna.py +5 -5
  48. maxframe/dataframe/plotting/core.py +2 -2
  49. maxframe/dataframe/reduction/core.py +2 -1
  50. maxframe/dataframe/sort/sort_values.py +1 -11
  51. maxframe/dataframe/statistics/corr.py +3 -3
  52. maxframe/dataframe/statistics/quantile.py +5 -17
  53. maxframe/dataframe/utils.py +11 -7
  54. maxframe/errors.py +13 -0
  55. maxframe/extension.py +12 -0
  56. maxframe/learn/contrib/utils.py +52 -0
  57. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  58. maxframe/learn/contrib/xgboost/classifier.py +86 -0
  59. maxframe/learn/contrib/xgboost/core.py +156 -0
  60. maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
  61. maxframe/learn/contrib/xgboost/predict.py +138 -0
  62. maxframe/learn/contrib/xgboost/regressor.py +78 -0
  63. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  64. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  65. maxframe/learn/contrib/xgboost/train.py +121 -0
  66. maxframe/learn/utils/__init__.py +15 -0
  67. maxframe/learn/utils/core.py +29 -0
  68. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  69. maxframe/lib/mmh3.pyi +43 -0
  70. maxframe/lib/wrapped_pickle.py +2 -1
  71. maxframe/odpsio/__init__.py +1 -1
  72. maxframe/odpsio/arrow.py +10 -7
  73. maxframe/odpsio/schema.py +10 -7
  74. maxframe/odpsio/tableio.py +410 -14
  75. maxframe/odpsio/tests/test_schema.py +32 -26
  76. maxframe/odpsio/tests/test_tableio.py +48 -21
  77. maxframe/opcodes.py +3 -0
  78. maxframe/protocol.py +148 -12
  79. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  80. maxframe/serialization/core.pxd +3 -0
  81. maxframe/serialization/core.pyi +64 -0
  82. maxframe/serialization/core.pyx +54 -25
  83. maxframe/serialization/exception.py +1 -1
  84. maxframe/serialization/pandas.py +7 -2
  85. maxframe/serialization/serializables/core.py +158 -12
  86. maxframe/serialization/serializables/tests/test_serializable.py +46 -4
  87. maxframe/session.py +28 -0
  88. maxframe/tensor/__init__.py +60 -1
  89. maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
  90. maxframe/tensor/base/__init__.py +2 -0
  91. maxframe/tensor/base/atleast_1d.py +74 -0
  92. maxframe/tensor/base/unique.py +205 -0
  93. maxframe/tensor/datasource/array.py +4 -2
  94. maxframe/tensor/datasource/scalar.py +1 -1
  95. maxframe/tensor/reduction/count_nonzero.py +1 -1
  96. maxframe/tensor/statistics/quantile.py +2 -2
  97. maxframe/tests/test_protocol.py +34 -0
  98. maxframe/tests/test_utils.py +0 -12
  99. maxframe/tests/utils.py +11 -2
  100. maxframe/udf.py +63 -3
  101. maxframe/utils.py +30 -13
  102. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +76 -3
  103. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +111 -92
  104. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
  105. maxframe_client/__init__.py +0 -1
  106. maxframe_client/fetcher.py +86 -13
  107. maxframe_client/session/odps.py +79 -10
  108. maxframe_client/session/task.py +65 -71
  109. maxframe_client/tests/test_fetcher.py +21 -3
  110. maxframe_client/tests/test_session.py +76 -2
  111. maxframe_client/clients/spe.py +0 -104
  112. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/__init__.py CHANGED
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from . import dataframe, learn, remote, tensor
16
+ from .config import options
16
17
  from .session import execute, fetch, new_session, stop_server
17
18
 
18
19
 
Binary file
maxframe/codegen.py CHANGED
@@ -16,6 +16,7 @@ import abc
16
16
  import base64
17
17
  import dataclasses
18
18
  import logging
19
+ from collections import defaultdict
19
20
  from enum import Enum
20
21
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
21
22
 
@@ -32,7 +33,7 @@ from .protocol import DataFrameTableMeta, ResultInfo
32
33
  from .serialization import PickleContainer
33
34
  from .serialization.serializables import Serializable, StringField
34
35
  from .typing_ import PandasObjectTypes
35
- from .udf import MarkedFunction
36
+ from .udf import MarkedFunction, PythonPackOptions
36
37
 
37
38
  if TYPE_CHECKING:
38
39
  from odpsctx import ODPSSessionContext
@@ -75,8 +76,18 @@ class AbstractUDF(Serializable):
75
76
  def unregister(self, odps: "ODPSSessionContext"):
76
77
  raise NotImplementedError
77
78
 
79
+ @abc.abstractmethod
80
+ def collect_pythonpack(self) -> List[PythonPackOptions]:
81
+ raise NotImplementedError
82
+
83
+ @abc.abstractmethod
84
+ def load_pythonpack_resources(self, odps_ctx: "ODPSSessionContext") -> None:
85
+ raise NotImplementedError
86
+
78
87
 
79
88
  class UserCodeMixin:
89
+ __slots__ = ()
90
+
80
91
  @classmethod
81
92
  def obj_to_python_expr(cls, obj: Any = None) -> str:
82
93
  """
@@ -335,6 +346,8 @@ def register_engine_codegen(type_: Type["BigDagCodeGenerator"]):
335
346
  BUILTIN_ENGINE_SPE = "SPE"
336
347
  BUILTIN_ENGINE_MCSQL = "MCSQL"
337
348
 
349
+ FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
350
+
338
351
 
339
352
  class BigDagCodeGenerator(metaclass=abc.ABCMeta):
340
353
  _context: BigDagCodeContext
@@ -469,14 +482,50 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
469
482
  output_key_to_result_infos=self._context.get_tileable_result_infos(),
470
483
  )
471
484
 
485
+ def run_pythonpacks(
486
+ self,
487
+ odps_ctx: "ODPSSessionContext",
488
+ python_tag: str,
489
+ is_production: bool = False,
490
+ schedule_id: Optional[str] = None,
491
+ hints: Optional[dict] = None,
492
+ priority: Optional[int] = None,
493
+ ) -> Dict[str, PythonPackOptions]:
494
+ key_to_packs = defaultdict(list)
495
+ for udf in self._context.get_udfs():
496
+ for pack in udf.collect_pythonpack():
497
+ key_to_packs[pack.key].append(pack)
498
+ distinct_packs = []
499
+ for packs in key_to_packs.values():
500
+ distinct_packs.append(packs[0])
501
+
502
+ inst_id_to_req = {}
503
+ for pack in distinct_packs:
504
+ inst = odps_ctx.run_pythonpack(
505
+ requirements=pack.requirements,
506
+ prefer_binary=pack.prefer_binary,
507
+ pre_release=pack.pre_release,
508
+ force_rebuild=pack.force_rebuild,
509
+ python_tag=python_tag,
510
+ is_production=is_production,
511
+ schedule_id=schedule_id,
512
+ hints=hints,
513
+ priority=priority,
514
+ )
515
+ # fulfill instance id of pythonpacks with same keys
516
+ for same_pack in key_to_packs[pack.key]:
517
+ same_pack.pack_instance_id = inst.id
518
+ inst_id_to_req[inst.id] = pack
519
+ return inst_id_to_req
520
+
472
521
  def register_udfs(self, odps_ctx: "ODPSSessionContext"):
473
522
  for udf in self._context.get_udfs():
474
- logger.info("[Session %s] Registering UDF %s", self._session_id, udf.name)
523
+ logger.info("[Session=%s] Registering UDF %s", self._session_id, udf.name)
475
524
  udf.register(odps_ctx, True)
476
525
 
477
526
  def unregister_udfs(self, odps_ctx: "ODPSSessionContext"):
478
527
  for udf in self._context.get_udfs():
479
- logger.info("[Session %s] Unregistering UDF %s", self._session_id, udf.name)
528
+ logger.info("[Session=%s] Unregistering UDF %s", self._session_id, udf.name)
480
529
  udf.unregister(odps_ctx)
481
530
 
482
531
  def get_udfs(self) -> List[AbstractUDF]:
maxframe/config/config.py CHANGED
@@ -19,6 +19,15 @@ import warnings
19
19
  from copy import deepcopy
20
20
  from typing import Any, Dict, Optional, Union
21
21
 
22
+ from odps.lib import tzlocal
23
+
24
+ try:
25
+ from zoneinfo import available_timezones
26
+ except ImportError:
27
+ from pytz import all_timezones
28
+
29
+ available_timezones = lambda: all_timezones
30
+
22
31
  from ..utils import get_python_tag
23
32
  from .validators import (
24
33
  ValidatorType,
@@ -28,6 +37,7 @@ from .validators import (
28
37
  is_dict,
29
38
  is_in,
30
39
  is_integer,
40
+ is_non_negative_integer,
31
41
  is_null,
32
42
  is_numeric,
33
43
  is_string,
@@ -37,9 +47,12 @@ _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and migh
37
47
  _DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
38
48
  _DEFAULT_MAX_IDLE_SECONDS = 3600
39
49
  _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
50
+ _DEFAULT_SPE_FAILURE_RETRY_TIMES = 5
40
51
  _DEFAULT_UPLOAD_BATCH_SIZE = 4096
41
52
  _DEFAULT_TEMP_LIFECYCLE = 1
42
53
  _DEFAULT_TASK_START_TIMEOUT = 60
54
+ _DEFAULT_TASK_RESTART_TIMEOUT = 300
55
+ _DEFAULT_LOGVIEW_HOURS = 24 * 30
43
56
 
44
57
 
45
58
  class OptionError(Exception):
@@ -295,23 +308,46 @@ class Config:
295
308
  return {k: v for k, v in res.items() if k in self._remote_options}
296
309
 
297
310
 
298
- default_options = Config()
311
+ def _get_legal_local_tz_name() -> Optional[str]:
312
+ """Sometimes we may get illegal tz name from tzlocal.get_localzone()"""
313
+ tz_name = str(tzlocal.get_localzone())
314
+ if tz_name not in available_timezones():
315
+ return None
316
+ return tz_name
317
+
299
318
 
319
+ default_options = Config()
300
320
  default_options.register_option(
301
321
  "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
302
322
  )
323
+ default_options.register_option("use_common_table", False, validator=is_bool)
303
324
  default_options.register_option(
304
325
  "python_tag", get_python_tag(), validator=is_string, remote=True
305
326
  )
327
+ default_options.register_option(
328
+ "local_timezone",
329
+ _get_legal_local_tz_name(),
330
+ validator=any_validator(is_null, is_in(set(available_timezones()))),
331
+ remote=True,
332
+ )
333
+ default_options.register_option(
334
+ "session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
335
+ )
306
336
  default_options.register_option(
307
337
  "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
308
338
  )
339
+ default_options.register_option(
340
+ "client.task_restart_timeout", _DEFAULT_TASK_RESTART_TIMEOUT, validator=is_integer
341
+ )
309
342
  default_options.register_option("sql.enable_mcqa", True, validator=is_bool, remote=True)
310
343
  default_options.register_option(
311
344
  "sql.generate_comments", True, validator=is_bool, remote=True
312
345
  )
313
346
  default_options.register_option("sql.settings", {}, validator=is_dict, remote=True)
314
347
 
348
+ default_options.register_option("is_production", False, validator=is_bool, remote=True)
349
+ default_options.register_option("schedule_id", "", validator=is_string, remote=True)
350
+
315
351
  default_options.register_option(
316
352
  "session.max_alive_seconds",
317
353
  _DEFAULT_MAX_ALIVE_SECONDS,
@@ -368,7 +404,13 @@ default_options.register_option(
368
404
  default_options.register_option(
369
405
  "spe.operation_timeout_seconds",
370
406
  _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS,
371
- validator=is_integer,
407
+ validator=is_non_negative_integer,
408
+ remote=True,
409
+ )
410
+ default_options.register_option(
411
+ "spe.failure_retry_times",
412
+ _DEFAULT_SPE_FAILURE_RETRY_TIMES,
413
+ validator=is_non_negative_integer,
372
414
  remote=True,
373
415
  )
374
416
 
@@ -376,6 +418,10 @@ default_options.register_option(
376
418
  "spe.task.settings", dict(), validator=is_dict, remote=True
377
419
  )
378
420
 
421
+ default_options.register_option(
422
+ "pythonpack.task.settings", {}, validator=is_dict, remote=True
423
+ )
424
+
379
425
  _options_ctx_var = contextvars.ContextVar("_options_ctx_var")
380
426
 
381
427
 
@@ -40,6 +40,7 @@ is_numeric = lambda x: isinstance(x, (int, float))
40
40
  is_string = lambda x: isinstance(x, str)
41
41
  is_dict = lambda x: isinstance(x, dict)
42
42
  is_positive_integer = lambda x: is_integer(x) and x > 0
43
+ is_non_negative_integer = lambda x: is_integer(x) and x >= 0
43
44
 
44
45
 
45
46
  def is_in(vals):
maxframe/conftest.py CHANGED
@@ -87,6 +87,7 @@ def oss_config():
87
87
  oss_secret_access_key = config.get("oss", "secret_access_key")
88
88
  oss_bucket_name = config.get("oss", "bucket_name")
89
89
  oss_endpoint = config.get("oss", "endpoint")
90
+ oss_rolearn = config.get("oss", "rolearn")
90
91
 
91
92
  config.oss_config = (
92
93
  oss_access_id,
@@ -99,6 +100,7 @@ def oss_config():
99
100
 
100
101
  auth = oss2.Auth(oss_access_id, oss_secret_access_key)
101
102
  config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
103
+ config.oss_rolearn = oss_rolearn
102
104
  return config
103
105
  except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError):
104
106
  return None
maxframe/core/__init__.py CHANGED
@@ -19,7 +19,6 @@ from .entity import (
19
19
  CHUNK_TYPE,
20
20
  ENTITY_TYPE,
21
21
  FUSE_CHUNK_TYPE,
22
- OBJECT_CHUNK_TYPE,
23
22
  OBJECT_TYPE,
24
23
  TILEABLE_TYPE,
25
24
  Chunk,
@@ -33,8 +32,6 @@ from .entity import (
33
32
  HasShapeTileableData,
34
33
  NotSupportTile,
35
34
  Object,
36
- ObjectChunk,
37
- ObjectChunkData,
38
35
  ObjectData,
39
36
  OutputType,
40
37
  Tileable,
@@ -16,14 +16,7 @@ from .chunks import CHUNK_TYPE, Chunk, ChunkData
16
16
  from .core import ENTITY_TYPE, Entity, EntityData
17
17
  from .executable import ExecutableTuple, _ExecuteAndFetchMixin
18
18
  from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData
19
- from .objects import (
20
- OBJECT_CHUNK_TYPE,
21
- OBJECT_TYPE,
22
- Object,
23
- ObjectChunk,
24
- ObjectChunkData,
25
- ObjectData,
26
- )
19
+ from .objects import OBJECT_TYPE, Object, ObjectData
27
20
  from .output_types import (
28
21
  OutputType,
29
22
  get_fetch_class,
@@ -14,58 +14,17 @@
14
14
 
15
15
  from typing import Any, Dict
16
16
 
17
- from ...serialization.serializables import FieldTypes, ListField
18
- from ...utils import skip_na_call
19
- from .chunks import Chunk, ChunkData
20
17
  from .core import Entity
21
18
  from .executable import _ToObjectMixin
22
19
  from .tileables import TileableData
23
20
 
24
21
 
25
- class ObjectChunkData(ChunkData):
26
- # chunk whose data could be any serializable
27
- __slots__ = ()
28
- type_name = "Object"
29
-
30
- def __init__(self, op=None, index=None, **kw):
31
- super().__init__(_op=op, _index=index, **kw)
32
-
33
- @property
34
- def params(self) -> Dict[str, Any]:
35
- # params return the properties which useful to rebuild a new chunk
36
- return {
37
- "index": self.index,
38
- }
39
-
40
- @params.setter
41
- def params(self, new_params: Dict[str, Any]):
42
- params = new_params.copy()
43
- params.pop("index", None) # index not needed to update
44
- if params: # pragma: no cover
45
- raise TypeError(f"Unknown params: {list(params)}")
46
-
47
- @classmethod
48
- def get_params_from_data(cls, data: Any) -> Dict[str, Any]:
49
- return dict()
50
-
51
-
52
- class ObjectChunk(Chunk):
53
- __slots__ = ()
54
- _allow_data_type_ = (ObjectChunkData,)
55
- type_name = "Object"
56
-
57
-
58
22
  class ObjectData(TileableData, _ToObjectMixin):
59
23
  __slots__ = ()
60
24
  type_name = "Object"
61
-
62
- # optional fields
63
- _chunks = ListField(
64
- "chunks",
65
- FieldTypes.reference(ObjectChunkData),
66
- on_serialize=skip_na_call(lambda x: [it.data for it in x]),
67
- on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
68
- )
25
+ # workaround for removed field since v0.1.0b5
26
+ # todo remove this when all versions below v1.0.0rc1 is eliminated
27
+ _legacy_deprecated_non_primitives = ["_chunks"]
69
28
 
70
29
  def __init__(self, op=None, nsplits=None, **kw):
71
30
  super().__init__(_op=op, _nsplits=nsplits, **kw)
@@ -97,4 +56,3 @@ class Object(Entity, _ToObjectMixin):
97
56
 
98
57
 
99
58
  OBJECT_TYPE = (Object, ObjectData)
100
- OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData)
@@ -354,10 +354,10 @@ cdef class DirectedGraph:
354
354
  sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n')
355
355
  visited.add(input_chunk.key)
356
356
  if op.key not in visited:
357
- sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
357
+ sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
358
358
  visited.add(op.key)
359
359
  sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" -> '
360
- f'"{op_name}:{op.key[:trunc_key]}"\n')
360
+ f'"{op_name}:{op.key[:trunc_key]}_{id(op)}"\n')
361
361
 
362
362
  for output_chunk in (op.outputs or []):
363
363
  if output_chunk.key not in visited:
@@ -367,9 +367,9 @@ cdef class DirectedGraph:
367
367
  sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n')
368
368
  visited.add(output_chunk.key)
369
369
  if op.key not in visited:
370
- sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
370
+ sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
371
371
  visited.add(op.key)
372
- sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> '
372
+ sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" -> '
373
373
  f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"')
374
374
  if show_columns:
375
375
  sio.write(f' [ label={get_col_names(output_chunk)} ]')
@@ -46,6 +46,7 @@ from .misc.cut import cut
46
46
  from .misc.eval import maxframe_eval as eval # pylint: disable=redefined-builtin
47
47
  from .misc.get_dummies import get_dummies
48
48
  from .misc.melt import melt
49
+ from .misc.pivot_table import pivot_table
49
50
  from .misc.qcut import qcut
50
51
  from .misc.to_numeric import to_numeric
51
52
  from .missing import isna, isnull, notna, notnull
@@ -53,7 +54,7 @@ from .reduction import CustomReduction, unique
53
54
  from .tseries.to_datetime import to_datetime
54
55
 
55
56
  try:
56
- from pandas import NA, Timestamp
57
+ from pandas import NA, NaT, Timestamp
57
58
  except ImportError: # pragma: no cover
58
59
  pass
59
60
 
@@ -43,20 +43,20 @@ def around(df, decimals=0, *args, **kwargs):
43
43
  return op(df)
44
44
 
45
45
 
46
+ # FIXME Series input of decimals not supported yet
46
47
  around.__frame_doc__ = """
47
48
  Round a DataFrame to a variable number of decimal places.
48
49
 
49
50
  Parameters
50
51
  ----------
51
- decimals : int, dict, Series
52
+ decimals : int, dict
52
53
  Number of decimal places to round each column to. If an int is
53
54
  given, round each column to the same number of places.
54
55
  Otherwise dict and Series round to variable numbers of places.
55
56
  Column names should be in the keys if `decimals` is a
56
- dict-like, or in the index if `decimals` is a Series. Any
57
- columns not included in `decimals` will be left as is. Elements
58
- of `decimals` which are not columns of the input will be
59
- ignored.
57
+ dict-like. Any columns not included in `decimals` will be left
58
+ as is. Elements of `decimals` which are not columns of the
59
+ input will be ignored.
60
60
  *args
61
61
  Additional keywords have no effect but might be accepted for
62
62
  compatibility with numpy.
@@ -107,18 +107,6 @@ places as value
107
107
  1 0.0 1.0
108
108
  2 0.7 0.0
109
109
  3 0.2 0.0
110
-
111
- Using a Series, the number of places for specific columns can be
112
- specified with the column names as index and the number of
113
- decimal places as value
114
-
115
- >>> decimals = md.Series([0, 1], index=['cats', 'dogs'])
116
- >>> df.round(decimals).execute()
117
- dogs cats
118
- 0 0.2 0.0
119
- 1 0.0 1.0
120
- 2 0.7 0.0
121
- 3 0.2 0.0
122
110
  """
123
111
  around.__series_doc__ = """
124
112
  Round each value in a Series to the given number of decimals.
@@ -39,7 +39,7 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
39
39
  raise NotImplementedError
40
40
 
41
41
  @classmethod
42
- def _calc_properties(cls, x1, x2=None, axis="columns"):
42
+ def _calc_properties(cls, x1, x2=None, axis="columns", level=None):
43
43
  if isinstance(x1, DATAFRAME_TYPE) and (
44
44
  x2 is None or pd.api.types.is_scalar(x2) or isinstance(x2, TENSOR_TYPE)
45
45
  ):
@@ -108,7 +108,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
108
108
  index = copy.copy(x1.index_value)
109
109
  index_shape = x1.shape[0]
110
110
  else:
111
- index = infer_index_value(x1.index_value, x2.index_value)
111
+ index = infer_index_value(
112
+ x1.index_value, x2.index_value, level=level
113
+ )
112
114
  if index.key == x1.index_value.key == x2.index_value.key and (
113
115
  not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
114
116
  ):
@@ -141,7 +143,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
141
143
  column_shape = len(dtypes)
142
144
  else: # pragma: no cover
143
145
  dtypes = x1.dtypes # FIXME
144
- columns = infer_index_value(x1.columns_value, x2.index_value)
146
+ columns = infer_index_value(
147
+ x1.columns_value, x2.index_value, level=level
148
+ )
145
149
  column_shape = np.nan
146
150
  else:
147
151
  assert axis == "index" or axis == 0
@@ -169,7 +173,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
169
173
  ],
170
174
  index=x1.dtypes.index,
171
175
  )
172
- index = infer_index_value(x1.index_value, x2.index_value)
176
+ index = infer_index_value(
177
+ x1.index_value, x2.index_value, level=level
178
+ )
173
179
  index_shape = np.nan
174
180
  return {
175
181
  "shape": (index_shape, column_shape),
@@ -187,7 +193,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
187
193
  index = copy.copy(x1.index_value)
188
194
  index_shape = x1.shape[0]
189
195
  else:
190
- index = infer_index_value(x1.index_value, x2.index_value)
196
+ index = infer_index_value(
197
+ x1.index_value, x2.index_value, level=level
198
+ )
191
199
  if index.key == x1.index_value.key == x2.index_value.key and (
192
200
  not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
193
201
  ):
@@ -237,14 +245,14 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
237
245
  self._check_inputs(x1, x2)
238
246
  if isinstance(x1, DATAFRAME_TYPE) or isinstance(x2, DATAFRAME_TYPE):
239
247
  df1, df2 = (x1, x2) if isinstance(x1, DATAFRAME_TYPE) else (x2, x1)
240
- kw = self._calc_properties(df1, df2, axis=self.axis)
248
+ kw = self._calc_properties(df1, df2, axis=self.axis, level=self.level)
241
249
  if not pd.api.types.is_scalar(df2):
242
250
  return self.new_dataframe([x1, x2], **kw)
243
251
  else:
244
252
  return self.new_dataframe([df1], **kw)
245
253
  if isinstance(x1, SERIES_TYPE) or isinstance(x2, SERIES_TYPE):
246
254
  s1, s2 = (x1, x2) if isinstance(x1, SERIES_TYPE) else (x2, x1)
247
- kw = self._calc_properties(s1, s2)
255
+ kw = self._calc_properties(s1, s2, level=self.level)
248
256
  if not pd.api.types.is_scalar(s2):
249
257
  return self.new_series([x1, x2], **kw)
250
258
  else:
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ # FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/17
15
16
  _flex_doc_FRAME = """
16
17
  Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
17
18
  Equivalent to ``{equiv}``, but with support to substitute a fill_value
@@ -127,44 +128,15 @@ circle 0
127
128
  triangle 3
128
129
  rectangle 4
129
130
 
130
- >>> (df * other).execute()
131
- angles degrees
132
- circle 0 NaN
133
- triangle 9 NaN
134
- rectangle 16 NaN
135
-
136
131
  >>> df.mul(other, fill_value=0).execute()
137
132
  angles degrees
138
133
  circle 0 0.0
139
134
  triangle 9 0.0
140
135
  rectangle 16 0.0
141
136
 
142
- Divide by a MultiIndex by level.
143
-
144
- >>> df_multindex = md.DataFrame({{'angles': [0, 3, 4, 4, 5, 6],
145
- ... 'degrees': [360, 180, 360, 360, 540, 720]}},
146
- ... index=[['A', 'A', 'A', 'B', 'B', 'B'],
147
- ... ['circle', 'triangle', 'rectangle',
148
- ... 'square', 'pentagon', 'hexagon']])
149
- >>> df_multindex.execute()
150
- angles degrees
151
- A circle 0 360
152
- triangle 3 180
153
- rectangle 4 360
154
- B square 4 360
155
- pentagon 5 540
156
- hexagon 6 720
157
-
158
- >>> df.div(df_multindex, level=1, fill_value=0).execute()
159
- angles degrees
160
- A circle NaN 1.0
161
- triangle 1.0 1.0
162
- rectangle 1.0 1.0
163
- B square 0.0 0.0
164
- pentagon 0.0 0.0
165
- hexagon 0.0 0.0
166
137
  """
167
138
 
139
+ # FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/28
168
140
  _flex_doc_SERIES = """
169
141
  Return {desc} of series and other, element-wise (binary operator `{op_name}`).
170
142
 
@@ -213,6 +185,7 @@ e NaN
213
185
  dtype: float64
214
186
  """
215
187
 
188
+ # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
216
189
  _flex_comp_doc_FRAME = """
217
190
  Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
218
191
  Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
@@ -257,7 +230,8 @@ Mismatched indices will be unioned together.
257
230
 
258
231
  Examples
259
232
  --------
260
- >>> df = pd.DataFrame({{'cost': [250, 150, 100],
233
+ >>> import maxframe.dataframe as md
234
+ >>> df = md.DataFrame({{'cost': [250, 150, 100],
261
235
  ... 'revenue': [100, 250, 300]}},
262
236
  ... index=['A', 'B', 'C'])
263
237
  >>> df.execute()
@@ -332,30 +306,6 @@ A False False
332
306
  B False False
333
307
  C False True
334
308
  D False False
335
-
336
- Compare to a MultiIndex by level.
337
-
338
- >>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
339
- ... 'revenue': [100, 250, 300, 200, 175, 225]}},
340
- ... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
341
- ... ['A', 'B', 'C', 'A', 'B', 'C']])
342
- >>> df_multindex.execute()
343
- cost revenue
344
- Q1 A 250 100
345
- B 150 250
346
- C 100 300
347
- Q2 A 150 200
348
- B 300 175
349
- C 220 225
350
-
351
- >>> df.le(df_multindex, level=1).execute()
352
- cost revenue
353
- Q1 A True True
354
- B True True
355
- C True True
356
- Q2 A False True
357
- B True False
358
- C True False
359
309
  """
360
310
 
361
311
 
@@ -239,6 +239,28 @@ def test_dataframe_and_series_with_shuffle(func_name, func_opts):
239
239
  assert df2.columns_value.key != df1.columns_value.key
240
240
 
241
241
 
242
+ @pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
243
+ def test_dataframe_and_series_with_multiindex(func_name, func_opts):
244
+ data1 = pd.DataFrame(
245
+ np.random.rand(10, 10),
246
+ index=pd.MultiIndex.from_arrays(
247
+ [list("AAAAABBBBB"), [4, 9, 3, 2, 1, 5, 8, 6, 7, 10]]
248
+ ),
249
+ columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
250
+ )
251
+ data1 = to_boolean_if_needed(func_opts.func_name, data1)
252
+ df1 = from_pandas(data1, chunk_size=5)
253
+ s1 = from_pandas_series(data1[10].reset_index(level=0, drop=True), chunk_size=6)
254
+
255
+ df2 = getattr(df1, func_opts.func_name)(s1, level=1, axis=0)
256
+
257
+ # test df2's index and columns
258
+ assert df2.shape == (np.nan, df1.shape[1])
259
+ assert df2.index_value.key != df1.index_value.key
260
+ assert df2.index_value.names == df1.index_value.names
261
+ assert df2.columns_value.key == df1.columns_value.key
262
+
263
+
242
264
  @pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
243
265
  def test_series_and_series_with_align_map(func_name, func_opts):
244
266
  data1 = pd.DataFrame(