maxframe 0.1.0b4__cp38-cp38-macosx_10_9_x86_64.whl → 1.0.0rc2__cp38-cp38-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/codegen.py +52 -3
- maxframe/config/config.py +48 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/__init__.py +0 -3
- maxframe/core/entity/__init__.py +1 -8
- maxframe/core/entity/objects.py +3 -45
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +34 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +9 -1
- maxframe/dataframe/datasource/read_odps_table.py +5 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/tests/__init__.py +13 -0
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +21 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +84 -0
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +2 -1
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +11 -7
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +86 -0
- maxframe/learn/contrib/xgboost/core.py +156 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
- maxframe/learn/contrib/xgboost/predict.py +138 -0
- maxframe/learn/contrib/xgboost/regressor.py +78 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +121 -0
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +10 -7
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +410 -14
- maxframe/odpsio/tests/test_schema.py +32 -26
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +148 -12
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +54 -25
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +7 -2
- maxframe/serialization/serializables/core.py +158 -12
- maxframe/serialization/serializables/tests/test_serializable.py +46 -4
- maxframe/session.py +28 -0
- maxframe/tensor/__init__.py +60 -1
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
- maxframe/tensor/base/__init__.py +2 -0
- maxframe/tensor/base/atleast_1d.py +74 -0
- maxframe/tensor/base/unique.py +205 -0
- maxframe/tensor/datasource/array.py +4 -2
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -1
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +11 -2
- maxframe/udf.py +63 -3
- maxframe/utils.py +30 -13
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +76 -3
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +111 -92
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/fetcher.py +86 -13
- maxframe_client/session/odps.py +79 -10
- maxframe_client/session/task.py +65 -71
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +76 -2
- maxframe_client/clients/spe.py +0 -104
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/__init__.py
CHANGED
|
Binary file
|
maxframe/codegen.py
CHANGED
|
@@ -16,6 +16,7 @@ import abc
|
|
|
16
16
|
import base64
|
|
17
17
|
import dataclasses
|
|
18
18
|
import logging
|
|
19
|
+
from collections import defaultdict
|
|
19
20
|
from enum import Enum
|
|
20
21
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
|
|
21
22
|
|
|
@@ -32,7 +33,7 @@ from .protocol import DataFrameTableMeta, ResultInfo
|
|
|
32
33
|
from .serialization import PickleContainer
|
|
33
34
|
from .serialization.serializables import Serializable, StringField
|
|
34
35
|
from .typing_ import PandasObjectTypes
|
|
35
|
-
from .udf import MarkedFunction
|
|
36
|
+
from .udf import MarkedFunction, PythonPackOptions
|
|
36
37
|
|
|
37
38
|
if TYPE_CHECKING:
|
|
38
39
|
from odpsctx import ODPSSessionContext
|
|
@@ -75,8 +76,18 @@ class AbstractUDF(Serializable):
|
|
|
75
76
|
def unregister(self, odps: "ODPSSessionContext"):
|
|
76
77
|
raise NotImplementedError
|
|
77
78
|
|
|
79
|
+
@abc.abstractmethod
|
|
80
|
+
def collect_pythonpack(self) -> List[PythonPackOptions]:
|
|
81
|
+
raise NotImplementedError
|
|
82
|
+
|
|
83
|
+
@abc.abstractmethod
|
|
84
|
+
def load_pythonpack_resources(self, odps_ctx: "ODPSSessionContext") -> None:
|
|
85
|
+
raise NotImplementedError
|
|
86
|
+
|
|
78
87
|
|
|
79
88
|
class UserCodeMixin:
|
|
89
|
+
__slots__ = ()
|
|
90
|
+
|
|
80
91
|
@classmethod
|
|
81
92
|
def obj_to_python_expr(cls, obj: Any = None) -> str:
|
|
82
93
|
"""
|
|
@@ -335,6 +346,8 @@ def register_engine_codegen(type_: Type["BigDagCodeGenerator"]):
|
|
|
335
346
|
BUILTIN_ENGINE_SPE = "SPE"
|
|
336
347
|
BUILTIN_ENGINE_MCSQL = "MCSQL"
|
|
337
348
|
|
|
349
|
+
FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
|
|
350
|
+
|
|
338
351
|
|
|
339
352
|
class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
340
353
|
_context: BigDagCodeContext
|
|
@@ -469,14 +482,50 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
469
482
|
output_key_to_result_infos=self._context.get_tileable_result_infos(),
|
|
470
483
|
)
|
|
471
484
|
|
|
485
|
+
def run_pythonpacks(
|
|
486
|
+
self,
|
|
487
|
+
odps_ctx: "ODPSSessionContext",
|
|
488
|
+
python_tag: str,
|
|
489
|
+
is_production: bool = False,
|
|
490
|
+
schedule_id: Optional[str] = None,
|
|
491
|
+
hints: Optional[dict] = None,
|
|
492
|
+
priority: Optional[int] = None,
|
|
493
|
+
) -> Dict[str, PythonPackOptions]:
|
|
494
|
+
key_to_packs = defaultdict(list)
|
|
495
|
+
for udf in self._context.get_udfs():
|
|
496
|
+
for pack in udf.collect_pythonpack():
|
|
497
|
+
key_to_packs[pack.key].append(pack)
|
|
498
|
+
distinct_packs = []
|
|
499
|
+
for packs in key_to_packs.values():
|
|
500
|
+
distinct_packs.append(packs[0])
|
|
501
|
+
|
|
502
|
+
inst_id_to_req = {}
|
|
503
|
+
for pack in distinct_packs:
|
|
504
|
+
inst = odps_ctx.run_pythonpack(
|
|
505
|
+
requirements=pack.requirements,
|
|
506
|
+
prefer_binary=pack.prefer_binary,
|
|
507
|
+
pre_release=pack.pre_release,
|
|
508
|
+
force_rebuild=pack.force_rebuild,
|
|
509
|
+
python_tag=python_tag,
|
|
510
|
+
is_production=is_production,
|
|
511
|
+
schedule_id=schedule_id,
|
|
512
|
+
hints=hints,
|
|
513
|
+
priority=priority,
|
|
514
|
+
)
|
|
515
|
+
# fulfill instance id of pythonpacks with same keys
|
|
516
|
+
for same_pack in key_to_packs[pack.key]:
|
|
517
|
+
same_pack.pack_instance_id = inst.id
|
|
518
|
+
inst_id_to_req[inst.id] = pack
|
|
519
|
+
return inst_id_to_req
|
|
520
|
+
|
|
472
521
|
def register_udfs(self, odps_ctx: "ODPSSessionContext"):
|
|
473
522
|
for udf in self._context.get_udfs():
|
|
474
|
-
logger.info("[Session
|
|
523
|
+
logger.info("[Session=%s] Registering UDF %s", self._session_id, udf.name)
|
|
475
524
|
udf.register(odps_ctx, True)
|
|
476
525
|
|
|
477
526
|
def unregister_udfs(self, odps_ctx: "ODPSSessionContext"):
|
|
478
527
|
for udf in self._context.get_udfs():
|
|
479
|
-
logger.info("[Session
|
|
528
|
+
logger.info("[Session=%s] Unregistering UDF %s", self._session_id, udf.name)
|
|
480
529
|
udf.unregister(odps_ctx)
|
|
481
530
|
|
|
482
531
|
def get_udfs(self) -> List[AbstractUDF]:
|
maxframe/config/config.py
CHANGED
|
@@ -19,6 +19,15 @@ import warnings
|
|
|
19
19
|
from copy import deepcopy
|
|
20
20
|
from typing import Any, Dict, Optional, Union
|
|
21
21
|
|
|
22
|
+
from odps.lib import tzlocal
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from zoneinfo import available_timezones
|
|
26
|
+
except ImportError:
|
|
27
|
+
from pytz import all_timezones
|
|
28
|
+
|
|
29
|
+
available_timezones = lambda: all_timezones
|
|
30
|
+
|
|
22
31
|
from ..utils import get_python_tag
|
|
23
32
|
from .validators import (
|
|
24
33
|
ValidatorType,
|
|
@@ -28,6 +37,7 @@ from .validators import (
|
|
|
28
37
|
is_dict,
|
|
29
38
|
is_in,
|
|
30
39
|
is_integer,
|
|
40
|
+
is_non_negative_integer,
|
|
31
41
|
is_null,
|
|
32
42
|
is_numeric,
|
|
33
43
|
is_string,
|
|
@@ -37,9 +47,12 @@ _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and migh
|
|
|
37
47
|
_DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
|
|
38
48
|
_DEFAULT_MAX_IDLE_SECONDS = 3600
|
|
39
49
|
_DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
|
|
50
|
+
_DEFAULT_SPE_FAILURE_RETRY_TIMES = 5
|
|
40
51
|
_DEFAULT_UPLOAD_BATCH_SIZE = 4096
|
|
41
52
|
_DEFAULT_TEMP_LIFECYCLE = 1
|
|
42
53
|
_DEFAULT_TASK_START_TIMEOUT = 60
|
|
54
|
+
_DEFAULT_TASK_RESTART_TIMEOUT = 300
|
|
55
|
+
_DEFAULT_LOGVIEW_HOURS = 24 * 30
|
|
43
56
|
|
|
44
57
|
|
|
45
58
|
class OptionError(Exception):
|
|
@@ -295,23 +308,46 @@ class Config:
|
|
|
295
308
|
return {k: v for k, v in res.items() if k in self._remote_options}
|
|
296
309
|
|
|
297
310
|
|
|
298
|
-
|
|
311
|
+
def _get_legal_local_tz_name() -> Optional[str]:
|
|
312
|
+
"""Sometimes we may get illegal tz name from tzlocal.get_localzone()"""
|
|
313
|
+
tz_name = str(tzlocal.get_localzone())
|
|
314
|
+
if tz_name not in available_timezones():
|
|
315
|
+
return None
|
|
316
|
+
return tz_name
|
|
317
|
+
|
|
299
318
|
|
|
319
|
+
default_options = Config()
|
|
300
320
|
default_options.register_option(
|
|
301
321
|
"execution_mode", "trigger", validator=is_in(["trigger", "eager"])
|
|
302
322
|
)
|
|
323
|
+
default_options.register_option("use_common_table", False, validator=is_bool)
|
|
303
324
|
default_options.register_option(
|
|
304
325
|
"python_tag", get_python_tag(), validator=is_string, remote=True
|
|
305
326
|
)
|
|
327
|
+
default_options.register_option(
|
|
328
|
+
"local_timezone",
|
|
329
|
+
_get_legal_local_tz_name(),
|
|
330
|
+
validator=any_validator(is_null, is_in(set(available_timezones()))),
|
|
331
|
+
remote=True,
|
|
332
|
+
)
|
|
333
|
+
default_options.register_option(
|
|
334
|
+
"session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
|
|
335
|
+
)
|
|
306
336
|
default_options.register_option(
|
|
307
337
|
"client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
|
|
308
338
|
)
|
|
339
|
+
default_options.register_option(
|
|
340
|
+
"client.task_restart_timeout", _DEFAULT_TASK_RESTART_TIMEOUT, validator=is_integer
|
|
341
|
+
)
|
|
309
342
|
default_options.register_option("sql.enable_mcqa", True, validator=is_bool, remote=True)
|
|
310
343
|
default_options.register_option(
|
|
311
344
|
"sql.generate_comments", True, validator=is_bool, remote=True
|
|
312
345
|
)
|
|
313
346
|
default_options.register_option("sql.settings", {}, validator=is_dict, remote=True)
|
|
314
347
|
|
|
348
|
+
default_options.register_option("is_production", False, validator=is_bool, remote=True)
|
|
349
|
+
default_options.register_option("schedule_id", "", validator=is_string, remote=True)
|
|
350
|
+
|
|
315
351
|
default_options.register_option(
|
|
316
352
|
"session.max_alive_seconds",
|
|
317
353
|
_DEFAULT_MAX_ALIVE_SECONDS,
|
|
@@ -368,7 +404,13 @@ default_options.register_option(
|
|
|
368
404
|
default_options.register_option(
|
|
369
405
|
"spe.operation_timeout_seconds",
|
|
370
406
|
_DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS,
|
|
371
|
-
validator=is_integer,
|
|
407
|
+
validator=is_non_negative_integer,
|
|
408
|
+
remote=True,
|
|
409
|
+
)
|
|
410
|
+
default_options.register_option(
|
|
411
|
+
"spe.failure_retry_times",
|
|
412
|
+
_DEFAULT_SPE_FAILURE_RETRY_TIMES,
|
|
413
|
+
validator=is_non_negative_integer,
|
|
372
414
|
remote=True,
|
|
373
415
|
)
|
|
374
416
|
|
|
@@ -376,6 +418,10 @@ default_options.register_option(
|
|
|
376
418
|
"spe.task.settings", dict(), validator=is_dict, remote=True
|
|
377
419
|
)
|
|
378
420
|
|
|
421
|
+
default_options.register_option(
|
|
422
|
+
"pythonpack.task.settings", {}, validator=is_dict, remote=True
|
|
423
|
+
)
|
|
424
|
+
|
|
379
425
|
_options_ctx_var = contextvars.ContextVar("_options_ctx_var")
|
|
380
426
|
|
|
381
427
|
|
maxframe/config/validators.py
CHANGED
|
@@ -40,6 +40,7 @@ is_numeric = lambda x: isinstance(x, (int, float))
|
|
|
40
40
|
is_string = lambda x: isinstance(x, str)
|
|
41
41
|
is_dict = lambda x: isinstance(x, dict)
|
|
42
42
|
is_positive_integer = lambda x: is_integer(x) and x > 0
|
|
43
|
+
is_non_negative_integer = lambda x: is_integer(x) and x >= 0
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
def is_in(vals):
|
maxframe/conftest.py
CHANGED
|
@@ -87,6 +87,7 @@ def oss_config():
|
|
|
87
87
|
oss_secret_access_key = config.get("oss", "secret_access_key")
|
|
88
88
|
oss_bucket_name = config.get("oss", "bucket_name")
|
|
89
89
|
oss_endpoint = config.get("oss", "endpoint")
|
|
90
|
+
oss_rolearn = config.get("oss", "rolearn")
|
|
90
91
|
|
|
91
92
|
config.oss_config = (
|
|
92
93
|
oss_access_id,
|
|
@@ -99,6 +100,7 @@ def oss_config():
|
|
|
99
100
|
|
|
100
101
|
auth = oss2.Auth(oss_access_id, oss_secret_access_key)
|
|
101
102
|
config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
|
|
103
|
+
config.oss_rolearn = oss_rolearn
|
|
102
104
|
return config
|
|
103
105
|
except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError):
|
|
104
106
|
return None
|
maxframe/core/__init__.py
CHANGED
|
@@ -19,7 +19,6 @@ from .entity import (
|
|
|
19
19
|
CHUNK_TYPE,
|
|
20
20
|
ENTITY_TYPE,
|
|
21
21
|
FUSE_CHUNK_TYPE,
|
|
22
|
-
OBJECT_CHUNK_TYPE,
|
|
23
22
|
OBJECT_TYPE,
|
|
24
23
|
TILEABLE_TYPE,
|
|
25
24
|
Chunk,
|
|
@@ -33,8 +32,6 @@ from .entity import (
|
|
|
33
32
|
HasShapeTileableData,
|
|
34
33
|
NotSupportTile,
|
|
35
34
|
Object,
|
|
36
|
-
ObjectChunk,
|
|
37
|
-
ObjectChunkData,
|
|
38
35
|
ObjectData,
|
|
39
36
|
OutputType,
|
|
40
37
|
Tileable,
|
maxframe/core/entity/__init__.py
CHANGED
|
@@ -16,14 +16,7 @@ from .chunks import CHUNK_TYPE, Chunk, ChunkData
|
|
|
16
16
|
from .core import ENTITY_TYPE, Entity, EntityData
|
|
17
17
|
from .executable import ExecutableTuple, _ExecuteAndFetchMixin
|
|
18
18
|
from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData
|
|
19
|
-
from .objects import (
|
|
20
|
-
OBJECT_CHUNK_TYPE,
|
|
21
|
-
OBJECT_TYPE,
|
|
22
|
-
Object,
|
|
23
|
-
ObjectChunk,
|
|
24
|
-
ObjectChunkData,
|
|
25
|
-
ObjectData,
|
|
26
|
-
)
|
|
19
|
+
from .objects import OBJECT_TYPE, Object, ObjectData
|
|
27
20
|
from .output_types import (
|
|
28
21
|
OutputType,
|
|
29
22
|
get_fetch_class,
|
maxframe/core/entity/objects.py
CHANGED
|
@@ -14,58 +14,17 @@
|
|
|
14
14
|
|
|
15
15
|
from typing import Any, Dict
|
|
16
16
|
|
|
17
|
-
from ...serialization.serializables import FieldTypes, ListField
|
|
18
|
-
from ...utils import skip_na_call
|
|
19
|
-
from .chunks import Chunk, ChunkData
|
|
20
17
|
from .core import Entity
|
|
21
18
|
from .executable import _ToObjectMixin
|
|
22
19
|
from .tileables import TileableData
|
|
23
20
|
|
|
24
21
|
|
|
25
|
-
class ObjectChunkData(ChunkData):
|
|
26
|
-
# chunk whose data could be any serializable
|
|
27
|
-
__slots__ = ()
|
|
28
|
-
type_name = "Object"
|
|
29
|
-
|
|
30
|
-
def __init__(self, op=None, index=None, **kw):
|
|
31
|
-
super().__init__(_op=op, _index=index, **kw)
|
|
32
|
-
|
|
33
|
-
@property
|
|
34
|
-
def params(self) -> Dict[str, Any]:
|
|
35
|
-
# params return the properties which useful to rebuild a new chunk
|
|
36
|
-
return {
|
|
37
|
-
"index": self.index,
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
@params.setter
|
|
41
|
-
def params(self, new_params: Dict[str, Any]):
|
|
42
|
-
params = new_params.copy()
|
|
43
|
-
params.pop("index", None) # index not needed to update
|
|
44
|
-
if params: # pragma: no cover
|
|
45
|
-
raise TypeError(f"Unknown params: {list(params)}")
|
|
46
|
-
|
|
47
|
-
@classmethod
|
|
48
|
-
def get_params_from_data(cls, data: Any) -> Dict[str, Any]:
|
|
49
|
-
return dict()
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class ObjectChunk(Chunk):
|
|
53
|
-
__slots__ = ()
|
|
54
|
-
_allow_data_type_ = (ObjectChunkData,)
|
|
55
|
-
type_name = "Object"
|
|
56
|
-
|
|
57
|
-
|
|
58
22
|
class ObjectData(TileableData, _ToObjectMixin):
|
|
59
23
|
__slots__ = ()
|
|
60
24
|
type_name = "Object"
|
|
61
|
-
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
"chunks",
|
|
65
|
-
FieldTypes.reference(ObjectChunkData),
|
|
66
|
-
on_serialize=skip_na_call(lambda x: [it.data for it in x]),
|
|
67
|
-
on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
|
|
68
|
-
)
|
|
25
|
+
# workaround for removed field since v0.1.0b5
|
|
26
|
+
# todo remove this when all versions below v1.0.0rc1 is eliminated
|
|
27
|
+
_legacy_deprecated_non_primitives = ["_chunks"]
|
|
69
28
|
|
|
70
29
|
def __init__(self, op=None, nsplits=None, **kw):
|
|
71
30
|
super().__init__(_op=op, _nsplits=nsplits, **kw)
|
|
@@ -97,4 +56,3 @@ class Object(Entity, _ToObjectMixin):
|
|
|
97
56
|
|
|
98
57
|
|
|
99
58
|
OBJECT_TYPE = (Object, ObjectData)
|
|
100
|
-
OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData)
|
|
Binary file
|
maxframe/core/graph/core.pyx
CHANGED
|
@@ -354,10 +354,10 @@ cdef class DirectedGraph:
|
|
|
354
354
|
sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n')
|
|
355
355
|
visited.add(input_chunk.key)
|
|
356
356
|
if op.key not in visited:
|
|
357
|
-
sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
|
|
357
|
+
sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
|
|
358
358
|
visited.add(op.key)
|
|
359
359
|
sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" -> '
|
|
360
|
-
f'"{op_name}:{op.key[:trunc_key]}"\n')
|
|
360
|
+
f'"{op_name}:{op.key[:trunc_key]}_{id(op)}"\n')
|
|
361
361
|
|
|
362
362
|
for output_chunk in (op.outputs or []):
|
|
363
363
|
if output_chunk.key not in visited:
|
|
@@ -367,9 +367,9 @@ cdef class DirectedGraph:
|
|
|
367
367
|
sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n')
|
|
368
368
|
visited.add(output_chunk.key)
|
|
369
369
|
if op.key not in visited:
|
|
370
|
-
sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
|
|
370
|
+
sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
|
|
371
371
|
visited.add(op.key)
|
|
372
|
-
sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> '
|
|
372
|
+
sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" -> '
|
|
373
373
|
f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"')
|
|
374
374
|
if show_columns:
|
|
375
375
|
sio.write(f' [ label={get_col_names(output_chunk)} ]')
|
maxframe/dataframe/__init__.py
CHANGED
|
@@ -46,6 +46,7 @@ from .misc.cut import cut
|
|
|
46
46
|
from .misc.eval import maxframe_eval as eval # pylint: disable=redefined-builtin
|
|
47
47
|
from .misc.get_dummies import get_dummies
|
|
48
48
|
from .misc.melt import melt
|
|
49
|
+
from .misc.pivot_table import pivot_table
|
|
49
50
|
from .misc.qcut import qcut
|
|
50
51
|
from .misc.to_numeric import to_numeric
|
|
51
52
|
from .missing import isna, isnull, notna, notnull
|
|
@@ -53,7 +54,7 @@ from .reduction import CustomReduction, unique
|
|
|
53
54
|
from .tseries.to_datetime import to_datetime
|
|
54
55
|
|
|
55
56
|
try:
|
|
56
|
-
from pandas import NA, Timestamp
|
|
57
|
+
from pandas import NA, NaT, Timestamp
|
|
57
58
|
except ImportError: # pragma: no cover
|
|
58
59
|
pass
|
|
59
60
|
|
|
@@ -43,20 +43,20 @@ def around(df, decimals=0, *args, **kwargs):
|
|
|
43
43
|
return op(df)
|
|
44
44
|
|
|
45
45
|
|
|
46
|
+
# FIXME Series input of decimals not supported yet
|
|
46
47
|
around.__frame_doc__ = """
|
|
47
48
|
Round a DataFrame to a variable number of decimal places.
|
|
48
49
|
|
|
49
50
|
Parameters
|
|
50
51
|
----------
|
|
51
|
-
decimals : int, dict, Series
|
|
52
|
+
decimals : int, dict
|
|
52
53
|
Number of decimal places to round each column to. If an int is
|
|
53
54
|
given, round each column to the same number of places.
|
|
54
55
|
Otherwise dict and Series round to variable numbers of places.
|
|
55
56
|
Column names should be in the keys if `decimals` is a
|
|
56
|
-
dict-like
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
ignored.
|
|
57
|
+
dict-like. Any columns not included in `decimals` will be left
|
|
58
|
+
as is. Elements of `decimals` which are not columns of the
|
|
59
|
+
input will be ignored.
|
|
60
60
|
*args
|
|
61
61
|
Additional keywords have no effect but might be accepted for
|
|
62
62
|
compatibility with numpy.
|
|
@@ -107,18 +107,6 @@ places as value
|
|
|
107
107
|
1 0.0 1.0
|
|
108
108
|
2 0.7 0.0
|
|
109
109
|
3 0.2 0.0
|
|
110
|
-
|
|
111
|
-
Using a Series, the number of places for specific columns can be
|
|
112
|
-
specified with the column names as index and the number of
|
|
113
|
-
decimal places as value
|
|
114
|
-
|
|
115
|
-
>>> decimals = md.Series([0, 1], index=['cats', 'dogs'])
|
|
116
|
-
>>> df.round(decimals).execute()
|
|
117
|
-
dogs cats
|
|
118
|
-
0 0.2 0.0
|
|
119
|
-
1 0.0 1.0
|
|
120
|
-
2 0.7 0.0
|
|
121
|
-
3 0.2 0.0
|
|
122
110
|
"""
|
|
123
111
|
around.__series_doc__ = """
|
|
124
112
|
Round each value in a Series to the given number of decimals.
|
|
@@ -39,7 +39,7 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
|
|
|
39
39
|
raise NotImplementedError
|
|
40
40
|
|
|
41
41
|
@classmethod
|
|
42
|
-
def _calc_properties(cls, x1, x2=None, axis="columns"):
|
|
42
|
+
def _calc_properties(cls, x1, x2=None, axis="columns", level=None):
|
|
43
43
|
if isinstance(x1, DATAFRAME_TYPE) and (
|
|
44
44
|
x2 is None or pd.api.types.is_scalar(x2) or isinstance(x2, TENSOR_TYPE)
|
|
45
45
|
):
|
|
@@ -108,7 +108,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
|
|
|
108
108
|
index = copy.copy(x1.index_value)
|
|
109
109
|
index_shape = x1.shape[0]
|
|
110
110
|
else:
|
|
111
|
-
index = infer_index_value(x1.index_value, x2.index_value)
|
|
111
|
+
index = infer_index_value(
|
|
112
|
+
x1.index_value, x2.index_value, level=level
|
|
113
|
+
)
|
|
112
114
|
if index.key == x1.index_value.key == x2.index_value.key and (
|
|
113
115
|
not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
|
|
114
116
|
):
|
|
@@ -141,7 +143,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
|
|
|
141
143
|
column_shape = len(dtypes)
|
|
142
144
|
else: # pragma: no cover
|
|
143
145
|
dtypes = x1.dtypes # FIXME
|
|
144
|
-
columns = infer_index_value(x1.columns_value, x2.index_value)
|
|
146
|
+
columns = infer_index_value(
|
|
147
|
+
x1.columns_value, x2.index_value, level=level
|
|
148
|
+
)
|
|
145
149
|
column_shape = np.nan
|
|
146
150
|
else:
|
|
147
151
|
assert axis == "index" or axis == 0
|
|
@@ -169,7 +173,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
|
|
|
169
173
|
],
|
|
170
174
|
index=x1.dtypes.index,
|
|
171
175
|
)
|
|
172
|
-
index = infer_index_value(x1.index_value, x2.index_value)
|
|
176
|
+
index = infer_index_value(
|
|
177
|
+
x1.index_value, x2.index_value, level=level
|
|
178
|
+
)
|
|
173
179
|
index_shape = np.nan
|
|
174
180
|
return {
|
|
175
181
|
"shape": (index_shape, column_shape),
|
|
@@ -187,7 +193,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
|
|
|
187
193
|
index = copy.copy(x1.index_value)
|
|
188
194
|
index_shape = x1.shape[0]
|
|
189
195
|
else:
|
|
190
|
-
index = infer_index_value(x1.index_value, x2.index_value)
|
|
196
|
+
index = infer_index_value(
|
|
197
|
+
x1.index_value, x2.index_value, level=level
|
|
198
|
+
)
|
|
191
199
|
if index.key == x1.index_value.key == x2.index_value.key and (
|
|
192
200
|
not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
|
|
193
201
|
):
|
|
@@ -237,14 +245,14 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
|
|
|
237
245
|
self._check_inputs(x1, x2)
|
|
238
246
|
if isinstance(x1, DATAFRAME_TYPE) or isinstance(x2, DATAFRAME_TYPE):
|
|
239
247
|
df1, df2 = (x1, x2) if isinstance(x1, DATAFRAME_TYPE) else (x2, x1)
|
|
240
|
-
kw = self._calc_properties(df1, df2, axis=self.axis)
|
|
248
|
+
kw = self._calc_properties(df1, df2, axis=self.axis, level=self.level)
|
|
241
249
|
if not pd.api.types.is_scalar(df2):
|
|
242
250
|
return self.new_dataframe([x1, x2], **kw)
|
|
243
251
|
else:
|
|
244
252
|
return self.new_dataframe([df1], **kw)
|
|
245
253
|
if isinstance(x1, SERIES_TYPE) or isinstance(x2, SERIES_TYPE):
|
|
246
254
|
s1, s2 = (x1, x2) if isinstance(x1, SERIES_TYPE) else (x2, x1)
|
|
247
|
-
kw = self._calc_properties(s1, s2)
|
|
255
|
+
kw = self._calc_properties(s1, s2, level=self.level)
|
|
248
256
|
if not pd.api.types.is_scalar(s2):
|
|
249
257
|
return self.new_series([x1, x2], **kw)
|
|
250
258
|
else:
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
# FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/17
|
|
15
16
|
_flex_doc_FRAME = """
|
|
16
17
|
Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
|
|
17
18
|
Equivalent to ``{equiv}``, but with support to substitute a fill_value
|
|
@@ -127,44 +128,15 @@ circle 0
|
|
|
127
128
|
triangle 3
|
|
128
129
|
rectangle 4
|
|
129
130
|
|
|
130
|
-
>>> (df * other).execute()
|
|
131
|
-
angles degrees
|
|
132
|
-
circle 0 NaN
|
|
133
|
-
triangle 9 NaN
|
|
134
|
-
rectangle 16 NaN
|
|
135
|
-
|
|
136
131
|
>>> df.mul(other, fill_value=0).execute()
|
|
137
132
|
angles degrees
|
|
138
133
|
circle 0 0.0
|
|
139
134
|
triangle 9 0.0
|
|
140
135
|
rectangle 16 0.0
|
|
141
136
|
|
|
142
|
-
Divide by a MultiIndex by level.
|
|
143
|
-
|
|
144
|
-
>>> df_multindex = md.DataFrame({{'angles': [0, 3, 4, 4, 5, 6],
|
|
145
|
-
... 'degrees': [360, 180, 360, 360, 540, 720]}},
|
|
146
|
-
... index=[['A', 'A', 'A', 'B', 'B', 'B'],
|
|
147
|
-
... ['circle', 'triangle', 'rectangle',
|
|
148
|
-
... 'square', 'pentagon', 'hexagon']])
|
|
149
|
-
>>> df_multindex.execute()
|
|
150
|
-
angles degrees
|
|
151
|
-
A circle 0 360
|
|
152
|
-
triangle 3 180
|
|
153
|
-
rectangle 4 360
|
|
154
|
-
B square 4 360
|
|
155
|
-
pentagon 5 540
|
|
156
|
-
hexagon 6 720
|
|
157
|
-
|
|
158
|
-
>>> df.div(df_multindex, level=1, fill_value=0).execute()
|
|
159
|
-
angles degrees
|
|
160
|
-
A circle NaN 1.0
|
|
161
|
-
triangle 1.0 1.0
|
|
162
|
-
rectangle 1.0 1.0
|
|
163
|
-
B square 0.0 0.0
|
|
164
|
-
pentagon 0.0 0.0
|
|
165
|
-
hexagon 0.0 0.0
|
|
166
137
|
"""
|
|
167
138
|
|
|
139
|
+
# FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/28
|
|
168
140
|
_flex_doc_SERIES = """
|
|
169
141
|
Return {desc} of series and other, element-wise (binary operator `{op_name}`).
|
|
170
142
|
|
|
@@ -213,6 +185,7 @@ e NaN
|
|
|
213
185
|
dtype: float64
|
|
214
186
|
"""
|
|
215
187
|
|
|
188
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
|
|
216
189
|
_flex_comp_doc_FRAME = """
|
|
217
190
|
Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
|
|
218
191
|
Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
|
|
@@ -257,7 +230,8 @@ Mismatched indices will be unioned together.
|
|
|
257
230
|
|
|
258
231
|
Examples
|
|
259
232
|
--------
|
|
260
|
-
>>> df = pd.DataFrame({{'cost': [250, 150, 100],
|
|
233
|
+
>>> import maxframe.dataframe as md
|
|
234
|
+
>>> df = md.DataFrame({{'cost': [250, 150, 100],
|
|
261
235
|
... 'revenue': [100, 250, 300]}},
|
|
262
236
|
... index=['A', 'B', 'C'])
|
|
263
237
|
>>> df.execute()
|
|
@@ -332,30 +306,6 @@ A False False
|
|
|
332
306
|
B False False
|
|
333
307
|
C False True
|
|
334
308
|
D False False
|
|
335
|
-
|
|
336
|
-
Compare to a MultiIndex by level.
|
|
337
|
-
|
|
338
|
-
>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
|
|
339
|
-
... 'revenue': [100, 250, 300, 200, 175, 225]}},
|
|
340
|
-
... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
|
|
341
|
-
... ['A', 'B', 'C', 'A', 'B', 'C']])
|
|
342
|
-
>>> df_multindex.execute()
|
|
343
|
-
cost revenue
|
|
344
|
-
Q1 A 250 100
|
|
345
|
-
B 150 250
|
|
346
|
-
C 100 300
|
|
347
|
-
Q2 A 150 200
|
|
348
|
-
B 300 175
|
|
349
|
-
C 220 225
|
|
350
|
-
|
|
351
|
-
>>> df.le(df_multindex, level=1).execute()
|
|
352
|
-
cost revenue
|
|
353
|
-
Q1 A True True
|
|
354
|
-
B True True
|
|
355
|
-
C True True
|
|
356
|
-
Q2 A False True
|
|
357
|
-
B True False
|
|
358
|
-
C True False
|
|
359
309
|
"""
|
|
360
310
|
|
|
361
311
|
|
|
@@ -239,6 +239,28 @@ def test_dataframe_and_series_with_shuffle(func_name, func_opts):
|
|
|
239
239
|
assert df2.columns_value.key != df1.columns_value.key
|
|
240
240
|
|
|
241
241
|
|
|
242
|
+
@pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
|
|
243
|
+
def test_dataframe_and_series_with_multiindex(func_name, func_opts):
|
|
244
|
+
data1 = pd.DataFrame(
|
|
245
|
+
np.random.rand(10, 10),
|
|
246
|
+
index=pd.MultiIndex.from_arrays(
|
|
247
|
+
[list("AAAAABBBBB"), [4, 9, 3, 2, 1, 5, 8, 6, 7, 10]]
|
|
248
|
+
),
|
|
249
|
+
columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
|
|
250
|
+
)
|
|
251
|
+
data1 = to_boolean_if_needed(func_opts.func_name, data1)
|
|
252
|
+
df1 = from_pandas(data1, chunk_size=5)
|
|
253
|
+
s1 = from_pandas_series(data1[10].reset_index(level=0, drop=True), chunk_size=6)
|
|
254
|
+
|
|
255
|
+
df2 = getattr(df1, func_opts.func_name)(s1, level=1, axis=0)
|
|
256
|
+
|
|
257
|
+
# test df2's index and columns
|
|
258
|
+
assert df2.shape == (np.nan, df1.shape[1])
|
|
259
|
+
assert df2.index_value.key != df1.index_value.key
|
|
260
|
+
assert df2.index_value.names == df1.index_value.names
|
|
261
|
+
assert df2.columns_value.key == df1.columns_value.key
|
|
262
|
+
|
|
263
|
+
|
|
242
264
|
@pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
|
|
243
265
|
def test_series_and_series_with_align_map(func_name, func_opts):
|
|
244
266
|
data1 = pd.DataFrame(
|