maxframe 0.1.0b4__cp39-cp39-win32.whl → 1.0.0rc1__cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp39-win32.pyd +0 -0
- maxframe/codegen.py +56 -3
- maxframe/config/config.py +15 -1
- maxframe/core/__init__.py +0 -3
- maxframe/core/entity/__init__.py +1 -8
- maxframe/core/entity/objects.py +3 -45
- maxframe/core/graph/core.cp39-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/dataframe/__init__.py +1 -0
- maxframe/dataframe/core.py +30 -8
- maxframe/dataframe/datasource/read_odps_query.py +3 -1
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datastore/tests/__init__.py +13 -0
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +21 -0
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +3 -1
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +84 -0
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +2 -1
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/utils.py +7 -0
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +86 -0
- maxframe/learn/contrib/xgboost/core.py +156 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
- maxframe/learn/contrib/xgboost/predict.py +138 -0
- maxframe/learn/contrib/xgboost/regressor.py +78 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +121 -0
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp39-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/odpsio/arrow.py +2 -3
- maxframe/odpsio/tableio.py +22 -0
- maxframe/odpsio/tests/test_schema.py +16 -11
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +108 -10
- maxframe/serialization/core.cp39-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +54 -25
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +7 -2
- maxframe/serialization/serializables/core.py +119 -12
- maxframe/serialization/serializables/tests/test_serializable.py +46 -4
- maxframe/session.py +28 -0
- maxframe/tensor/__init__.py +1 -1
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
- maxframe/tensor/base/__init__.py +2 -0
- maxframe/tensor/base/atleast_1d.py +74 -0
- maxframe/tensor/base/unique.py +205 -0
- maxframe/tensor/datasource/array.py +4 -2
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -1
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +2 -2
- maxframe/udf.py +63 -3
- maxframe/utils.py +22 -13
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/METADATA +3 -3
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/RECORD +80 -61
- maxframe_client/__init__.py +0 -1
- maxframe_client/fetcher.py +65 -3
- maxframe_client/session/odps.py +74 -5
- maxframe_client/session/task.py +65 -71
- maxframe_client/tests/test_session.py +64 -1
- maxframe_client/clients/spe.py +0 -104
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/top_level.txt +0 -0
maxframe/__init__.py
CHANGED
maxframe/_utils.cp39-win32.pyd
CHANGED
|
Binary file
|
maxframe/codegen.py
CHANGED
|
@@ -16,6 +16,7 @@ import abc
|
|
|
16
16
|
import base64
|
|
17
17
|
import dataclasses
|
|
18
18
|
import logging
|
|
19
|
+
from collections import defaultdict
|
|
19
20
|
from enum import Enum
|
|
20
21
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
|
|
21
22
|
|
|
@@ -32,7 +33,7 @@ from .protocol import DataFrameTableMeta, ResultInfo
|
|
|
32
33
|
from .serialization import PickleContainer
|
|
33
34
|
from .serialization.serializables import Serializable, StringField
|
|
34
35
|
from .typing_ import PandasObjectTypes
|
|
35
|
-
from .udf import MarkedFunction
|
|
36
|
+
from .udf import MarkedFunction, PythonPackOptions
|
|
36
37
|
|
|
37
38
|
if TYPE_CHECKING:
|
|
38
39
|
from odpsctx import ODPSSessionContext
|
|
@@ -75,8 +76,18 @@ class AbstractUDF(Serializable):
|
|
|
75
76
|
def unregister(self, odps: "ODPSSessionContext"):
|
|
76
77
|
raise NotImplementedError
|
|
77
78
|
|
|
79
|
+
@abc.abstractmethod
|
|
80
|
+
def collect_pythonpack(self) -> List[PythonPackOptions]:
|
|
81
|
+
raise NotImplementedError
|
|
82
|
+
|
|
83
|
+
@abc.abstractmethod
|
|
84
|
+
def load_pythonpack_resources(self, odps_ctx: "ODPSSessionContext") -> None:
|
|
85
|
+
raise NotImplementedError
|
|
86
|
+
|
|
78
87
|
|
|
79
88
|
class UserCodeMixin:
|
|
89
|
+
__slots__ = ()
|
|
90
|
+
|
|
80
91
|
@classmethod
|
|
81
92
|
def obj_to_python_expr(cls, obj: Any = None) -> str:
|
|
82
93
|
"""
|
|
@@ -194,8 +205,12 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
|
|
|
194
205
|
return self._session_id
|
|
195
206
|
|
|
196
207
|
def register_udf(self, udf: AbstractUDF):
|
|
208
|
+
from maxframe_framedriver.services.session import SessionManager
|
|
209
|
+
|
|
197
210
|
udf.session_id = self._session_id
|
|
198
211
|
self._udfs[udf.name] = udf
|
|
212
|
+
if self._session_id and SessionManager.initialized():
|
|
213
|
+
SessionManager.instance().register_udf(self._session_id, udf)
|
|
199
214
|
|
|
200
215
|
def get_udfs(self) -> List[AbstractUDF]:
|
|
201
216
|
return list(self._udfs.values())
|
|
@@ -335,6 +350,8 @@ def register_engine_codegen(type_: Type["BigDagCodeGenerator"]):
|
|
|
335
350
|
BUILTIN_ENGINE_SPE = "SPE"
|
|
336
351
|
BUILTIN_ENGINE_MCSQL = "MCSQL"
|
|
337
352
|
|
|
353
|
+
FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
|
|
354
|
+
|
|
338
355
|
|
|
339
356
|
class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
340
357
|
_context: BigDagCodeContext
|
|
@@ -469,14 +486,50 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
469
486
|
output_key_to_result_infos=self._context.get_tileable_result_infos(),
|
|
470
487
|
)
|
|
471
488
|
|
|
489
|
+
def run_pythonpacks(
|
|
490
|
+
self,
|
|
491
|
+
odps_ctx: "ODPSSessionContext",
|
|
492
|
+
python_tag: str,
|
|
493
|
+
is_production: bool = False,
|
|
494
|
+
schedule_id: Optional[str] = None,
|
|
495
|
+
hints: Optional[dict] = None,
|
|
496
|
+
priority: Optional[int] = None,
|
|
497
|
+
) -> Dict[str, PythonPackOptions]:
|
|
498
|
+
key_to_packs = defaultdict(list)
|
|
499
|
+
for udf in self._context.get_udfs():
|
|
500
|
+
for pack in udf.collect_pythonpack():
|
|
501
|
+
key_to_packs[pack.key].append(pack)
|
|
502
|
+
distinct_packs = []
|
|
503
|
+
for packs in key_to_packs.values():
|
|
504
|
+
distinct_packs.append(packs[0])
|
|
505
|
+
|
|
506
|
+
inst_id_to_req = {}
|
|
507
|
+
for pack in distinct_packs:
|
|
508
|
+
inst = odps_ctx.run_pythonpack(
|
|
509
|
+
requirements=pack.requirements,
|
|
510
|
+
prefer_binary=pack.prefer_binary,
|
|
511
|
+
pre_release=pack.pre_release,
|
|
512
|
+
force_rebuild=pack.force_rebuild,
|
|
513
|
+
python_tag=python_tag,
|
|
514
|
+
is_production=is_production,
|
|
515
|
+
schedule_id=schedule_id,
|
|
516
|
+
hints=hints,
|
|
517
|
+
priority=priority,
|
|
518
|
+
)
|
|
519
|
+
# fulfill instance id of pythonpacks with same keys
|
|
520
|
+
for same_pack in key_to_packs[pack.key]:
|
|
521
|
+
same_pack.pack_instance_id = inst.id
|
|
522
|
+
inst_id_to_req[inst.id] = pack
|
|
523
|
+
return inst_id_to_req
|
|
524
|
+
|
|
472
525
|
def register_udfs(self, odps_ctx: "ODPSSessionContext"):
|
|
473
526
|
for udf in self._context.get_udfs():
|
|
474
|
-
logger.info("[Session
|
|
527
|
+
logger.info("[Session=%s] Registering UDF %s", self._session_id, udf.name)
|
|
475
528
|
udf.register(odps_ctx, True)
|
|
476
529
|
|
|
477
530
|
def unregister_udfs(self, odps_ctx: "ODPSSessionContext"):
|
|
478
531
|
for udf in self._context.get_udfs():
|
|
479
|
-
logger.info("[Session
|
|
532
|
+
logger.info("[Session=%s] Unregistering UDF %s", self._session_id, udf.name)
|
|
480
533
|
udf.unregister(odps_ctx)
|
|
481
534
|
|
|
482
535
|
def get_udfs(self) -> List[AbstractUDF]:
|
maxframe/config/config.py
CHANGED
|
@@ -40,6 +40,8 @@ _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
|
|
|
40
40
|
_DEFAULT_UPLOAD_BATCH_SIZE = 4096
|
|
41
41
|
_DEFAULT_TEMP_LIFECYCLE = 1
|
|
42
42
|
_DEFAULT_TASK_START_TIMEOUT = 60
|
|
43
|
+
_DEFAULT_TASK_RESTART_TIMEOUT = 300
|
|
44
|
+
_DEFAULT_LOGVIEW_HOURS = 24 * 60
|
|
43
45
|
|
|
44
46
|
|
|
45
47
|
class OptionError(Exception):
|
|
@@ -296,22 +298,30 @@ class Config:
|
|
|
296
298
|
|
|
297
299
|
|
|
298
300
|
default_options = Config()
|
|
299
|
-
|
|
300
301
|
default_options.register_option(
|
|
301
302
|
"execution_mode", "trigger", validator=is_in(["trigger", "eager"])
|
|
302
303
|
)
|
|
303
304
|
default_options.register_option(
|
|
304
305
|
"python_tag", get_python_tag(), validator=is_string, remote=True
|
|
305
306
|
)
|
|
307
|
+
default_options.register_option(
|
|
308
|
+
"session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
|
|
309
|
+
)
|
|
306
310
|
default_options.register_option(
|
|
307
311
|
"client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
|
|
308
312
|
)
|
|
313
|
+
default_options.register_option(
|
|
314
|
+
"client.task_restart_timeout", _DEFAULT_TASK_RESTART_TIMEOUT, validator=is_integer
|
|
315
|
+
)
|
|
309
316
|
default_options.register_option("sql.enable_mcqa", True, validator=is_bool, remote=True)
|
|
310
317
|
default_options.register_option(
|
|
311
318
|
"sql.generate_comments", True, validator=is_bool, remote=True
|
|
312
319
|
)
|
|
313
320
|
default_options.register_option("sql.settings", {}, validator=is_dict, remote=True)
|
|
314
321
|
|
|
322
|
+
default_options.register_option("is_production", False, validator=is_bool, remote=True)
|
|
323
|
+
default_options.register_option("schedule_id", "", validator=is_string, remote=True)
|
|
324
|
+
|
|
315
325
|
default_options.register_option(
|
|
316
326
|
"session.max_alive_seconds",
|
|
317
327
|
_DEFAULT_MAX_ALIVE_SECONDS,
|
|
@@ -376,6 +386,10 @@ default_options.register_option(
|
|
|
376
386
|
"spe.task.settings", dict(), validator=is_dict, remote=True
|
|
377
387
|
)
|
|
378
388
|
|
|
389
|
+
default_options.register_option(
|
|
390
|
+
"pythonpack.task.settings", {}, validator=is_dict, remote=True
|
|
391
|
+
)
|
|
392
|
+
|
|
379
393
|
_options_ctx_var = contextvars.ContextVar("_options_ctx_var")
|
|
380
394
|
|
|
381
395
|
|
maxframe/core/__init__.py
CHANGED
|
@@ -19,7 +19,6 @@ from .entity import (
|
|
|
19
19
|
CHUNK_TYPE,
|
|
20
20
|
ENTITY_TYPE,
|
|
21
21
|
FUSE_CHUNK_TYPE,
|
|
22
|
-
OBJECT_CHUNK_TYPE,
|
|
23
22
|
OBJECT_TYPE,
|
|
24
23
|
TILEABLE_TYPE,
|
|
25
24
|
Chunk,
|
|
@@ -33,8 +32,6 @@ from .entity import (
|
|
|
33
32
|
HasShapeTileableData,
|
|
34
33
|
NotSupportTile,
|
|
35
34
|
Object,
|
|
36
|
-
ObjectChunk,
|
|
37
|
-
ObjectChunkData,
|
|
38
35
|
ObjectData,
|
|
39
36
|
OutputType,
|
|
40
37
|
Tileable,
|
maxframe/core/entity/__init__.py
CHANGED
|
@@ -16,14 +16,7 @@ from .chunks import CHUNK_TYPE, Chunk, ChunkData
|
|
|
16
16
|
from .core import ENTITY_TYPE, Entity, EntityData
|
|
17
17
|
from .executable import ExecutableTuple, _ExecuteAndFetchMixin
|
|
18
18
|
from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData
|
|
19
|
-
from .objects import
|
|
20
|
-
OBJECT_CHUNK_TYPE,
|
|
21
|
-
OBJECT_TYPE,
|
|
22
|
-
Object,
|
|
23
|
-
ObjectChunk,
|
|
24
|
-
ObjectChunkData,
|
|
25
|
-
ObjectData,
|
|
26
|
-
)
|
|
19
|
+
from .objects import OBJECT_TYPE, Object, ObjectData
|
|
27
20
|
from .output_types import (
|
|
28
21
|
OutputType,
|
|
29
22
|
get_fetch_class,
|
maxframe/core/entity/objects.py
CHANGED
|
@@ -14,58 +14,17 @@
|
|
|
14
14
|
|
|
15
15
|
from typing import Any, Dict
|
|
16
16
|
|
|
17
|
-
from ...serialization.serializables import FieldTypes, ListField
|
|
18
|
-
from ...utils import skip_na_call
|
|
19
|
-
from .chunks import Chunk, ChunkData
|
|
20
17
|
from .core import Entity
|
|
21
18
|
from .executable import _ToObjectMixin
|
|
22
19
|
from .tileables import TileableData
|
|
23
20
|
|
|
24
21
|
|
|
25
|
-
class ObjectChunkData(ChunkData):
|
|
26
|
-
# chunk whose data could be any serializable
|
|
27
|
-
__slots__ = ()
|
|
28
|
-
type_name = "Object"
|
|
29
|
-
|
|
30
|
-
def __init__(self, op=None, index=None, **kw):
|
|
31
|
-
super().__init__(_op=op, _index=index, **kw)
|
|
32
|
-
|
|
33
|
-
@property
|
|
34
|
-
def params(self) -> Dict[str, Any]:
|
|
35
|
-
# params return the properties which useful to rebuild a new chunk
|
|
36
|
-
return {
|
|
37
|
-
"index": self.index,
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
@params.setter
|
|
41
|
-
def params(self, new_params: Dict[str, Any]):
|
|
42
|
-
params = new_params.copy()
|
|
43
|
-
params.pop("index", None) # index not needed to update
|
|
44
|
-
if params: # pragma: no cover
|
|
45
|
-
raise TypeError(f"Unknown params: {list(params)}")
|
|
46
|
-
|
|
47
|
-
@classmethod
|
|
48
|
-
def get_params_from_data(cls, data: Any) -> Dict[str, Any]:
|
|
49
|
-
return dict()
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class ObjectChunk(Chunk):
|
|
53
|
-
__slots__ = ()
|
|
54
|
-
_allow_data_type_ = (ObjectChunkData,)
|
|
55
|
-
type_name = "Object"
|
|
56
|
-
|
|
57
|
-
|
|
58
22
|
class ObjectData(TileableData, _ToObjectMixin):
|
|
59
23
|
__slots__ = ()
|
|
60
24
|
type_name = "Object"
|
|
61
|
-
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
"chunks",
|
|
65
|
-
FieldTypes.reference(ObjectChunkData),
|
|
66
|
-
on_serialize=skip_na_call(lambda x: [it.data for it in x]),
|
|
67
|
-
on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
|
|
68
|
-
)
|
|
25
|
+
# workaround for removed field since v0.1.0b5
|
|
26
|
+
# todo remove this when all versions below v0.1.0b5 is eliminated
|
|
27
|
+
_legacy_deprecated_non_primitives = ["_chunks"]
|
|
69
28
|
|
|
70
29
|
def __init__(self, op=None, nsplits=None, **kw):
|
|
71
30
|
super().__init__(_op=op, _nsplits=nsplits, **kw)
|
|
@@ -97,4 +56,3 @@ class Object(Entity, _ToObjectMixin):
|
|
|
97
56
|
|
|
98
57
|
|
|
99
58
|
OBJECT_TYPE = (Object, ObjectData)
|
|
100
|
-
OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData)
|
|
Binary file
|
maxframe/core/graph/core.pyx
CHANGED
|
@@ -354,10 +354,10 @@ cdef class DirectedGraph:
|
|
|
354
354
|
sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n')
|
|
355
355
|
visited.add(input_chunk.key)
|
|
356
356
|
if op.key not in visited:
|
|
357
|
-
sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
|
|
357
|
+
sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
|
|
358
358
|
visited.add(op.key)
|
|
359
359
|
sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" -> '
|
|
360
|
-
f'"{op_name}:{op.key[:trunc_key]}"\n')
|
|
360
|
+
f'"{op_name}:{op.key[:trunc_key]}_{id(op)}"\n')
|
|
361
361
|
|
|
362
362
|
for output_chunk in (op.outputs or []):
|
|
363
363
|
if output_chunk.key not in visited:
|
|
@@ -367,9 +367,9 @@ cdef class DirectedGraph:
|
|
|
367
367
|
sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n')
|
|
368
368
|
visited.add(output_chunk.key)
|
|
369
369
|
if op.key not in visited:
|
|
370
|
-
sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
|
|
370
|
+
sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
|
|
371
371
|
visited.add(op.key)
|
|
372
|
-
sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> '
|
|
372
|
+
sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" -> '
|
|
373
373
|
f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"')
|
|
374
374
|
if show_columns:
|
|
375
375
|
sio.write(f' [ label={get_col_names(output_chunk)} ]')
|
maxframe/dataframe/__init__.py
CHANGED
|
@@ -46,6 +46,7 @@ from .misc.cut import cut
|
|
|
46
46
|
from .misc.eval import maxframe_eval as eval # pylint: disable=redefined-builtin
|
|
47
47
|
from .misc.get_dummies import get_dummies
|
|
48
48
|
from .misc.melt import melt
|
|
49
|
+
from .misc.pivot_table import pivot_table
|
|
49
50
|
from .misc.qcut import qcut
|
|
50
51
|
from .misc.to_numeric import to_numeric
|
|
51
52
|
from .missing import isna, isnull, notna, notnull
|
maxframe/dataframe/core.py
CHANGED
|
@@ -35,6 +35,7 @@ from ..core import (
|
|
|
35
35
|
register_output_types,
|
|
36
36
|
)
|
|
37
37
|
from ..core.entity.utils import refresh_tileable_shape
|
|
38
|
+
from ..protocol import DataFrameTableMeta
|
|
38
39
|
from ..serialization.serializables import (
|
|
39
40
|
AnyField,
|
|
40
41
|
BoolField,
|
|
@@ -59,7 +60,13 @@ from ..utils import (
|
|
|
59
60
|
on_serialize_numpy_type,
|
|
60
61
|
tokenize,
|
|
61
62
|
)
|
|
62
|
-
from .utils import
|
|
63
|
+
from .utils import (
|
|
64
|
+
ReprSeries,
|
|
65
|
+
apply_if_callable,
|
|
66
|
+
fetch_corner_data,
|
|
67
|
+
merge_index_value,
|
|
68
|
+
parse_index,
|
|
69
|
+
)
|
|
63
70
|
|
|
64
71
|
|
|
65
72
|
class IndexValue(Serializable):
|
|
@@ -616,6 +623,9 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
|
|
|
616
623
|
if self._name is None:
|
|
617
624
|
self._name = self.chunks[0].name
|
|
618
625
|
|
|
626
|
+
def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
|
|
627
|
+
pass
|
|
628
|
+
|
|
619
629
|
def _to_str(self, representation=False):
|
|
620
630
|
if is_build_mode() or len(self._executed_sessions) == 0:
|
|
621
631
|
# in build mode, or not executed, just return representation
|
|
@@ -945,6 +955,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
|
|
|
945
955
|
if self._name is None:
|
|
946
956
|
self._name = self.chunks[0].name
|
|
947
957
|
|
|
958
|
+
def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
|
|
959
|
+
pass
|
|
960
|
+
|
|
948
961
|
def _to_str(self, representation=False):
|
|
949
962
|
if is_build_mode() or len(self._executed_sessions) == 0:
|
|
950
963
|
# in build mode, or not executed, just return representation
|
|
@@ -978,7 +991,7 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
|
|
|
978
991
|
return self._to_str(representation=False)
|
|
979
992
|
|
|
980
993
|
def __repr__(self):
|
|
981
|
-
return self._to_str(representation=
|
|
994
|
+
return self._to_str(representation=True)
|
|
982
995
|
|
|
983
996
|
@property
|
|
984
997
|
def dtype(self):
|
|
@@ -1501,6 +1514,15 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
|
|
|
1501
1514
|
refresh_index_value(self)
|
|
1502
1515
|
refresh_dtypes(self)
|
|
1503
1516
|
|
|
1517
|
+
def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
|
|
1518
|
+
dtypes = table_meta.pd_column_dtypes
|
|
1519
|
+
self._dtypes = dtypes
|
|
1520
|
+
self._columns_value = parse_index(dtypes.index, store_data=True)
|
|
1521
|
+
self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
|
|
1522
|
+
new_shape = list(self._shape)
|
|
1523
|
+
new_shape[0] = len(dtypes)
|
|
1524
|
+
self._shape = tuple(new_shape)
|
|
1525
|
+
|
|
1504
1526
|
@property
|
|
1505
1527
|
def dtypes(self):
|
|
1506
1528
|
dt = getattr(self, "_dtypes", None)
|
|
@@ -1997,12 +2019,6 @@ class DataFrame(HasShapeTileable, _ToPandasMixin):
|
|
|
1997
2019
|
Berkeley 25.0 77.0 298.15
|
|
1998
2020
|
"""
|
|
1999
2021
|
|
|
2000
|
-
def apply_if_callable(maybe_callable, obj, **kwargs):
|
|
2001
|
-
if callable(maybe_callable):
|
|
2002
|
-
return maybe_callable(obj, **kwargs)
|
|
2003
|
-
|
|
2004
|
-
return maybe_callable
|
|
2005
|
-
|
|
2006
2022
|
data = self.copy()
|
|
2007
2023
|
|
|
2008
2024
|
for k, v in kwargs.items():
|
|
@@ -2197,6 +2213,9 @@ class CategoricalData(HasShapeTileableData, _ToPandasMixin):
|
|
|
2197
2213
|
pd.Categorical(categories).categories, store_data=True
|
|
2198
2214
|
)
|
|
2199
2215
|
|
|
2216
|
+
def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
|
|
2217
|
+
pass
|
|
2218
|
+
|
|
2200
2219
|
def _to_str(self, representation=False):
|
|
2201
2220
|
if is_build_mode() or len(self._executed_sessions) == 0:
|
|
2202
2221
|
# in build mode, or not executed, just return representation
|
|
@@ -2347,6 +2366,9 @@ class DataFrameOrSeriesData(HasShapeTileableData, _ToPandasMixin):
|
|
|
2347
2366
|
data_params["name"] = self.chunks[0].name
|
|
2348
2367
|
self._data_params.update(data_params)
|
|
2349
2368
|
|
|
2369
|
+
def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
|
|
2370
|
+
pass
|
|
2371
|
+
|
|
2350
2372
|
def ensure_data(self):
|
|
2351
2373
|
from .fetch.core import DataFrameFetch
|
|
2352
2374
|
|
|
@@ -216,7 +216,9 @@ class DataFrameReadODPSQuery(
|
|
|
216
216
|
index_value = parse_index(pd.RangeIndex(0))
|
|
217
217
|
elif len(self.index_columns) == 1:
|
|
218
218
|
index_value = parse_index(
|
|
219
|
-
pd.Index([], name=self.index_columns[0]).astype(
|
|
219
|
+
pd.Index([], name=self.index_columns[0]).astype(
|
|
220
|
+
self.index_dtypes.iloc[0]
|
|
221
|
+
)
|
|
220
222
|
)
|
|
221
223
|
else:
|
|
222
224
|
idx = pd.MultiIndex.from_frame(
|
|
@@ -82,7 +82,9 @@ class DataFrameReadODPSTable(
|
|
|
82
82
|
index_value = parse_index(pd.RangeIndex(shape[0]))
|
|
83
83
|
elif len(self.index_columns) == 1:
|
|
84
84
|
index_value = parse_index(
|
|
85
|
-
pd.Index([], name=self.index_columns[0]).astype(
|
|
85
|
+
pd.Index([], name=self.index_columns[0]).astype(
|
|
86
|
+
self.index_dtypes.iloc[0]
|
|
87
|
+
)
|
|
86
88
|
)
|
|
87
89
|
else:
|
|
88
90
|
idx = pd.MultiIndex.from_frame(
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
|
|
17
|
+
from ... import DataFrame
|
|
18
|
+
from ..to_odps import to_odps_table
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture
|
|
22
|
+
def df():
|
|
23
|
+
return DataFrame({"A": [1, 2], "B": [3, 4]})
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.mark.parametrize(
|
|
27
|
+
"kwargs",
|
|
28
|
+
[
|
|
29
|
+
{"partition_col": ["A", "C"]},
|
|
30
|
+
{"partition_col": "C"},
|
|
31
|
+
{"partition": "a=1,C=2"},
|
|
32
|
+
],
|
|
33
|
+
)
|
|
34
|
+
def test_to_odps_table_validation(df, kwargs):
|
|
35
|
+
with pytest.raises(ValueError):
|
|
36
|
+
to_odps_table(df, "test_table", **kwargs)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize(
|
|
40
|
+
"kwargs",
|
|
41
|
+
[
|
|
42
|
+
{"partition_col": ["a", "B"]},
|
|
43
|
+
{"partition_col": "a"},
|
|
44
|
+
{"partition": "C=1,d=2"},
|
|
45
|
+
],
|
|
46
|
+
)
|
|
47
|
+
def test_to_odps_table_vaild(df, kwargs):
|
|
48
|
+
to_odps_table(df, "test_table", **kwargs)
|
|
@@ -18,10 +18,12 @@ import logging
|
|
|
18
18
|
from typing import List, Optional, Union
|
|
19
19
|
|
|
20
20
|
from odps.models import Table as ODPSTable
|
|
21
|
+
from odps.types import PartitionSpec
|
|
21
22
|
|
|
22
23
|
from ... import opcodes
|
|
23
24
|
from ...config import options
|
|
24
25
|
from ...core import OutputType
|
|
26
|
+
from ...odpsio import build_dataframe_table_meta
|
|
25
27
|
from ...serialization.serializables import (
|
|
26
28
|
BoolField,
|
|
27
29
|
FieldTypes,
|
|
@@ -147,6 +149,25 @@ def to_odps_table(
|
|
|
147
149
|
f"index_label needs {len(df.index.nlevels)} labels "
|
|
148
150
|
f"but it only have {len(index_label)}"
|
|
149
151
|
)
|
|
152
|
+
table_cols = set(build_dataframe_table_meta(df).table_column_names)
|
|
153
|
+
if partition:
|
|
154
|
+
partition_intersect = (
|
|
155
|
+
set(x.lower() for x in PartitionSpec(partition).keys()) & table_cols
|
|
156
|
+
)
|
|
157
|
+
if partition_intersect:
|
|
158
|
+
raise ValueError(
|
|
159
|
+
f"Data column(s) {partition_intersect} in the dataframe"
|
|
160
|
+
" cannot be used in parameter 'partition'."
|
|
161
|
+
" Use 'partition_col' instead."
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
if partition_col:
|
|
165
|
+
partition_diff = set(x.lower() for x in partition_col) - table_cols
|
|
166
|
+
if partition_diff:
|
|
167
|
+
raise ValueError(
|
|
168
|
+
f"Partition column(s) {partition_diff}"
|
|
169
|
+
" is not the data column(s) of the input dataframe."
|
|
170
|
+
)
|
|
150
171
|
|
|
151
172
|
op = DataFrameToODPSTable(
|
|
152
173
|
dtypes=df.dtypes,
|
|
@@ -138,7 +138,7 @@ class DataFrameAlign(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
138
138
|
series_index = rhs.index_value.to_pandas()
|
|
139
139
|
dtypes = lhs.dtypes.reindex(
|
|
140
140
|
lhs.dtypes.index.join(series_index, how=self.join)
|
|
141
|
-
).fillna(np.dtype(
|
|
141
|
+
).fillna(np.dtype(float))
|
|
142
142
|
l_shape[1] = r_size = len(dtypes)
|
|
143
143
|
col_val = r_idx_val = parse_index(dtypes.index, store_data=True)
|
|
144
144
|
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
from .apply import df_apply, series_apply
|
|
16
16
|
from .astype import astype, index_astype
|
|
17
|
+
from .case_when import case_when
|
|
17
18
|
from .check_monotonic import (
|
|
18
19
|
check_monotonic,
|
|
19
20
|
is_monotonic,
|
|
@@ -37,6 +38,7 @@ from .map import index_map, series_map
|
|
|
37
38
|
from .melt import melt
|
|
38
39
|
from .memory_usage import df_memory_usage, index_memory_usage, series_memory_usage
|
|
39
40
|
from .pct_change import pct_change
|
|
41
|
+
from .pivot_table import pivot_table
|
|
40
42
|
from .qcut import qcut
|
|
41
43
|
from .select_dtypes import select_dtypes
|
|
42
44
|
from .shift import shift, tshift
|
|
@@ -69,6 +71,7 @@ def _install():
|
|
|
69
71
|
setattr(t, "melt", melt)
|
|
70
72
|
setattr(t, "memory_usage", df_memory_usage)
|
|
71
73
|
setattr(t, "pct_change", pct_change)
|
|
74
|
+
setattr(t, "pivot_table", pivot_table)
|
|
72
75
|
setattr(t, "pop", df_pop)
|
|
73
76
|
setattr(t, "query", df_query)
|
|
74
77
|
setattr(t, "select_dtypes", select_dtypes)
|
|
@@ -81,6 +84,7 @@ def _install():
|
|
|
81
84
|
for t in SERIES_TYPE:
|
|
82
85
|
setattr(t, "apply", series_apply)
|
|
83
86
|
setattr(t, "astype", astype)
|
|
87
|
+
setattr(t, "case_when", case_when)
|
|
84
88
|
setattr(t, "check_monotonic", check_monotonic)
|
|
85
89
|
setattr(t, "describe", describe)
|
|
86
90
|
setattr(t, "diff", series_diff)
|
maxframe/dataframe/misc/apply.py
CHANGED
|
@@ -170,6 +170,8 @@ class ApplyOperator(
|
|
|
170
170
|
elif self.output_types[0] == OutputType.dataframe:
|
|
171
171
|
shape = [np.nan, np.nan]
|
|
172
172
|
shape[1 - self.axis] = df.shape[1 - self.axis]
|
|
173
|
+
if self.axis == 1:
|
|
174
|
+
shape[1] = len(dtypes)
|
|
173
175
|
shape = tuple(shape)
|
|
174
176
|
else:
|
|
175
177
|
shape = (df.shape[1 - self.axis],)
|
|
@@ -225,7 +227,7 @@ class ApplyOperator(
|
|
|
225
227
|
else: # pragma: no cover
|
|
226
228
|
index_value = parse_index(infer_series.index)
|
|
227
229
|
else:
|
|
228
|
-
index_value = parse_index(
|
|
230
|
+
index_value = parse_index(series.index_value)
|
|
229
231
|
|
|
230
232
|
if output_type == OutputType.dataframe:
|
|
231
233
|
if dtypes is None:
|