maxframe 1.2.1__cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 1.3.0__cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/codegen.py +70 -21
- maxframe/config/config.py +6 -0
- maxframe/core/accessor.py +1 -0
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/dict_/accessor.py +1 -0
- maxframe/dataframe/accessors/dict_/length.py +1 -0
- maxframe/dataframe/accessors/dict_/setitem.py +1 -0
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
- maxframe/dataframe/accessors/list_/__init__.py +37 -0
- maxframe/dataframe/accessors/list_/accessor.py +39 -0
- maxframe/dataframe/accessors/list_/getitem.py +135 -0
- maxframe/dataframe/accessors/list_/length.py +73 -0
- maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
- maxframe/dataframe/accessors/plotting/__init__.py +2 -0
- maxframe/dataframe/accessors/string_/__init__.py +1 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/accessor.py +1 -0
- maxframe/dataframe/extensions/apply_chunk.py +34 -21
- maxframe/dataframe/extensions/flatmap.py +8 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
- maxframe/dataframe/merge/concat.py +7 -4
- maxframe/dataframe/merge/merge.py +1 -0
- maxframe/dataframe/merge/tests/test_merge.py +97 -47
- maxframe/dataframe/missing/tests/test_missing.py +1 -0
- maxframe/dataframe/tests/test_utils.py +7 -0
- maxframe/dataframe/ufunc/ufunc.py +1 -0
- maxframe/dataframe/utils.py +3 -0
- maxframe/io/odpsio/schema.py +1 -0
- maxframe/learn/contrib/__init__.py +2 -4
- maxframe/learn/contrib/llm/__init__.py +1 -0
- maxframe/learn/contrib/llm/core.py +31 -10
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +4 -3
- maxframe/learn/contrib/llm/models/managed.py +39 -0
- maxframe/learn/contrib/llm/multi_modal.py +1 -0
- maxframe/learn/contrib/llm/text.py +252 -8
- maxframe/learn/contrib/models.py +77 -0
- maxframe/learn/contrib/utils.py +1 -0
- maxframe/learn/contrib/xgboost/__init__.py +8 -1
- maxframe/learn/contrib/xgboost/classifier.py +15 -4
- maxframe/learn/contrib/xgboost/core.py +108 -1
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
- maxframe/learn/contrib/xgboost/predict.py +8 -3
- maxframe/learn/contrib/xgboost/regressor.py +15 -1
- maxframe/learn/contrib/xgboost/train.py +5 -4
- maxframe/lib/dtypes_extension/__init__.py +2 -1
- maxframe/lib/dtypes_extension/dtypes.py +21 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
- maxframe/opcodes.py +19 -0
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-39-aarch64-linux-gnu.so +0 -0
- maxframe/serialization/core.pyx +12 -1
- maxframe/serialization/numpy.py +12 -4
- maxframe/serialization/serializables/tests/test_serializable.py +13 -2
- maxframe/serialization/tests/test_serial.py +2 -0
- maxframe/tensor/merge/concatenate.py +1 -0
- maxframe/tensor/misc/unique.py +11 -10
- maxframe/tensor/reshape/reshape.py +4 -1
- maxframe/utils.py +4 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/METADATA +2 -2
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/RECORD +648 -640
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/WHEEL +1 -1
- maxframe_client/session/odps.py +3 -0
- maxframe_client/session/tests/test_task.py +1 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/top_level.txt +0 -0
maxframe/codegen.py
CHANGED
|
@@ -24,7 +24,7 @@ from odps.types import OdpsSchema
|
|
|
24
24
|
from odps.utils import camel_to_underline
|
|
25
25
|
|
|
26
26
|
from .core import OperatorType, Tileable, TileableGraph
|
|
27
|
-
from .core.operator import Fetch
|
|
27
|
+
from .core.operator import Fetch, Operator
|
|
28
28
|
from .extension import iter_extensions
|
|
29
29
|
from .io.odpsio import build_dataframe_table_meta
|
|
30
30
|
from .io.odpsio.schema import pandas_to_odps_schema
|
|
@@ -211,7 +211,21 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
|
|
|
211
211
|
def get_udfs(self) -> List[AbstractUDF]:
|
|
212
212
|
return list(self._udfs.values())
|
|
213
213
|
|
|
214
|
-
def
|
|
214
|
+
def get_input_tileable_variable(self, tileable: Tileable) -> str:
|
|
215
|
+
"""
|
|
216
|
+
Get or create the variable name for an input tileable. It should be used on the
|
|
217
|
+
RIGHT side of the assignment.
|
|
218
|
+
"""
|
|
219
|
+
return self._get_tileable_variable(tileable)
|
|
220
|
+
|
|
221
|
+
def get_output_tileable_variable(self, tileable: Tileable) -> str:
|
|
222
|
+
"""
|
|
223
|
+
Get or create the variable name for an output tileable. It should be used on the
|
|
224
|
+
LEFT side of the assignment.
|
|
225
|
+
"""
|
|
226
|
+
return self._get_tileable_variable(tileable)
|
|
227
|
+
|
|
228
|
+
def _get_tileable_variable(self, tileable: Tileable) -> str:
|
|
215
229
|
try:
|
|
216
230
|
return self._tileable_key_to_variables[tileable.key]
|
|
217
231
|
except KeyError:
|
|
@@ -315,7 +329,7 @@ class EngineAcceptance(Enum):
|
|
|
315
329
|
|
|
316
330
|
class BigDagOperatorAdapter(metaclass=abc.ABCMeta):
|
|
317
331
|
# todo handle refcount issue when generated code is being executed
|
|
318
|
-
def accepts(self, op:
|
|
332
|
+
def accepts(self, op: Operator) -> EngineAcceptance:
|
|
319
333
|
return EngineAcceptance.ACCEPT
|
|
320
334
|
|
|
321
335
|
@abc.abstractmethod
|
|
@@ -330,7 +344,7 @@ class BigDagOperatorAdapter(metaclass=abc.ABCMeta):
|
|
|
330
344
|
|
|
331
345
|
Parameters
|
|
332
346
|
----------
|
|
333
|
-
op :
|
|
347
|
+
op : Operator
|
|
334
348
|
The operator instance.
|
|
335
349
|
context : BigDagCodeContext
|
|
336
350
|
The BigDagCodeContext instance.
|
|
@@ -342,6 +356,48 @@ class BigDagOperatorAdapter(metaclass=abc.ABCMeta):
|
|
|
342
356
|
"""
|
|
343
357
|
return list()
|
|
344
358
|
|
|
359
|
+
def generate_pre_op_code(
|
|
360
|
+
self, op: Operator, context: BigDagCodeContext
|
|
361
|
+
) -> List[str]:
|
|
362
|
+
"""
|
|
363
|
+
Generate the codes before actually handling the operator.
|
|
364
|
+
This method is usually implemented in the base class of each engine.
|
|
365
|
+
|
|
366
|
+
Parameters
|
|
367
|
+
----------
|
|
368
|
+
op : Operator
|
|
369
|
+
The operator instance.
|
|
370
|
+
context : BigDagCodeContext
|
|
371
|
+
The BigDagCodeContext instance.
|
|
372
|
+
|
|
373
|
+
Returns
|
|
374
|
+
-------
|
|
375
|
+
result: List[str]
|
|
376
|
+
The code lines generated before the operator is actually handled, one per line.
|
|
377
|
+
"""
|
|
378
|
+
return list()
|
|
379
|
+
|
|
380
|
+
def generate_post_op_code(
|
|
381
|
+
self, op: Operator, context: BigDagCodeContext
|
|
382
|
+
) -> List[str]:
|
|
383
|
+
"""
|
|
384
|
+
Generate the codes after actually handling the operator.
|
|
385
|
+
This method is usually implemented in the base class of each engine.
|
|
386
|
+
|
|
387
|
+
Parameters
|
|
388
|
+
----------
|
|
389
|
+
op : Operator
|
|
390
|
+
The operator instance.
|
|
391
|
+
context : BigDagCodeContext
|
|
392
|
+
The BigDagCodeContext instance.
|
|
393
|
+
|
|
394
|
+
Returns
|
|
395
|
+
-------
|
|
396
|
+
result: List[str]
|
|
397
|
+
The code lines generated after the operator is actually handled, one per line.
|
|
398
|
+
"""
|
|
399
|
+
return list()
|
|
400
|
+
|
|
345
401
|
|
|
346
402
|
_engine_to_codegen: Dict[str, Type["BigDagCodeGenerator"]] = dict()
|
|
347
403
|
|
|
@@ -354,9 +410,6 @@ def register_engine_codegen(type_: Type["BigDagCodeGenerator"]):
|
|
|
354
410
|
BUILTIN_ENGINE_SPE = "SPE"
|
|
355
411
|
BUILTIN_ENGINE_MCSQL = "MCSQL"
|
|
356
412
|
|
|
357
|
-
FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
|
|
358
|
-
ROW_NUMBER_WINDOW_INDEX_ENABLED = "codegen.row_number_window_index_enabled"
|
|
359
|
-
|
|
360
413
|
|
|
361
414
|
class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
362
415
|
_context: BigDagCodeContext
|
|
@@ -364,11 +417,13 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
364
417
|
engine_type: Optional[str] = None
|
|
365
418
|
engine_priority: int = 0
|
|
366
419
|
_extension_loaded = False
|
|
420
|
+
_generate_comments_enabled: bool = True
|
|
367
421
|
|
|
368
422
|
def __init__(self, session_id: str, subdag_id: str = None):
|
|
369
423
|
self._session_id = session_id
|
|
370
424
|
self._subdag_id = subdag_id
|
|
371
425
|
self._context = self._init_context(session_id, subdag_id)
|
|
426
|
+
self._generate_comments_enabled = True
|
|
372
427
|
|
|
373
428
|
@classmethod
|
|
374
429
|
def _load_engine_extensions(cls):
|
|
@@ -401,14 +456,6 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
401
456
|
def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
|
|
402
457
|
raise NotImplementedError
|
|
403
458
|
|
|
404
|
-
def _generate_comments(
|
|
405
|
-
self, op: OperatorType, adapter: BigDagOperatorAdapter
|
|
406
|
-
) -> List[str]:
|
|
407
|
-
return adapter.generate_comment(op, self._context)
|
|
408
|
-
|
|
409
|
-
def _generate_pre_op_code(self, op: OperatorType) -> List[str]:
|
|
410
|
-
return []
|
|
411
|
-
|
|
412
459
|
def _generate_delete_code(self, var_name: str) -> List[str]:
|
|
413
460
|
return []
|
|
414
461
|
|
|
@@ -438,9 +485,11 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
438
485
|
visited_op_key.add(op.key)
|
|
439
486
|
|
|
440
487
|
adapter = self.get_op_adapter(type(op))()
|
|
441
|
-
code_lines.extend(
|
|
442
|
-
|
|
488
|
+
code_lines.extend(adapter.generate_pre_op_code(op, self._context))
|
|
489
|
+
if self._generate_comments_enabled:
|
|
490
|
+
code_lines.extend(adapter.generate_comment(op, self._context))
|
|
443
491
|
code_lines.extend(adapter.generate_code(op, self._context))
|
|
492
|
+
code_lines.extend(adapter.generate_post_op_code(op, self._context))
|
|
444
493
|
code_lines.append("") # Append an empty line to separate operators
|
|
445
494
|
|
|
446
495
|
# record refcounts
|
|
@@ -449,7 +498,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
449
498
|
continue
|
|
450
499
|
if dag.count_successors(out_t) == 0:
|
|
451
500
|
delete_code = self._generate_delete_code(
|
|
452
|
-
self._context.
|
|
501
|
+
self._context.get_input_tileable_variable(out_t)
|
|
453
502
|
)
|
|
454
503
|
code_lines.extend(delete_code)
|
|
455
504
|
else:
|
|
@@ -462,7 +511,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
462
511
|
out_refcounts[inp_t.key] -= 1
|
|
463
512
|
if out_refcounts[inp_t.key] == 0:
|
|
464
513
|
delete_code = self._generate_delete_code(
|
|
465
|
-
self._context.
|
|
514
|
+
self._context.get_input_tileable_variable(inp_t)
|
|
466
515
|
)
|
|
467
516
|
code_lines.extend(delete_code)
|
|
468
517
|
out_refcounts.pop(inp_t.key)
|
|
@@ -475,11 +524,11 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
|
|
|
475
524
|
for tileable in dag.topological_iter():
|
|
476
525
|
op: OperatorType = tileable.op
|
|
477
526
|
if isinstance(op, Fetch):
|
|
478
|
-
fetch_tileable = self._context.
|
|
527
|
+
fetch_tileable = self._context.get_input_tileable_variable(tileable)
|
|
479
528
|
input_key_to_vars[op.outputs[0].key] = fetch_tileable
|
|
480
529
|
|
|
481
530
|
result_variables = {
|
|
482
|
-
t.key: self._context.
|
|
531
|
+
t.key: self._context.get_input_tileable_variable(t) for t in dag.results
|
|
483
532
|
}
|
|
484
533
|
|
|
485
534
|
return CodeGenResult(
|
maxframe/config/config.py
CHANGED
|
@@ -407,6 +407,12 @@ default_options.register_option(
|
|
|
407
407
|
validator=is_integer,
|
|
408
408
|
remote=True,
|
|
409
409
|
)
|
|
410
|
+
default_options.register_option(
|
|
411
|
+
"session.temp_table_properties",
|
|
412
|
+
None,
|
|
413
|
+
validator=is_null | is_dict,
|
|
414
|
+
remote=True,
|
|
415
|
+
)
|
|
410
416
|
default_options.register_option(
|
|
411
417
|
"session.auto_purge_temp_tables",
|
|
412
418
|
False,
|
maxframe/core/accessor.py
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
from .... import opcodes
|
|
15
16
|
from ....core.entity.output_types import OutputType
|
|
16
17
|
from ....serialization.serializables.field import AnyField
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import numpy as np
|
|
15
16
|
import pandas as pd
|
|
16
17
|
import pyarrow as pa
|
|
@@ -25,6 +26,10 @@ from ..length import SeriesDictLengthOperator
|
|
|
25
26
|
from ..remove import SeriesDictRemoveOperator
|
|
26
27
|
from ..setitem import SeriesDictSetItemOperator
|
|
27
28
|
|
|
29
|
+
pytestmark = pytest.mark.skipif(
|
|
30
|
+
ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
|
|
31
|
+
)
|
|
32
|
+
|
|
28
33
|
|
|
29
34
|
@pytest.fixture
|
|
30
35
|
def df():
|
|
@@ -40,13 +45,11 @@ def df():
|
|
|
40
45
|
)
|
|
41
46
|
|
|
42
47
|
|
|
43
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
44
48
|
def test_invalid_dtype(df):
|
|
45
49
|
with pytest.raises(AttributeError):
|
|
46
50
|
df["C"].dict.contains("k1")
|
|
47
51
|
|
|
48
52
|
|
|
49
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
50
53
|
def test_getitem(df):
|
|
51
54
|
s1 = df["A"].dict["k1"]
|
|
52
55
|
assert isinstance(s1, md.Series)
|
|
@@ -61,7 +64,6 @@ def test_getitem(df):
|
|
|
61
64
|
assert op.ignore_key_error is False
|
|
62
65
|
|
|
63
66
|
|
|
64
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
65
67
|
def test_getitem_with_default_value(df):
|
|
66
68
|
s1 = df["B"].dict.get("k1", 1)
|
|
67
69
|
assert isinstance(s1, md.Series)
|
|
@@ -76,7 +78,6 @@ def test_getitem_with_default_value(df):
|
|
|
76
78
|
assert op.ignore_key_error is True
|
|
77
79
|
|
|
78
80
|
|
|
79
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
80
81
|
def test_setitem(df):
|
|
81
82
|
s1 = df["A"]
|
|
82
83
|
s1.dict["k1"] = "v3"
|
|
@@ -91,7 +92,6 @@ def test_setitem(df):
|
|
|
91
92
|
assert op.value == "v3"
|
|
92
93
|
|
|
93
94
|
|
|
94
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
95
95
|
def test_length(df):
|
|
96
96
|
s1 = df["A"].dict.len()
|
|
97
97
|
assert isinstance(s1, md.Series)
|
|
@@ -103,7 +103,6 @@ def test_length(df):
|
|
|
103
103
|
assert isinstance(op, SeriesDictLengthOperator)
|
|
104
104
|
|
|
105
105
|
|
|
106
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
107
106
|
def test_remove(df):
|
|
108
107
|
s1 = df["A"].dict.remove("k1", ignore_key_error=True)
|
|
109
108
|
assert isinstance(s1, md.Series)
|
|
@@ -117,7 +116,6 @@ def test_remove(df):
|
|
|
117
116
|
assert op.ignore_key_error is True
|
|
118
117
|
|
|
119
118
|
|
|
120
|
-
@pytest.mark.skipif(ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported")
|
|
121
119
|
def test_contains(df):
|
|
122
120
|
s1 = df["A"].dict.contains("k1")
|
|
123
121
|
assert isinstance(s1, md.Series)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _install():
|
|
17
|
+
from ....core import CachedAccessor
|
|
18
|
+
from ...core import SERIES_TYPE
|
|
19
|
+
from .accessor import ListAccessor
|
|
20
|
+
from .getitem import series_list_getitem, series_list_getitem_with_index_error
|
|
21
|
+
from .length import series_list_length
|
|
22
|
+
|
|
23
|
+
dict_method_to_handlers = {
|
|
24
|
+
"__getitem__": series_list_getitem_with_index_error,
|
|
25
|
+
"get": series_list_getitem,
|
|
26
|
+
"len": series_list_length,
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
for name, handler in dict_method_to_handlers.items():
|
|
30
|
+
ListAccessor._register(name, handler)
|
|
31
|
+
|
|
32
|
+
for series in SERIES_TYPE:
|
|
33
|
+
series.list = CachedAccessor("list", ListAccessor)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_install()
|
|
37
|
+
del _install
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import pyarrow as pa
|
|
19
|
+
|
|
20
|
+
from ....core import BaseMaxFrameAccessor
|
|
21
|
+
from ....utils import ARROW_DTYPE_NOT_SUPPORTED
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from ...core import Series
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ListAccessor(BaseMaxFrameAccessor):
|
|
28
|
+
obj: "Series"
|
|
29
|
+
_api_count: int = 0
|
|
30
|
+
|
|
31
|
+
def __init__(self, series):
|
|
32
|
+
super().__init__(series)
|
|
33
|
+
if ARROW_DTYPE_NOT_SUPPORTED:
|
|
34
|
+
raise ImportError("pd.ArrowDtype is not supported in current environment")
|
|
35
|
+
|
|
36
|
+
if not isinstance(series.dtype, pd.ArrowDtype) or not isinstance(
|
|
37
|
+
series.dtype.pyarrow_dtype, pa.ListType
|
|
38
|
+
):
|
|
39
|
+
raise AttributeError("Can only use .list accessor with list values")
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from .... import opcodes
|
|
18
|
+
from ....core.entity.output_types import OutputType
|
|
19
|
+
from ....serialization.serializables.field import AnyField, BoolField
|
|
20
|
+
from ...operators import DataFrameOperator, DataFrameOperatorMixin
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SeriesListGetItemOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
24
|
+
_op_type_ = opcodes.SERIES_LIST_GETITEM
|
|
25
|
+
query_index = AnyField("query_index", default=None)
|
|
26
|
+
ignore_index_error = BoolField("ignore_index_error", default=False)
|
|
27
|
+
|
|
28
|
+
def __init__(self, **kw):
|
|
29
|
+
super().__init__(_output_types=[OutputType.series], **kw)
|
|
30
|
+
|
|
31
|
+
def __call__(self, series):
|
|
32
|
+
arrow_list_type = series.dtype.pyarrow_dtype
|
|
33
|
+
return self.new_series(
|
|
34
|
+
[series],
|
|
35
|
+
shape=series.shape,
|
|
36
|
+
dtype=pd.ArrowDtype(arrow_list_type.value_type),
|
|
37
|
+
index_value=series.index_value,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def series_list_getitem(series, query_index):
|
|
42
|
+
"""
|
|
43
|
+
Get the value by the index of each list in the Series.
|
|
44
|
+
|
|
45
|
+
Parameters
|
|
46
|
+
----------
|
|
47
|
+
query_index : Any
|
|
48
|
+
The index to look up, must be an integer.
|
|
49
|
+
|
|
50
|
+
Returns
|
|
51
|
+
-------
|
|
52
|
+
Series :
|
|
53
|
+
A Series with the list value's data type. The value will be
|
|
54
|
+
``None`` if the list is None.
|
|
55
|
+
|
|
56
|
+
Examples
|
|
57
|
+
--------
|
|
58
|
+
Create a series with list type data.
|
|
59
|
+
|
|
60
|
+
>>> import maxframe.dataframe as md
|
|
61
|
+
>>> import pyarrow as pa
|
|
62
|
+
>>> from maxframe.lib.dtypes_extension import list_
|
|
63
|
+
>>> s = md.Series(
|
|
64
|
+
... data=[[1, 2, 3], [4, 5, 6], None],
|
|
65
|
+
... index=[1, 2, 3],
|
|
66
|
+
... dtype=list_(pa.int64()),
|
|
67
|
+
... )
|
|
68
|
+
>>> s.execute()
|
|
69
|
+
1 [1, 2, 3]
|
|
70
|
+
2 [4, 5, 6]
|
|
71
|
+
3 <NA>
|
|
72
|
+
dtype: list<int64>[pyarrow]
|
|
73
|
+
|
|
74
|
+
>>> s.list.get(0).execute()
|
|
75
|
+
1 1
|
|
76
|
+
2 4
|
|
77
|
+
3 <NA>
|
|
78
|
+
dtype: int64[pyarrow]
|
|
79
|
+
"""
|
|
80
|
+
return SeriesListGetItemOperator(query_index=query_index, ignore_index_error=True)(
|
|
81
|
+
series
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def series_list_getitem_with_index_error(series, query_index):
|
|
86
|
+
"""
|
|
87
|
+
Get the value by the index of each list in the Series. If the index
|
|
88
|
+
is not in the list, raise IndexError.
|
|
89
|
+
|
|
90
|
+
Parameters
|
|
91
|
+
----------
|
|
92
|
+
query_index : Any
|
|
93
|
+
The index to check, must be integer.
|
|
94
|
+
|
|
95
|
+
Returns
|
|
96
|
+
-------
|
|
97
|
+
Series :
|
|
98
|
+
A Series with the list value's data type. Return ``None`` if the list is None.
|
|
99
|
+
|
|
100
|
+
Raises
|
|
101
|
+
------
|
|
102
|
+
IndexError
|
|
103
|
+
If the index is out of range for any list.
|
|
104
|
+
|
|
105
|
+
See Also
|
|
106
|
+
--------
|
|
107
|
+
Series.list.get: Get the value by the index of each list in the Series.
|
|
108
|
+
|
|
109
|
+
Examples
|
|
110
|
+
--------
|
|
111
|
+
Create a series with list type data.
|
|
112
|
+
|
|
113
|
+
>>> import maxframe.dataframe as md
|
|
114
|
+
>>> import pyarrow as pa
|
|
115
|
+
>>> from maxframe.lib.dtypes_extension import list_
|
|
116
|
+
>>> s = md.Series(
|
|
117
|
+
... data=[[1, 2, 3], [4, 5, 6], None],
|
|
118
|
+
... index=[1, 2, 3],
|
|
119
|
+
... dtype=list_(pa.int64()),
|
|
120
|
+
... )
|
|
121
|
+
>>> s.execute()
|
|
122
|
+
1 [1, 2, 3]
|
|
123
|
+
2 [4, 5, 6]
|
|
124
|
+
3 <NA>
|
|
125
|
+
dtype: list<int64>[pyarrow]
|
|
126
|
+
|
|
127
|
+
>>> s.list.get(0).execute()
|
|
128
|
+
1 1
|
|
129
|
+
2 4
|
|
130
|
+
3 <NA>
|
|
131
|
+
dtype: int64[pyarrow]
|
|
132
|
+
"""
|
|
133
|
+
return SeriesListGetItemOperator(query_index=query_index, ignore_index_error=False)(
|
|
134
|
+
series
|
|
135
|
+
)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import pyarrow as pa
|
|
17
|
+
|
|
18
|
+
from .... import opcodes
|
|
19
|
+
from ....core.entity.output_types import OutputType
|
|
20
|
+
from ...operators import DataFrameOperator, DataFrameOperatorMixin
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SeriesListLengthOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
24
|
+
_op_type_ = opcodes.SERIES_LIST_LENGTH
|
|
25
|
+
|
|
26
|
+
def __init__(self, **kw):
|
|
27
|
+
super().__init__(_output_types=[OutputType.series], **kw)
|
|
28
|
+
|
|
29
|
+
def __call__(self, series):
|
|
30
|
+
return self.new_series(
|
|
31
|
+
[series],
|
|
32
|
+
shape=series.shape,
|
|
33
|
+
index_value=series.index_value,
|
|
34
|
+
dtype=pd.ArrowDtype(pa.int64()),
|
|
35
|
+
name=None,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def series_list_length(series):
|
|
40
|
+
"""
|
|
41
|
+
Get the length of each list of the Series.
|
|
42
|
+
|
|
43
|
+
Returns
|
|
44
|
+
-------
|
|
45
|
+
Series :
|
|
46
|
+
A Series with data type ``pandas.ArrowDtype(pyarrow.int64)``. Each element
|
|
47
|
+
represents the length of the list, or ``None`` if the list is ``None``.
|
|
48
|
+
|
|
49
|
+
Examples
|
|
50
|
+
--------
|
|
51
|
+
Create a series with list type data.
|
|
52
|
+
|
|
53
|
+
>>> import maxframe.dataframe as md
|
|
54
|
+
>>> import pyarrow as pa
|
|
55
|
+
>>> from maxframe.lib.dtypes_extension import list_
|
|
56
|
+
>>> s = md.Series(
|
|
57
|
+
... data=[[1, 2, 3], [4, 5, 6], None],
|
|
58
|
+
... index=[1, 2, 3],
|
|
59
|
+
... dtype=list_(pa.int64()),
|
|
60
|
+
... )
|
|
61
|
+
>>> s.execute()
|
|
62
|
+
1 [1, 2, 3]
|
|
63
|
+
2 [4, 5, 6]
|
|
64
|
+
3 <NA>
|
|
65
|
+
dtype: list<int64>[pyarrow]
|
|
66
|
+
|
|
67
|
+
>>> s.list.len().execute()
|
|
68
|
+
1 3
|
|
69
|
+
2 3
|
|
70
|
+
3 <NA>
|
|
71
|
+
dtype: int64[pyarrow]
|
|
72
|
+
"""
|
|
73
|
+
return SeriesListLengthOperator()(series)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import pyarrow as pa
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
from ..... import dataframe as md
|
|
21
|
+
from .....lib.dtypes_extension import list_
|
|
22
|
+
from .....utils import ARROW_DTYPE_NOT_SUPPORTED
|
|
23
|
+
from ..getitem import SeriesListGetItemOperator
|
|
24
|
+
from ..length import SeriesListLengthOperator
|
|
25
|
+
|
|
26
|
+
pytestmark = pytest.mark.skipif(
|
|
27
|
+
ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.fixture
|
|
32
|
+
def df():
|
|
33
|
+
return md.DataFrame(
|
|
34
|
+
{
|
|
35
|
+
"A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
|
|
36
|
+
"B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
|
|
37
|
+
"C": pd.Series([1], dtype=np.dtype("int64")),
|
|
38
|
+
},
|
|
39
|
+
index=[1],
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_invalid_dtype(df):
|
|
44
|
+
with pytest.raises(AttributeError):
|
|
45
|
+
df["C"].list.len()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_getitem(df):
|
|
49
|
+
s1 = df["A"].list[1]
|
|
50
|
+
assert isinstance(s1, md.Series)
|
|
51
|
+
assert s1.dtype == pd.ArrowDtype(pa.int32())
|
|
52
|
+
assert s1.shape == (1,)
|
|
53
|
+
assert s1.index_value == df.index_value
|
|
54
|
+
op = s1.op
|
|
55
|
+
assert isinstance(op, SeriesListGetItemOperator)
|
|
56
|
+
assert op.query_index == 1
|
|
57
|
+
assert op.ignore_index_error is False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_getitem_ignore_index_err(df):
|
|
61
|
+
s1 = df["B"].list.get(1)
|
|
62
|
+
assert isinstance(s1, md.Series)
|
|
63
|
+
assert s1.dtype == pd.ArrowDtype(pa.string())
|
|
64
|
+
assert s1.shape == (1,)
|
|
65
|
+
assert s1.index_value == df.index_value
|
|
66
|
+
op = s1.op
|
|
67
|
+
assert isinstance(op, SeriesListGetItemOperator)
|
|
68
|
+
assert op.query_index == 1
|
|
69
|
+
assert op.ignore_index_error is True
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_length(df):
|
|
73
|
+
s1 = df["A"].list.len()
|
|
74
|
+
assert isinstance(s1, md.Series)
|
|
75
|
+
assert s1.dtype == pd.ArrowDtype(pa.int64())
|
|
76
|
+
assert s1.shape == (1,)
|
|
77
|
+
assert s1.index_value == df.index_value
|
|
78
|
+
op = s1.op
|
|
79
|
+
assert isinstance(op, SeriesListLengthOperator)
|