maxframe 1.2.1__cp311-cp311-macosx_10_9_universal2.whl → 1.3.0__cp311-cp311-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +70 -21
- maxframe/config/config.py +6 -0
- maxframe/core/accessor.py +1 -0
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/dict_/accessor.py +1 -0
- maxframe/dataframe/accessors/dict_/length.py +1 -0
- maxframe/dataframe/accessors/dict_/setitem.py +1 -0
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
- maxframe/dataframe/accessors/list_/__init__.py +37 -0
- maxframe/dataframe/accessors/list_/accessor.py +39 -0
- maxframe/dataframe/accessors/list_/getitem.py +135 -0
- maxframe/dataframe/accessors/list_/length.py +73 -0
- maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
- maxframe/dataframe/accessors/plotting/__init__.py +2 -0
- maxframe/dataframe/accessors/string_/__init__.py +1 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/accessor.py +1 -0
- maxframe/dataframe/extensions/apply_chunk.py +34 -21
- maxframe/dataframe/extensions/flatmap.py +8 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
- maxframe/dataframe/merge/concat.py +7 -4
- maxframe/dataframe/merge/merge.py +1 -0
- maxframe/dataframe/merge/tests/test_merge.py +97 -47
- maxframe/dataframe/missing/tests/test_missing.py +1 -0
- maxframe/dataframe/tests/test_utils.py +7 -0
- maxframe/dataframe/ufunc/ufunc.py +1 -0
- maxframe/dataframe/utils.py +3 -0
- maxframe/io/odpsio/schema.py +1 -0
- maxframe/learn/contrib/__init__.py +2 -4
- maxframe/learn/contrib/llm/__init__.py +1 -0
- maxframe/learn/contrib/llm/core.py +31 -10
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +4 -3
- maxframe/learn/contrib/llm/models/managed.py +39 -0
- maxframe/learn/contrib/llm/multi_modal.py +1 -0
- maxframe/learn/contrib/llm/text.py +252 -8
- maxframe/learn/contrib/models.py +77 -0
- maxframe/learn/contrib/utils.py +1 -0
- maxframe/learn/contrib/xgboost/__init__.py +8 -1
- maxframe/learn/contrib/xgboost/classifier.py +15 -4
- maxframe/learn/contrib/xgboost/core.py +108 -1
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
- maxframe/learn/contrib/xgboost/predict.py +8 -3
- maxframe/learn/contrib/xgboost/regressor.py +15 -1
- maxframe/learn/contrib/xgboost/train.py +5 -4
- maxframe/lib/dtypes_extension/__init__.py +2 -1
- maxframe/lib/dtypes_extension/dtypes.py +21 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/opcodes.py +19 -0
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/core.pyx +12 -1
- maxframe/serialization/numpy.py +12 -4
- maxframe/serialization/serializables/tests/test_serializable.py +13 -2
- maxframe/serialization/tests/test_serial.py +2 -0
- maxframe/tensor/merge/concatenate.py +1 -0
- maxframe/tensor/misc/unique.py +11 -10
- maxframe/tensor/reshape/reshape.py +4 -1
- maxframe/utils.py +4 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/METADATA +2 -2
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/RECORD +70 -62
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/WHEEL +1 -1
- maxframe_client/session/odps.py +3 -0
- maxframe_client/session/tests/test_task.py +1 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import pyarrow as pa
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
from ..... import dataframe as md
|
|
21
|
+
from .....lib.dtypes_extension import list_
|
|
22
|
+
from .....utils import ARROW_DTYPE_NOT_SUPPORTED
|
|
23
|
+
from ..getitem import SeriesListGetItemOperator
|
|
24
|
+
from ..length import SeriesListLengthOperator
|
|
25
|
+
|
|
26
|
+
pytestmark = pytest.mark.skipif(
|
|
27
|
+
ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.fixture
|
|
32
|
+
def df():
|
|
33
|
+
return md.DataFrame(
|
|
34
|
+
{
|
|
35
|
+
"A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
|
|
36
|
+
"B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
|
|
37
|
+
"C": pd.Series([1], dtype=np.dtype("int64")),
|
|
38
|
+
},
|
|
39
|
+
index=[1],
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_invalid_dtype(df):
|
|
44
|
+
with pytest.raises(AttributeError):
|
|
45
|
+
df["C"].list.len()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_getitem(df):
|
|
49
|
+
s1 = df["A"].list[1]
|
|
50
|
+
assert isinstance(s1, md.Series)
|
|
51
|
+
assert s1.dtype == pd.ArrowDtype(pa.int32())
|
|
52
|
+
assert s1.shape == (1,)
|
|
53
|
+
assert s1.index_value == df.index_value
|
|
54
|
+
op = s1.op
|
|
55
|
+
assert isinstance(op, SeriesListGetItemOperator)
|
|
56
|
+
assert op.query_index == 1
|
|
57
|
+
assert op.ignore_index_error is False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_getitem_ignore_index_err(df):
|
|
61
|
+
s1 = df["B"].list.get(1)
|
|
62
|
+
assert isinstance(s1, md.Series)
|
|
63
|
+
assert s1.dtype == pd.ArrowDtype(pa.string())
|
|
64
|
+
assert s1.shape == (1,)
|
|
65
|
+
assert s1.index_value == df.index_value
|
|
66
|
+
op = s1.op
|
|
67
|
+
assert isinstance(op, SeriesListGetItemOperator)
|
|
68
|
+
assert op.query_index == 1
|
|
69
|
+
assert op.ignore_index_error is True
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_length(df):
|
|
73
|
+
s1 = df["A"].list.len()
|
|
74
|
+
assert isinstance(s1, md.Series)
|
|
75
|
+
assert s1.dtype == pd.ArrowDtype(pa.int64())
|
|
76
|
+
assert s1.shape == (1,)
|
|
77
|
+
assert s1.index_value == df.index_value
|
|
78
|
+
op = s1.op
|
|
79
|
+
assert isinstance(op, SeriesListLengthOperator)
|
|
@@ -27,6 +27,7 @@ from ...core import OutputType
|
|
|
27
27
|
from ...io.odpsio import build_dataframe_table_meta
|
|
28
28
|
from ...serialization.serializables import (
|
|
29
29
|
BoolField,
|
|
30
|
+
DictField,
|
|
30
31
|
FieldTypes,
|
|
31
32
|
Int64Field,
|
|
32
33
|
ListField,
|
|
@@ -55,6 +56,7 @@ class DataFrameToODPSTable(DataFrameDataStore):
|
|
|
55
56
|
index = BoolField("index", default=True)
|
|
56
57
|
index_label = ListField("index_label", FieldTypes.string, default=None)
|
|
57
58
|
lifecycle = Int64Field("lifecycle", default=None)
|
|
59
|
+
table_properties = DictField("table_properties", default=None)
|
|
58
60
|
|
|
59
61
|
def __init__(self, **kw):
|
|
60
62
|
super().__init__(_output_types=[OutputType.dataframe], **kw)
|
|
@@ -84,6 +86,7 @@ def to_odps_table(
|
|
|
84
86
|
index: bool = True,
|
|
85
87
|
index_label: Union[None, str, List[str]] = None,
|
|
86
88
|
lifecycle: Optional[int] = None,
|
|
89
|
+
table_properties: Optional[dict] = None,
|
|
87
90
|
):
|
|
88
91
|
"""
|
|
89
92
|
Write DataFrame object into a MaxCompute (ODPS) table.
|
|
@@ -122,6 +125,8 @@ def to_odps_table(
|
|
|
122
125
|
names will be used.
|
|
123
126
|
lifecycle: Optional[int]
|
|
124
127
|
Specify lifecycle of the output table.
|
|
128
|
+
table_properties: Optional[dict]
|
|
129
|
+
Specify properties of the output table.
|
|
125
130
|
|
|
126
131
|
Returns
|
|
127
132
|
-------
|
|
@@ -186,5 +191,6 @@ def to_odps_table(
|
|
|
186
191
|
index=index,
|
|
187
192
|
index_label=index_label,
|
|
188
193
|
lifecycle=lifecycle or options.session.table_lifecycle,
|
|
194
|
+
table_properties=table_properties,
|
|
189
195
|
)
|
|
190
196
|
return op(df)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
from typing import TYPE_CHECKING
|
|
15
16
|
|
|
16
17
|
from ...core import BaseMaxFrameAccessor
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import functools
|
|
15
16
|
from typing import Any, Callable, Dict, List, Tuple, Union
|
|
16
17
|
|
|
@@ -19,7 +20,12 @@ import pandas as pd
|
|
|
19
20
|
|
|
20
21
|
from ... import opcodes
|
|
21
22
|
from ...core import OutputType
|
|
22
|
-
from ...serialization.serializables import
|
|
23
|
+
from ...serialization.serializables import (
|
|
24
|
+
DictField,
|
|
25
|
+
FunctionField,
|
|
26
|
+
Int32Field,
|
|
27
|
+
TupleField,
|
|
28
|
+
)
|
|
23
29
|
from ...utils import quiet_stdio
|
|
24
30
|
from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
|
|
25
31
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
@@ -38,7 +44,9 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
38
44
|
_op_type_ = opcodes.APPLY_CHUNK
|
|
39
45
|
|
|
40
46
|
func = FunctionField("func")
|
|
41
|
-
batch_rows = Int32Field("batch_rows")
|
|
47
|
+
batch_rows = Int32Field("batch_rows", default=None)
|
|
48
|
+
args = TupleField("args", default=None)
|
|
49
|
+
kwargs = DictField("kwargs", default=None)
|
|
42
50
|
|
|
43
51
|
def __init__(self, output_type=None, **kw):
|
|
44
52
|
if output_type:
|
|
@@ -104,12 +112,11 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
104
112
|
dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
|
|
105
113
|
output_type=None,
|
|
106
114
|
index=None,
|
|
107
|
-
args=(),
|
|
108
|
-
**kwargs,
|
|
109
115
|
):
|
|
116
|
+
args = self.args or ()
|
|
117
|
+
kwargs = self.kwargs or {}
|
|
110
118
|
# if not dtypes and not skip_infer:
|
|
111
|
-
|
|
112
|
-
self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
|
|
119
|
+
packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
|
|
113
120
|
|
|
114
121
|
# if skip_infer, directly build a frame
|
|
115
122
|
if self.output_types and self.output_types[0] == OutputType.df_or_series:
|
|
@@ -118,8 +125,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
118
125
|
# infer return index and dtypes
|
|
119
126
|
dtypes, index_value, elementwise = self._infer_batch_func_returns(
|
|
120
127
|
df_or_series,
|
|
121
|
-
origin_func=
|
|
122
|
-
packed_func=
|
|
128
|
+
origin_func=self.func,
|
|
129
|
+
packed_func=packed_func,
|
|
123
130
|
given_output_type=output_type,
|
|
124
131
|
given_dtypes=dtypes,
|
|
125
132
|
given_index=index,
|
|
@@ -166,6 +173,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
166
173
|
given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
|
|
167
174
|
given_index: Union[pd.Index, IndexValue],
|
|
168
175
|
given_elementwise: bool = False,
|
|
176
|
+
*args,
|
|
177
|
+
**kwargs,
|
|
169
178
|
):
|
|
170
179
|
inferred_output_type = inferred_dtypes = inferred_index_value = None
|
|
171
180
|
inferred_is_elementwise = False
|
|
@@ -190,7 +199,7 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
190
199
|
try:
|
|
191
200
|
# execute
|
|
192
201
|
with np.errstate(all="ignore"), quiet_stdio():
|
|
193
|
-
infer_result = packed_func(empty_data)
|
|
202
|
+
infer_result = packed_func(empty_data, *args, **kwargs)
|
|
194
203
|
|
|
195
204
|
# if executed successfully, get index and dtypes from returned object
|
|
196
205
|
if inferred_index_value is None:
|
|
@@ -258,7 +267,7 @@ def get_packed_func(df, func, *args, **kwargs) -> Any:
|
|
|
258
267
|
def df_apply_chunk(
|
|
259
268
|
dataframe,
|
|
260
269
|
func: Union[str, Callable],
|
|
261
|
-
batch_rows,
|
|
270
|
+
batch_rows=None,
|
|
262
271
|
dtypes=None,
|
|
263
272
|
dtype=None,
|
|
264
273
|
name=None,
|
|
@@ -462,11 +471,11 @@ def df_apply_chunk(
|
|
|
462
471
|
if not isinstance(func, Callable):
|
|
463
472
|
raise TypeError("function must be a callable object")
|
|
464
473
|
|
|
465
|
-
if not
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
474
|
+
if batch_rows is not None:
|
|
475
|
+
if not isinstance(batch_rows, int):
|
|
476
|
+
raise TypeError("batch_rows must be an integer")
|
|
477
|
+
elif batch_rows <= 0:
|
|
478
|
+
raise ValueError("batch_rows must be greater than 0")
|
|
470
479
|
|
|
471
480
|
dtypes = (name, dtype) if dtype is not None else dtypes
|
|
472
481
|
|
|
@@ -481,15 +490,17 @@ def df_apply_chunk(
|
|
|
481
490
|
|
|
482
491
|
# bind args and kwargs
|
|
483
492
|
op = DataFrameApplyChunkOperator(
|
|
484
|
-
func=func,
|
|
493
|
+
func=func,
|
|
494
|
+
batch_rows=batch_rows,
|
|
495
|
+
output_type=output_type,
|
|
496
|
+
args=args,
|
|
497
|
+
kwargs=kwargs,
|
|
485
498
|
)
|
|
486
499
|
|
|
487
500
|
return op(
|
|
488
501
|
dataframe,
|
|
489
502
|
dtypes=dtypes,
|
|
490
503
|
index=index,
|
|
491
|
-
args=args,
|
|
492
|
-
**kwargs,
|
|
493
504
|
)
|
|
494
505
|
|
|
495
506
|
|
|
@@ -720,7 +731,11 @@ def series_apply_chunk(
|
|
|
720
731
|
output_type = OutputType.df_or_series
|
|
721
732
|
|
|
722
733
|
op = DataFrameApplyChunkOperator(
|
|
723
|
-
func=func,
|
|
734
|
+
func=func,
|
|
735
|
+
batch_rows=batch_rows,
|
|
736
|
+
output_type=output_type,
|
|
737
|
+
args=args,
|
|
738
|
+
kwargs=kwargs,
|
|
724
739
|
)
|
|
725
740
|
|
|
726
741
|
dtypes = (name, dtype) if dtype is not None else dtypes
|
|
@@ -729,6 +744,4 @@ def series_apply_chunk(
|
|
|
729
744
|
dtypes=dtypes,
|
|
730
745
|
output_type=output_type,
|
|
731
746
|
index=index,
|
|
732
|
-
args=args,
|
|
733
|
-
**kwargs,
|
|
734
747
|
)
|
|
@@ -27,7 +27,12 @@ from ...serialization.serializables import (
|
|
|
27
27
|
)
|
|
28
28
|
from ..core import DataFrame
|
|
29
29
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
30
|
-
from ..utils import
|
|
30
|
+
from ..utils import (
|
|
31
|
+
copy_func_scheduling_hints,
|
|
32
|
+
gen_unknown_index_value,
|
|
33
|
+
make_dtypes,
|
|
34
|
+
parse_index,
|
|
35
|
+
)
|
|
31
36
|
|
|
32
37
|
|
|
33
38
|
class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -40,6 +45,8 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
40
45
|
|
|
41
46
|
def __init__(self, output_types=None, **kw):
|
|
42
47
|
super().__init__(_output_types=output_types, **kw)
|
|
48
|
+
if hasattr(self, "func"):
|
|
49
|
+
copy_func_scheduling_hints(self.func, self)
|
|
43
50
|
|
|
44
51
|
def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
|
|
45
52
|
dtypes = make_dtypes(dtypes)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import numpy as np
|
|
15
16
|
import pandas as pd
|
|
16
17
|
import pytest
|
|
@@ -102,7 +103,7 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
|
|
|
102
103
|
assert result.index_value is df1.index_value
|
|
103
104
|
assert result.dtypes.equals(df1.dtypes)
|
|
104
105
|
assert isinstance(result.op.func, MarkedFunction)
|
|
105
|
-
assert result.op.func is
|
|
106
|
+
assert result.op.func is process
|
|
106
107
|
assert result.op.func.resources is process.resources
|
|
107
108
|
assert result.op.func.pythonpacks is process.pythonpacks
|
|
108
109
|
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
from typing import List, Union
|
|
15
16
|
|
|
16
17
|
import pandas as pd
|
|
@@ -100,8 +101,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
100
101
|
row_length = 0
|
|
101
102
|
for series in objs:
|
|
102
103
|
row_length += series.shape[0]
|
|
103
|
-
if self.ignore_index:
|
|
104
|
-
|
|
104
|
+
if self.ignore_index:
|
|
105
|
+
idx_length = 0 if pd.isna(row_length) else row_length
|
|
106
|
+
index_value = parse_index(pd.RangeIndex(idx_length))
|
|
105
107
|
else:
|
|
106
108
|
index = self._concat_index(objs)
|
|
107
109
|
index_value = parse_index(index, objs)
|
|
@@ -159,8 +161,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
159
161
|
if self.join == "inner":
|
|
160
162
|
objs = [o[list(emtpy_result.columns)] for o in objs]
|
|
161
163
|
|
|
162
|
-
if self.ignore_index:
|
|
163
|
-
|
|
164
|
+
if self.ignore_index:
|
|
165
|
+
idx_length = 0 if pd.isna(row_length) else row_length
|
|
166
|
+
index_value = parse_index(pd.RangeIndex(idx_length))
|
|
164
167
|
else:
|
|
165
168
|
index = self._concat_index(objs)
|
|
166
169
|
index_value = parse_index(index, objs)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import logging
|
|
15
16
|
from abc import abstractmethod
|
|
16
17
|
from collections import namedtuple
|
|
@@ -16,10 +16,10 @@ import numpy as np
|
|
|
16
16
|
import pandas as pd
|
|
17
17
|
import pytest
|
|
18
18
|
|
|
19
|
+
from .... import dataframe as md
|
|
19
20
|
from ....tests.utils import assert_mf_index_dtype
|
|
20
21
|
from ...core import IndexValue
|
|
21
|
-
from
|
|
22
|
-
from .. import DataFrameMerge, concat
|
|
22
|
+
from .. import DataFrameMerge
|
|
23
23
|
from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
|
|
24
24
|
|
|
25
25
|
|
|
@@ -29,8 +29,8 @@ def test_merge():
|
|
|
29
29
|
)
|
|
30
30
|
df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
|
|
31
31
|
|
|
32
|
-
mdf1 =
|
|
33
|
-
mdf2 =
|
|
32
|
+
mdf1 = md.DataFrame(df1, chunk_size=2)
|
|
33
|
+
mdf2 = md.DataFrame(df2, chunk_size=3)
|
|
34
34
|
|
|
35
35
|
mapjoin = MapJoinHint()
|
|
36
36
|
dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
|
|
@@ -83,8 +83,8 @@ def test_merge_invalid_parameters():
|
|
|
83
83
|
)
|
|
84
84
|
pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
|
|
85
85
|
|
|
86
|
-
df1 =
|
|
87
|
-
df2 =
|
|
86
|
+
df1 = md.DataFrame(pdf1, chunk_size=2)
|
|
87
|
+
df2 = md.DataFrame(pdf2, chunk_size=3)
|
|
88
88
|
|
|
89
89
|
with pytest.raises(ValueError):
|
|
90
90
|
df1.merge(df2, bloom_filter="wrong")
|
|
@@ -104,8 +104,8 @@ def test_join():
|
|
|
104
104
|
df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1
|
|
105
105
|
df2 = pd.concat([df2, df2 + 1])
|
|
106
106
|
|
|
107
|
-
mdf1 =
|
|
108
|
-
mdf2 =
|
|
107
|
+
mdf1 = md.DataFrame(df1, chunk_size=2)
|
|
108
|
+
mdf2 = md.DataFrame(df2, chunk_size=2)
|
|
109
109
|
|
|
110
110
|
parameters = [
|
|
111
111
|
{"lsuffix": "l_", "rsuffix": "r_"},
|
|
@@ -132,8 +132,8 @@ def test_join_on():
|
|
|
132
132
|
)
|
|
133
133
|
df2 = pd.concat([df2, df2 + 1])
|
|
134
134
|
|
|
135
|
-
mdf1 =
|
|
136
|
-
mdf2 =
|
|
135
|
+
mdf1 = md.DataFrame(df1, chunk_size=2)
|
|
136
|
+
mdf2 = md.DataFrame(df2, chunk_size=2)
|
|
137
137
|
|
|
138
138
|
parameters = [
|
|
139
139
|
{"lsuffix": "l_", "rsuffix": "r_"},
|
|
@@ -157,15 +157,15 @@ def test_append():
|
|
|
157
157
|
df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
|
|
158
158
|
df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
|
|
159
159
|
|
|
160
|
-
mdf1 =
|
|
161
|
-
mdf2 =
|
|
160
|
+
mdf1 = md.DataFrame(df1, chunk_size=3)
|
|
161
|
+
mdf2 = md.DataFrame(df2, chunk_size=3)
|
|
162
162
|
adf = mdf1.append(mdf2)
|
|
163
163
|
|
|
164
164
|
assert adf.shape == (20, 4)
|
|
165
165
|
assert_mf_index_dtype(adf.index_value.value, np.int64)
|
|
166
166
|
|
|
167
|
-
mdf1 =
|
|
168
|
-
mdf2 =
|
|
167
|
+
mdf1 = md.DataFrame(df1, chunk_size=3)
|
|
168
|
+
mdf2 = md.DataFrame(df2, chunk_size=3)
|
|
169
169
|
adf = mdf1.append(mdf2, ignore_index=True)
|
|
170
170
|
|
|
171
171
|
assert adf.shape == (20, 4)
|
|
@@ -173,84 +173,135 @@ def test_append():
|
|
|
173
173
|
pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20))
|
|
174
174
|
|
|
175
175
|
|
|
176
|
-
def
|
|
176
|
+
def test_concat_dataframe():
|
|
177
|
+
# test index concatenate
|
|
177
178
|
df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
|
|
178
179
|
df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
|
|
179
180
|
|
|
180
|
-
mdf1 =
|
|
181
|
-
mdf2 =
|
|
182
|
-
r = concat([mdf1, mdf2], axis="index")
|
|
181
|
+
mdf1 = md.DataFrame(df1, chunk_size=4)
|
|
182
|
+
mdf2 = md.DataFrame(df2, chunk_size=4)
|
|
183
|
+
r = md.concat([mdf1, mdf2], axis="index")
|
|
183
184
|
|
|
184
185
|
assert r.shape == (20, 4)
|
|
185
186
|
assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
|
|
186
|
-
pd.testing.assert_series_equal(r.dtypes,
|
|
187
|
+
pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
|
|
187
188
|
|
|
188
|
-
|
|
189
|
-
|
|
189
|
+
# test index concatenate with range index
|
|
190
|
+
mdf3 = md.DataFrame(
|
|
191
|
+
np.random.rand(10, 4),
|
|
192
|
+
columns=list("ABCD"),
|
|
193
|
+
index=pd.RangeIndex(10, 20),
|
|
194
|
+
chunk_size=4,
|
|
190
195
|
)
|
|
191
|
-
|
|
192
|
-
mdf3 = from_pandas(df3, chunk_size=4)
|
|
193
|
-
r = concat([mdf1, mdf3], axis="index")
|
|
196
|
+
r = md.concat([mdf1, mdf3], axis="index")
|
|
194
197
|
|
|
195
198
|
assert r.shape == (20, 4)
|
|
196
|
-
pd.testing.assert_series_equal(r.dtypes,
|
|
199
|
+
pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
|
|
197
200
|
pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
|
|
198
201
|
|
|
202
|
+
# test index concatenate with perm index
|
|
199
203
|
df4 = pd.DataFrame(
|
|
200
204
|
np.random.rand(10, 4),
|
|
201
205
|
columns=list("ABCD"),
|
|
202
206
|
index=np.random.permutation(np.arange(10)),
|
|
203
207
|
)
|
|
204
208
|
|
|
205
|
-
|
|
206
|
-
|
|
209
|
+
# test concat with same index with different sources
|
|
210
|
+
mdf4 = md.DataFrame(df4, chunk_size=4)
|
|
211
|
+
r = md.concat([mdf1, mdf4], axis="index")
|
|
207
212
|
|
|
208
213
|
assert r.shape == (20, 4)
|
|
209
|
-
pd.testing.assert_series_equal(r.dtypes,
|
|
214
|
+
pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
|
|
210
215
|
pd.testing.assert_index_equal(
|
|
211
216
|
r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
212
217
|
)
|
|
213
218
|
|
|
214
|
-
r = concat([mdf4, mdf1], axis="index")
|
|
219
|
+
r = md.concat([mdf4, mdf1], axis="index")
|
|
215
220
|
|
|
216
221
|
assert r.shape == (20, 4)
|
|
217
|
-
pd.testing.assert_series_equal(r.dtypes,
|
|
222
|
+
pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
|
|
218
223
|
pd.testing.assert_index_equal(
|
|
219
224
|
r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
220
225
|
)
|
|
221
226
|
|
|
222
|
-
|
|
227
|
+
# test concat with same index with same source
|
|
228
|
+
r = md.concat([mdf4, mdf4], axis="index")
|
|
223
229
|
|
|
224
230
|
assert r.shape == (20, 4)
|
|
225
|
-
pd.testing.assert_series_equal(r.dtypes,
|
|
231
|
+
pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
|
|
226
232
|
pd.testing.assert_index_equal(
|
|
227
233
|
r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
228
234
|
)
|
|
229
235
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
236
|
+
# test concat with column outer join
|
|
237
|
+
mdf1 = md.DataFrame(df1, chunk_size=3)
|
|
238
|
+
mdf2 = md.DataFrame(df2, chunk_size=4)
|
|
239
|
+
r = md.concat([mdf1, mdf2], axis="columns")
|
|
233
240
|
|
|
234
241
|
assert r.shape == (10, 8)
|
|
235
242
|
expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes
|
|
236
243
|
pd.testing.assert_series_equal(r.dtypes, expected_dtypes)
|
|
237
244
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
r = concat([mdf1, mdf2], join="inner")
|
|
245
|
+
# test concat with column inner join
|
|
246
|
+
mdf1 = md.DataFrame(np.random.rand(10, 4), columns=list("ABCD"), chunk_size=3)
|
|
247
|
+
mdf2 = md.DataFrame(np.random.rand(10, 3), columns=list("ABC"), chunk_size=3)
|
|
248
|
+
r = md.concat([mdf1, mdf2], join="inner")
|
|
243
249
|
assert r.shape == (20, 3)
|
|
244
250
|
|
|
251
|
+
# test concat with ignore index
|
|
252
|
+
r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
|
|
253
|
+
assert r.shape == (20, 3)
|
|
254
|
+
pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
|
|
255
|
+
|
|
256
|
+
# test concat with unknown shapes
|
|
257
|
+
mdf1._shape = (np.nan, 4)
|
|
258
|
+
r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
|
|
259
|
+
np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
|
|
260
|
+
r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
|
|
261
|
+
np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
|
|
262
|
+
|
|
263
|
+
# test concat with empty frames
|
|
264
|
+
r = md.concat([md.DataFrame([]), mdf2], ignore_index=True)
|
|
265
|
+
assert r.shape == (10, 3)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def test_concat_series():
|
|
269
|
+
# test row concat
|
|
270
|
+
ms1 = md.Series(np.random.rand(10))
|
|
271
|
+
ms2 = md.Series(np.random.rand(10))
|
|
272
|
+
r = md.concat([ms1, ms2])
|
|
273
|
+
assert r.shape == (20,)
|
|
274
|
+
|
|
275
|
+
# test row concat with unknown shape
|
|
276
|
+
ms1._shape = (np.nan,)
|
|
277
|
+
r = md.concat([ms1, ms2])
|
|
278
|
+
assert np.isnan(r.shape[0])
|
|
279
|
+
r = md.concat([ms1, ms2], ignore_index=True)
|
|
280
|
+
assert np.isnan(r.shape[0])
|
|
281
|
+
|
|
282
|
+
# test col concat
|
|
283
|
+
ms1 = md.Series(np.random.rand(10))
|
|
284
|
+
ms2 = md.Series(np.random.rand(10))
|
|
285
|
+
r = md.concat([ms1, ms2], axis=1)
|
|
286
|
+
assert r.shape == (10, 2)
|
|
287
|
+
|
|
288
|
+
# test col concat with names
|
|
289
|
+
ms1.name = "col1"
|
|
290
|
+
ms2.name = "col2"
|
|
291
|
+
r = md.concat([ms1, ms2], axis=1)
|
|
292
|
+
assert r.shape == (10, 2)
|
|
293
|
+
assert r.dtypes.index.tolist() == ["col1", "col2"]
|
|
294
|
+
|
|
245
295
|
|
|
246
296
|
def test_invalid_join_hint():
|
|
247
|
-
|
|
248
|
-
np.arange(20).reshape((4, 5)) + 1,
|
|
297
|
+
mdf1 = md.DataFrame(
|
|
298
|
+
np.arange(20).reshape((4, 5)) + 1,
|
|
299
|
+
columns=["a", "b", "c", "d", "e"],
|
|
300
|
+
chunk_size=2,
|
|
301
|
+
)
|
|
302
|
+
mdf2 = md.DataFrame(
|
|
303
|
+
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"], chunk_size=3
|
|
249
304
|
)
|
|
250
|
-
df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
|
|
251
|
-
|
|
252
|
-
mdf1 = from_pandas(df1, chunk_size=2)
|
|
253
|
-
mdf2 = from_pandas(df2, chunk_size=3)
|
|
254
305
|
|
|
255
306
|
# type error
|
|
256
307
|
parameters = [
|
|
@@ -282,7 +333,6 @@ def test_invalid_join_hint():
|
|
|
282
333
|
]
|
|
283
334
|
|
|
284
335
|
for kw in parameters:
|
|
285
|
-
print(kw)
|
|
286
336
|
with pytest.raises(TypeError):
|
|
287
337
|
mdf1.merge(mdf2, **kw)
|
|
288
338
|
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import numpy as np
|
|
15
16
|
import pandas as pd
|
|
16
17
|
import pyarrow as pa
|
|
@@ -71,6 +72,12 @@ def test_pack_function(df1):
|
|
|
71
72
|
@pytest.mark.parametrize(
|
|
72
73
|
"dtype, fill_value, expected",
|
|
73
74
|
[
|
|
75
|
+
(
|
|
76
|
+
ArrowDtype(pa.list_(pa.string())) if ArrowDtype else None,
|
|
77
|
+
1,
|
|
78
|
+
["1"],
|
|
79
|
+
),
|
|
80
|
+
(pa.list_(pa.string()), 1, ["1"]),
|
|
74
81
|
(
|
|
75
82
|
ArrowDtype(pa.map_(pa.int32(), pa.string())) if ArrowDtype else None,
|
|
76
83
|
1,
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
from numbers import Number
|
|
15
16
|
|
|
16
17
|
from ...tensor import tensor as astensor
|
maxframe/dataframe/utils.py
CHANGED
|
@@ -463,6 +463,9 @@ def _generate_value(dtype, fill_value):
|
|
|
463
463
|
if ArrowDtype and isinstance(dtype, pd.ArrowDtype):
|
|
464
464
|
return _generate_value(dtype.pyarrow_dtype, fill_value)
|
|
465
465
|
|
|
466
|
+
if isinstance(dtype, pa.ListType):
|
|
467
|
+
return [_generate_value(dtype.value_type, fill_value)]
|
|
468
|
+
|
|
466
469
|
if isinstance(dtype, pa.MapType):
|
|
467
470
|
return [
|
|
468
471
|
(
|