maxframe 1.0.0rc3__cp311-cp311-win_amd64.whl → 1.1.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp311-win_amd64.pyd +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +16 -1
- maxframe/conftest.py +52 -14
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +26 -2
- maxframe/dataframe/datasource/read_odps_query.py +116 -28
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +8 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +314 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +23 -2
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +28 -8
- maxframe/io/odpsio/tableio.py +55 -133
- maxframe/io/odpsio/tests/test_schema.py +40 -4
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +36 -6
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +10 -1
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +64 -14
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +28 -10
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +104 -20
- maxframe_client/session/task.py +42 -26
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +44 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -20,6 +20,7 @@ from .... import opcodes
|
|
|
20
20
|
from ....core import OutputType
|
|
21
21
|
from ....dataframe import DataFrame
|
|
22
22
|
from ....tensor.core import TENSOR_TYPE
|
|
23
|
+
from ....udf import with_running_options
|
|
23
24
|
from ... import eval as maxframe_eval
|
|
24
25
|
from ... import get_dummies, to_numeric
|
|
25
26
|
from ...arithmetic import DataFrameGreater, DataFrameLess
|
|
@@ -65,6 +66,17 @@ def test_transform():
|
|
|
65
66
|
assert r.op._op_type_ == opcodes.TRANSFORM
|
|
66
67
|
assert r.op.output_types[0] == OutputType.dataframe
|
|
67
68
|
|
|
69
|
+
def transform_df_with_param(row, param, k):
|
|
70
|
+
assert param == 5
|
|
71
|
+
assert k == "6"
|
|
72
|
+
return row
|
|
73
|
+
|
|
74
|
+
r = df.transform(transform_df_with_param, 1, 5, k="6")
|
|
75
|
+
assert all(v == np.dtype("int64") for v in r.dtypes) is True
|
|
76
|
+
assert r.shape == df.shape
|
|
77
|
+
assert r.op._op_type_ == opcodes.TRANSFORM
|
|
78
|
+
assert r.op.output_types[0] == OutputType.dataframe
|
|
79
|
+
|
|
68
80
|
r = df.transform(lambda x: list(range(len(x))), axis=1)
|
|
69
81
|
assert all(v == np.dtype("int64") for v in r.dtypes) is True
|
|
70
82
|
assert r.shape == df.shape
|
|
@@ -349,7 +361,9 @@ def test_drop():
|
|
|
349
361
|
def test_drop_duplicates():
|
|
350
362
|
rs = np.random.RandomState(0)
|
|
351
363
|
raw = pd.DataFrame(
|
|
352
|
-
rs.randint(1000, size=(20, 7)),
|
|
364
|
+
rs.randint(1000, size=(20, 7)),
|
|
365
|
+
columns=["c" + str(i + 1) for i in range(7)],
|
|
366
|
+
index=pd.Index(range(20), name="idx"),
|
|
353
367
|
)
|
|
354
368
|
raw["c7"] = [f"s{j}" for j in range(20)]
|
|
355
369
|
|
|
@@ -361,6 +375,12 @@ def test_drop_duplicates():
|
|
|
361
375
|
with pytest.raises(KeyError):
|
|
362
376
|
df.drop_duplicates(subset="c8")
|
|
363
377
|
|
|
378
|
+
# check index
|
|
379
|
+
distinct_df = df.drop_duplicates()
|
|
380
|
+
assert distinct_df.index_value.name == df.index_value.name
|
|
381
|
+
assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
|
|
382
|
+
assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
|
|
383
|
+
|
|
364
384
|
s = df["c7"]
|
|
365
385
|
with pytest.raises(ValueError):
|
|
366
386
|
s.drop_duplicates(method="unknown")
|
|
@@ -436,6 +456,7 @@ def test_apply():
|
|
|
436
456
|
|
|
437
457
|
keys = [1, 2]
|
|
438
458
|
|
|
459
|
+
@with_running_options(engine="spe")
|
|
439
460
|
def f(x, keys):
|
|
440
461
|
if x["a"] in keys:
|
|
441
462
|
return [1, 0]
|
|
@@ -451,6 +472,7 @@ def test_apply():
|
|
|
451
472
|
keys=keys,
|
|
452
473
|
)
|
|
453
474
|
assert apply_df.shape == (3, 2)
|
|
475
|
+
assert apply_df.op.expect_engine == "SPE"
|
|
454
476
|
|
|
455
477
|
|
|
456
478
|
def test_pivot_table():
|
|
@@ -474,7 +496,7 @@ def test_pivot_table():
|
|
|
474
496
|
with pytest.raises(ValueError):
|
|
475
497
|
df.pivot_table(values=["D", "E"], aggfunc="sum")
|
|
476
498
|
|
|
477
|
-
t = df.pivot_table(index="A")
|
|
499
|
+
t = df.pivot_table(index=["A", "B", "C"])
|
|
478
500
|
assert isinstance(t.op, DataFrameGroupByAgg)
|
|
479
501
|
t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
|
|
480
502
|
assert isinstance(t.op, DataFrameGroupByAgg)
|
|
@@ -27,6 +27,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
|
27
27
|
from ..utils import (
|
|
28
28
|
build_df,
|
|
29
29
|
build_series,
|
|
30
|
+
copy_func_scheduling_hints,
|
|
30
31
|
make_dtypes,
|
|
31
32
|
pack_func_args,
|
|
32
33
|
parse_index,
|
|
@@ -49,10 +50,12 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
49
50
|
|
|
50
51
|
def __init__(self, output_types=None, memory_scale=None, **kw):
|
|
51
52
|
super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
|
|
53
|
+
if hasattr(self, "func"):
|
|
54
|
+
copy_func_scheduling_hints(self.func, self)
|
|
52
55
|
|
|
53
56
|
def _infer_df_func_returns(self, df, dtypes):
|
|
54
|
-
packed_funcs = self.
|
|
55
|
-
test_df =
|
|
57
|
+
packed_funcs = self.func
|
|
58
|
+
test_df = _build_stub_pandas_obj(df, self.output_types[0])
|
|
56
59
|
if self.output_types[0] == OutputType.dataframe:
|
|
57
60
|
try:
|
|
58
61
|
with np.errstate(all="ignore"), quiet_stdio():
|
|
@@ -147,16 +150,18 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
147
150
|
index_value=new_index_value,
|
|
148
151
|
)
|
|
149
152
|
|
|
150
|
-
def get_packed_funcs(self, df=None) -> Any:
|
|
151
|
-
stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
|
|
152
|
-
return pack_func_args(stub_df, self.func, *self.args, **self.kwds)
|
|
153
153
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
154
|
+
def get_packed_funcs(df, output_type, func, *args, **kwds) -> Any:
|
|
155
|
+
stub_df = _build_stub_pandas_obj(df, output_type)
|
|
156
|
+
return pack_func_args(stub_df, func, *args, **kwds)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _build_stub_pandas_obj(df, output_type) -> Union[DataFrame, Series]:
|
|
160
|
+
# TODO: Simulate a dataframe with the corresponding indexes if self.func is
|
|
161
|
+
# a dict and axis=1
|
|
162
|
+
if output_type == OutputType.dataframe:
|
|
163
|
+
return build_df(df, fill_value=1, size=1)
|
|
164
|
+
return build_series(df, size=1, name=df.name)
|
|
160
165
|
|
|
161
166
|
|
|
162
167
|
def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):
|
|
@@ -229,13 +234,15 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
|
|
|
229
234
|
1 2 3
|
|
230
235
|
2 3 4
|
|
231
236
|
"""
|
|
237
|
+
call_agg = kwargs.pop("_call_agg", False)
|
|
238
|
+
func = get_packed_funcs(df, OutputType.dataframe, func, *args, **kwargs)
|
|
232
239
|
op = TransformOperator(
|
|
233
240
|
func=func,
|
|
234
241
|
axis=axis,
|
|
235
242
|
args=args,
|
|
236
243
|
kwds=kwargs,
|
|
237
244
|
output_types=[OutputType.dataframe],
|
|
238
|
-
call_agg=
|
|
245
|
+
call_agg=call_agg,
|
|
239
246
|
)
|
|
240
247
|
return op(df, dtypes=dtypes, skip_infer=skip_infer)
|
|
241
248
|
|
|
@@ -319,6 +326,8 @@ def series_transform(
|
|
|
319
326
|
1 2 3
|
|
320
327
|
2 3 4
|
|
321
328
|
"""
|
|
329
|
+
call_agg = kwargs.pop("_call_agg", False)
|
|
330
|
+
func = get_packed_funcs(series, OutputType.series, func, *args, **kwargs)
|
|
322
331
|
op = TransformOperator(
|
|
323
332
|
func=func,
|
|
324
333
|
axis=axis,
|
|
@@ -326,7 +335,7 @@ def series_transform(
|
|
|
326
335
|
args=args,
|
|
327
336
|
kwds=kwargs,
|
|
328
337
|
output_types=[OutputType.series],
|
|
329
|
-
call_agg=
|
|
338
|
+
call_agg=call_agg,
|
|
330
339
|
)
|
|
331
340
|
dtypes = (series.name, dtype) if dtype is not None else None
|
|
332
341
|
return op(series, dtypes=dtypes, skip_infer=skip_infer)
|
|
@@ -25,6 +25,7 @@ from .custom_reduction import DataFrameCustomReduction
|
|
|
25
25
|
from .kurtosis import DataFrameKurtosis
|
|
26
26
|
from .max import DataFrameMax
|
|
27
27
|
from .mean import DataFrameMean
|
|
28
|
+
from .median import DataFrameMedian
|
|
28
29
|
from .min import DataFrameMin
|
|
29
30
|
from .nunique import DataFrameNunique
|
|
30
31
|
from .prod import DataFrameProd
|
|
@@ -50,6 +51,7 @@ def _install():
|
|
|
50
51
|
from .kurtosis import kurt_dataframe, kurt_series
|
|
51
52
|
from .max import max_dataframe, max_index, max_series
|
|
52
53
|
from .mean import mean_dataframe, mean_series
|
|
54
|
+
from .median import median_dataframe, median_series
|
|
53
55
|
from .min import min_dataframe, min_index, min_series
|
|
54
56
|
from .nunique import nunique_dataframe, nunique_series
|
|
55
57
|
from .prod import prod_dataframe, prod_series
|
|
@@ -68,6 +70,7 @@ def _install():
|
|
|
68
70
|
("min", min_series, min_dataframe),
|
|
69
71
|
("count", count_series, count_dataframe),
|
|
70
72
|
("mean", mean_series, mean_dataframe),
|
|
73
|
+
("median", median_series, median_dataframe),
|
|
71
74
|
("var", var_series, var_dataframe),
|
|
72
75
|
("std", std_series, std_dataframe),
|
|
73
76
|
("all", all_series, all_dataframe),
|
|
@@ -71,6 +71,7 @@ _agg_functions = {
|
|
|
71
71
|
"kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
|
|
72
72
|
"kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
|
|
73
73
|
"nunique": lambda x: x.nunique(),
|
|
74
|
+
"median": lambda x, skipna=True: x.median(skipna=skipna),
|
|
74
75
|
}
|
|
75
76
|
|
|
76
77
|
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ... import opcodes
|
|
16
|
+
from ...core import OutputType
|
|
17
|
+
from .core import DataFrameReductionMixin, DataFrameReductionOperator
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DataFrameMedian(DataFrameReductionOperator, DataFrameReductionMixin):
|
|
21
|
+
_op_type_ = opcodes.MEDIAN
|
|
22
|
+
_func_name = "median"
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def is_atomic(self):
|
|
26
|
+
return True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def median_series(df, axis=None, skipna=True, level=None, method=None):
|
|
30
|
+
op = DataFrameMedian(
|
|
31
|
+
axis=axis,
|
|
32
|
+
skipna=skipna,
|
|
33
|
+
level=level,
|
|
34
|
+
output_types=[OutputType.scalar if level is not None else OutputType.scalar],
|
|
35
|
+
method=method,
|
|
36
|
+
)
|
|
37
|
+
return op(df)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def median_dataframe(
|
|
41
|
+
df,
|
|
42
|
+
axis=0,
|
|
43
|
+
skipna=True,
|
|
44
|
+
level=None,
|
|
45
|
+
numeric_only=None,
|
|
46
|
+
method=None,
|
|
47
|
+
):
|
|
48
|
+
op = DataFrameMedian(
|
|
49
|
+
axis=axis,
|
|
50
|
+
skipna=skipna,
|
|
51
|
+
level=level,
|
|
52
|
+
numeric_only=numeric_only,
|
|
53
|
+
output_types=[OutputType.dataframe if level is not None else OutputType.series],
|
|
54
|
+
method=method,
|
|
55
|
+
)
|
|
56
|
+
return op(df)
|
|
@@ -23,6 +23,7 @@ import pytest
|
|
|
23
23
|
|
|
24
24
|
from .... import dataframe as md
|
|
25
25
|
from ....tensor import Tensor
|
|
26
|
+
from ....tests.utils import assert_mf_index_dtype
|
|
26
27
|
from ...core import DataFrame, IndexValue, OutputType, Series
|
|
27
28
|
from ...datasource.dataframe import from_pandas as from_pandas_df
|
|
28
29
|
from ...datasource.series import from_pandas as from_pandas_series
|
|
@@ -38,6 +39,7 @@ from .. import (
|
|
|
38
39
|
DataFrameKurtosis,
|
|
39
40
|
DataFrameMax,
|
|
40
41
|
DataFrameMean,
|
|
42
|
+
DataFrameMedian,
|
|
41
43
|
DataFrameMin,
|
|
42
44
|
DataFrameNunique,
|
|
43
45
|
DataFrameProd,
|
|
@@ -71,6 +73,7 @@ reduction_functions = [
|
|
|
71
73
|
("sem", DataFrameSem, FunctionOptions()),
|
|
72
74
|
("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
|
|
73
75
|
("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
|
|
76
|
+
("median", DataFrameMedian, FunctionOptions()),
|
|
74
77
|
]
|
|
75
78
|
|
|
76
79
|
|
|
@@ -111,10 +114,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
|
|
|
111
114
|
reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()
|
|
112
115
|
|
|
113
116
|
assert isinstance(reduction_df, Series)
|
|
114
|
-
|
|
115
|
-
reduction_df.index_value._index_value,
|
|
116
|
-
(IndexValue.RangeIndex, IndexValue.Int64Index),
|
|
117
|
-
)
|
|
117
|
+
assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
|
|
118
118
|
assert reduction_df.shape == (10,)
|
|
119
119
|
|
|
120
120
|
data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
|
|
@@ -210,6 +210,7 @@ def test_dataframe_aggregate():
|
|
|
210
210
|
"skew",
|
|
211
211
|
"kurt",
|
|
212
212
|
"sem",
|
|
213
|
+
"median",
|
|
213
214
|
]
|
|
214
215
|
|
|
215
216
|
df = from_pandas_df(data)
|
|
@@ -253,7 +254,7 @@ def test_dataframe_aggregate():
|
|
|
253
254
|
assert result.op.output_types[0] == OutputType.dataframe
|
|
254
255
|
assert result.op.func == agg_funcs
|
|
255
256
|
|
|
256
|
-
dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]}
|
|
257
|
+
dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std", "median"]}
|
|
257
258
|
all_cols = set(
|
|
258
259
|
reduce(
|
|
259
260
|
operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()]
|
|
@@ -268,9 +269,9 @@ def test_dataframe_aggregate():
|
|
|
268
269
|
assert result.op.func[2] == dict_fun[2]
|
|
269
270
|
|
|
270
271
|
with pytest.raises(TypeError):
|
|
271
|
-
df.agg(sum_0="sum", mean_0="mean")
|
|
272
|
+
df.agg(sum_0="sum", mean_0="mean", median_0="median")
|
|
272
273
|
with pytest.raises(NotImplementedError):
|
|
273
|
-
df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1)
|
|
274
|
+
df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std", "median"]}, axis=1)
|
|
274
275
|
|
|
275
276
|
|
|
276
277
|
def test_series_aggregate():
|
|
@@ -287,6 +288,7 @@ def test_series_aggregate():
|
|
|
287
288
|
"skew",
|
|
288
289
|
"kurt",
|
|
289
290
|
"sem",
|
|
291
|
+
"median",
|
|
290
292
|
]
|
|
291
293
|
|
|
292
294
|
series = from_pandas_series(data)
|
|
@@ -303,6 +305,14 @@ def test_series_aggregate():
|
|
|
303
305
|
assert result.shape == ()
|
|
304
306
|
assert result.op.output_types[0] == OutputType.scalar
|
|
305
307
|
|
|
308
|
+
result = series.agg("median")
|
|
309
|
+
assert result.shape == ()
|
|
310
|
+
assert result.op.output_types[0] == OutputType.scalar
|
|
311
|
+
|
|
312
|
+
result = series.median(level=0)
|
|
313
|
+
assert result.shape == (np.nan,)
|
|
314
|
+
assert result.op.output_types[0] == OutputType.series
|
|
315
|
+
|
|
306
316
|
result = series.agg(agg_funcs)
|
|
307
317
|
assert result.shape == (len(agg_funcs),)
|
|
308
318
|
assert list(result.index_value.to_pandas()) == agg_funcs
|
|
@@ -81,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
81
81
|
store_index_value = False
|
|
82
82
|
else:
|
|
83
83
|
q_val = np.asanyarray(self.q)
|
|
84
|
-
|
|
84
|
+
if q_val.ndim == 0:
|
|
85
|
+
pd_index = pd.Index(q_val.reshape(1))
|
|
86
|
+
else:
|
|
87
|
+
pd_index = pd.Index(q_val)
|
|
85
88
|
name = self.q if q_val.size == 1 else None
|
|
86
89
|
store_index_value = True
|
|
87
90
|
tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
|
|
@@ -164,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
164
167
|
store_index_value = False
|
|
165
168
|
else:
|
|
166
169
|
q_val = np.asanyarray(self.q)
|
|
167
|
-
|
|
170
|
+
if q_val.ndim == 0:
|
|
171
|
+
index_val = pd.Index(q_val.reshape(1))
|
|
172
|
+
else:
|
|
173
|
+
index_val = pd.Index(q_val)
|
|
168
174
|
store_index_value = True
|
|
169
175
|
|
|
170
176
|
# get dtype by tensor
|
|
@@ -49,7 +49,7 @@ def test_dataframe_quantile():
|
|
|
49
49
|
|
|
50
50
|
# q = 0.3, axis = 0
|
|
51
51
|
r = s.quantile(0.3)
|
|
52
|
-
e = raw.quantile(0.3)
|
|
52
|
+
e = raw.quantile(0.3, numeric_only=True)
|
|
53
53
|
assert isinstance(r, Series)
|
|
54
54
|
assert r.shape == (2,)
|
|
55
55
|
assert r.dtype == e.dtype
|
|
@@ -57,7 +57,7 @@ def test_dataframe_quantile():
|
|
|
57
57
|
|
|
58
58
|
# q = 0.3, axis = 1
|
|
59
59
|
r = s.quantile(0.3, axis=1)
|
|
60
|
-
e = raw.quantile(0.3, axis=1)
|
|
60
|
+
e = raw.quantile(0.3, numeric_only=True, axis=1)
|
|
61
61
|
assert isinstance(r, Series)
|
|
62
62
|
assert r.shape == e.shape
|
|
63
63
|
assert r.dtype == e.dtype
|
|
@@ -65,7 +65,7 @@ def test_dataframe_quantile():
|
|
|
65
65
|
|
|
66
66
|
# q = [0.3, 0.7], axis = 0
|
|
67
67
|
r = s.quantile([0.3, 0.7])
|
|
68
|
-
e = raw.quantile([0.3, 0.7])
|
|
68
|
+
e = raw.quantile([0.3, 0.7], numeric_only=True)
|
|
69
69
|
assert isinstance(r, DataFrame)
|
|
70
70
|
assert r.shape == e.shape
|
|
71
71
|
pd.testing.assert_series_equal(r.dtypes, e.dtypes)
|
|
@@ -74,7 +74,7 @@ def test_dataframe_quantile():
|
|
|
74
74
|
|
|
75
75
|
# q = [0.3, 0.7], axis = 1
|
|
76
76
|
r = s.quantile([0.3, 0.7], axis=1)
|
|
77
|
-
e = raw.quantile([0.3, 0.7], axis=1)
|
|
77
|
+
e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
|
|
78
78
|
assert isinstance(r, DataFrame)
|
|
79
79
|
assert r.shape == e.shape
|
|
80
80
|
pd.testing.assert_series_equal(r.dtypes, e.dtypes)
|
|
@@ -13,12 +13,13 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import pandas as pd
|
|
16
|
+
import pytest
|
|
16
17
|
|
|
17
18
|
from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
|
|
18
|
-
from ..initializer import read_pandas
|
|
19
|
+
from ..initializer import DataFrame, Series, read_pandas
|
|
19
20
|
|
|
20
21
|
|
|
21
|
-
def
|
|
22
|
+
def test_read_pandas():
|
|
22
23
|
df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
|
|
23
24
|
assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
|
|
24
25
|
|
|
@@ -27,3 +28,33 @@ def test_from_pandas():
|
|
|
27
28
|
|
|
28
29
|
idx_data = pd.Index(["a", "b"])
|
|
29
30
|
assert isinstance(read_pandas(idx_data), INDEX_TYPE)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_init_dataframe_from_maxframe_series():
|
|
34
|
+
s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
|
|
35
|
+
|
|
36
|
+
df = DataFrame(s, index=s.index, columns=["col1"])
|
|
37
|
+
|
|
38
|
+
assert isinstance(df, DATAFRAME_TYPE)
|
|
39
|
+
assert df.dtypes.index == ["col1"]
|
|
40
|
+
|
|
41
|
+
with pytest.raises(ValueError):
|
|
42
|
+
DataFrame(s, index=s.index, columns=[])
|
|
43
|
+
|
|
44
|
+
with pytest.raises(ValueError):
|
|
45
|
+
DataFrame(s, index=s.index, columns="col1")
|
|
46
|
+
|
|
47
|
+
with pytest.raises(ValueError):
|
|
48
|
+
DataFrame(s, index=s.index, columns="col2")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_init_dataframe_from_maxframe_dataframe():
|
|
52
|
+
df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
|
|
53
|
+
|
|
54
|
+
df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
|
|
55
|
+
|
|
56
|
+
assert isinstance(df2, DATAFRAME_TYPE)
|
|
57
|
+
assert list(df2.dtypes.index) == ["col1", "col2"]
|
|
58
|
+
|
|
59
|
+
with pytest.raises(ValueError):
|
|
60
|
+
DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import pytest
|
|
17
|
+
|
|
18
|
+
from ...udf import MarkedFunction, with_python_requirements, with_resources
|
|
19
|
+
from ..utils import pack_func_args
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.fixture
|
|
23
|
+
def df1():
|
|
24
|
+
return pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_pack_function(df1):
|
|
28
|
+
# pack normal function
|
|
29
|
+
@with_resources("a.zip")
|
|
30
|
+
def keep(df):
|
|
31
|
+
return df
|
|
32
|
+
|
|
33
|
+
f = pack_func_args(df1, keep)
|
|
34
|
+
assert f(df1).equals(df1)
|
|
35
|
+
assert isinstance(f, MarkedFunction)
|
|
36
|
+
assert f.resources == ["a.zip"]
|
|
37
|
+
|
|
38
|
+
# pack with args
|
|
39
|
+
@with_python_requirements("numpy")
|
|
40
|
+
def add(a, b):
|
|
41
|
+
return a + b
|
|
42
|
+
|
|
43
|
+
f = pack_func_args(df1, add, 1)
|
|
44
|
+
assert f(df1).equals(df1 + 1)
|
|
45
|
+
assert isinstance(f, MarkedFunction)
|
|
46
|
+
assert f.pythonpacks[0].requirements == ("numpy",)
|
|
47
|
+
|
|
48
|
+
f = pack_func_args(df1, np.sum)
|
|
49
|
+
assert f(df1).equals(np.sum(df1))
|
|
50
|
+
|
|
51
|
+
@with_resources("a.txt")
|
|
52
|
+
@with_python_requirements("pandas")
|
|
53
|
+
def times_add(df, param, times):
|
|
54
|
+
return df * times + param
|
|
55
|
+
|
|
56
|
+
f = pack_func_args(df1, times_add, 5, 6)
|
|
57
|
+
assert f(df1).equals(df1 * 6 + 5)
|
|
58
|
+
assert isinstance(f, MarkedFunction)
|
|
59
|
+
assert f.resources == ["a.txt"]
|
|
60
|
+
assert f.pythonpacks[0].requirements == ("pandas",)
|
maxframe/dataframe/utils.py
CHANGED
|
@@ -20,7 +20,7 @@ import operator
|
|
|
20
20
|
import sys
|
|
21
21
|
from contextlib import contextmanager
|
|
22
22
|
from numbers import Integral
|
|
23
|
-
from typing import Any, Callable, List
|
|
23
|
+
from typing import TYPE_CHECKING, Any, Callable, List
|
|
24
24
|
|
|
25
25
|
import numpy as np
|
|
26
26
|
import pandas as pd
|
|
@@ -30,6 +30,7 @@ from pandas.core.dtypes.inference import is_dict_like, is_list_like
|
|
|
30
30
|
|
|
31
31
|
from ..core import Entity, ExecutableTuple
|
|
32
32
|
from ..lib.mmh3 import hash as mmh_hash
|
|
33
|
+
from ..udf import MarkedFunction
|
|
33
34
|
from ..utils import (
|
|
34
35
|
ModulePlaceholder,
|
|
35
36
|
is_full_slice,
|
|
@@ -44,6 +45,9 @@ try:
|
|
|
44
45
|
except ImportError: # pragma: no cover
|
|
45
46
|
pa = ModulePlaceholder("pyarrow")
|
|
46
47
|
|
|
48
|
+
if TYPE_CHECKING:
|
|
49
|
+
from .operators import DataFrameOperator
|
|
50
|
+
|
|
47
51
|
cudf = lazy_import("cudf", rename="cudf")
|
|
48
52
|
vineyard = lazy_import("vineyard")
|
|
49
53
|
try:
|
|
@@ -263,12 +267,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
|
|
|
263
267
|
return IndexValue(_index_value=_serialize_index(index_value))
|
|
264
268
|
|
|
265
269
|
|
|
266
|
-
def gen_unknown_index_value(index_value, *args):
|
|
270
|
+
def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
|
|
271
|
+
"""
|
|
272
|
+
Generate new index value with the same likes of given index_value and args, but without any value.
|
|
273
|
+
|
|
274
|
+
Parameters
|
|
275
|
+
----------
|
|
276
|
+
index_value
|
|
277
|
+
Given index value.
|
|
278
|
+
args
|
|
279
|
+
Arguments for parse_index.
|
|
280
|
+
normalize_range_index
|
|
281
|
+
If normalize range index to normal index.
|
|
282
|
+
|
|
283
|
+
Returns
|
|
284
|
+
-------
|
|
285
|
+
New created range index value.
|
|
286
|
+
"""
|
|
267
287
|
pd_index = index_value.to_pandas()
|
|
268
|
-
if isinstance(pd_index, pd.RangeIndex):
|
|
269
|
-
return parse_index(pd.RangeIndex(-1), *args)
|
|
288
|
+
if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
|
|
289
|
+
return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
|
|
270
290
|
elif not isinstance(pd_index, pd.MultiIndex):
|
|
271
|
-
return parse_index(
|
|
291
|
+
return parse_index(
|
|
292
|
+
pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
|
|
293
|
+
)
|
|
272
294
|
else:
|
|
273
295
|
i = pd.MultiIndex.from_arrays(
|
|
274
296
|
[c[:0] for c in pd_index.levels], names=pd_index.names
|
|
@@ -1160,7 +1182,65 @@ def patch_sa_engine_execute():
|
|
|
1160
1182
|
Engine.execute = execute
|
|
1161
1183
|
|
|
1162
1184
|
|
|
1163
|
-
def
|
|
1185
|
+
def bind_func_args_from_pos(func, args_bind_position, *bound_args, **bound_kwargs):
|
|
1186
|
+
"""
|
|
1187
|
+
Create a new function with arguments bound from specified position.
|
|
1188
|
+
|
|
1189
|
+
Parameters
|
|
1190
|
+
----------
|
|
1191
|
+
func : callable
|
|
1192
|
+
Target function to be wrapped.
|
|
1193
|
+
args_bind_position : int
|
|
1194
|
+
Position to start binding arguments (0-based).
|
|
1195
|
+
e.g., n=0 binds from first arg, n=1 binds from second arg.
|
|
1196
|
+
*bound_args : tuple
|
|
1197
|
+
Arguments to be bound from position n.
|
|
1198
|
+
**bound_kwargs : dict
|
|
1199
|
+
Keyword arguments to be bound.
|
|
1200
|
+
|
|
1201
|
+
Returns
|
|
1202
|
+
-------
|
|
1203
|
+
callable
|
|
1204
|
+
Wrapped function with bound arguments.
|
|
1205
|
+
|
|
1206
|
+
Examples
|
|
1207
|
+
--------
|
|
1208
|
+
>>> def func(x, y, z=0):
|
|
1209
|
+
... return x * y + z
|
|
1210
|
+
>>> f = bind_func_args_from_pos(func, 0, 10) # bind from second position
|
|
1211
|
+
>>> f(5) # equals func(5, 10)
|
|
1212
|
+
10
|
|
1213
|
+
|
|
1214
|
+
Raises
|
|
1215
|
+
------
|
|
1216
|
+
TypeError
|
|
1217
|
+
If func is not callable or n is not an integer.
|
|
1218
|
+
ValueError
|
|
1219
|
+
If n is negative or exceeds the number of parameters.
|
|
1220
|
+
"""
|
|
1221
|
+
|
|
1222
|
+
@functools.wraps(func)
|
|
1223
|
+
def wrapper(*runtime_args, **runtime_kwargs):
|
|
1224
|
+
try:
|
|
1225
|
+
# Combine arguments
|
|
1226
|
+
all_args = (
|
|
1227
|
+
runtime_args[:args_bind_position]
|
|
1228
|
+
+ bound_args
|
|
1229
|
+
+ runtime_args[args_bind_position:]
|
|
1230
|
+
)
|
|
1231
|
+
all_kwargs = {**bound_kwargs, **runtime_kwargs}
|
|
1232
|
+
|
|
1233
|
+
return func(*all_args, **all_kwargs)
|
|
1234
|
+
except Exception as e:
|
|
1235
|
+
# Enhance error message with context
|
|
1236
|
+
raise type(e)(
|
|
1237
|
+
f"Error calling {func.__name__} with bound arguments: {str(e)}"
|
|
1238
|
+
) from e
|
|
1239
|
+
|
|
1240
|
+
return wrapper
|
|
1241
|
+
|
|
1242
|
+
|
|
1243
|
+
def pack_func_args(df, funcs, *args, args_bind_position=1, **kwargs) -> Any:
|
|
1164
1244
|
"""
|
|
1165
1245
|
Pack the funcs with args and kwargs to avoid the ambiguity between other
|
|
1166
1246
|
positional and keyword arguments. It will process the funcs by the following rule:
|
|
@@ -1189,6 +1269,9 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
|
|
|
1189
1269
|
The DataFrame or Series object to test the function.
|
|
1190
1270
|
funcs : function, str, list-like or dict-like
|
|
1191
1271
|
Function to pack. It should have the same type with Dataframe.transform().
|
|
1272
|
+
args_bind_position: int
|
|
1273
|
+
Position to start binding arguments (0-based).
|
|
1274
|
+
e.g., n=0 binds from first arg, n=1 binds from second arg.
|
|
1192
1275
|
*args :
|
|
1193
1276
|
The positional arguments to func. If funcs contains many functions, each one
|
|
1194
1277
|
should be able to accept *args.
|
|
@@ -1219,8 +1302,19 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
|
|
|
1219
1302
|
|
|
1220
1303
|
f = get_callable_by_name(df, funcs) if isinstance(funcs, str) else funcs
|
|
1221
1304
|
|
|
1305
|
+
from ..udf import MarkedFunction
|
|
1306
|
+
|
|
1307
|
+
if isinstance(f, MarkedFunction):
|
|
1308
|
+
# for marked function, pack the inner function, and reset as mark function
|
|
1309
|
+
packed_func = f.copy()
|
|
1310
|
+
packed_func.func = bind_func_args_from_pos(
|
|
1311
|
+
f.func, args_bind_position, *args, **kwargs
|
|
1312
|
+
)
|
|
1313
|
+
else:
|
|
1314
|
+
packed_func = bind_func_args_from_pos(f, args_bind_position, *args, **kwargs)
|
|
1315
|
+
|
|
1222
1316
|
# Callable
|
|
1223
|
-
return
|
|
1317
|
+
return packed_func
|
|
1224
1318
|
|
|
1225
1319
|
|
|
1226
1320
|
def get_callable_by_name(df: Any, func_name: str) -> Callable:
|
|
@@ -1262,3 +1356,12 @@ def get_callable_by_name(df: Any, func_name: str) -> Callable:
|
|
|
1262
1356
|
raise AttributeError(
|
|
1263
1357
|
f"'{func_name}' is not a valid function for '{type(df).__name__}' object"
|
|
1264
1358
|
)
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
def copy_func_scheduling_hints(func, op: "DataFrameOperator") -> None:
|
|
1362
|
+
if not isinstance(func, MarkedFunction):
|
|
1363
|
+
return
|
|
1364
|
+
if func.expect_engine:
|
|
1365
|
+
op.expect_engine = func.expect_engine
|
|
1366
|
+
if func.expect_resources:
|
|
1367
|
+
op.expect_resources = func.expect_resources
|