maxframe 1.0.0rc4__cp310-cp310-win32.whl → 1.1.1__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp310-win32.pyd +0 -0
- maxframe/config/__init__.py +1 -1
- maxframe/config/config.py +26 -0
- maxframe/config/tests/test_config.py +20 -1
- maxframe/conftest.py +17 -4
- maxframe/core/graph/core.cp310-win32.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +65 -35
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/__init__.py +1 -1
- maxframe/io/odpsio/arrow.py +51 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +80 -124
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/mmh3.cp310-win32.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cp310-win32.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +70 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +12 -2
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/merge/vstack.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +42 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +23 -8
- maxframe_client/session/odps.py +40 -11
- maxframe_client/session/task.py +6 -25
- maxframe_client/session/tests/test_task.py +35 -6
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import List
|
|
16
|
+
|
|
17
|
+
from ... import opcodes
|
|
18
|
+
from ...core import OutputType
|
|
19
|
+
from ...serialization.serializables import ListField
|
|
20
|
+
from ...serialization.serializables.field_type import FieldTypes
|
|
21
|
+
from ..core import DataFrame
|
|
22
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
23
|
+
from ..utils import make_dtypes, parse_index
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator that flattens a series of JSON strings into a Series or DataFrame.

    The kind of output (series or dataframe) is taken from the operator's
    output types, which the caller sets when constructing the operator.
    """

    _op_type_ = opcodes.FLATJSON

    # JSON query paths, one per generated column.
    query_paths = ListField("query_paths", field_type=FieldTypes.string, default=None)

    def __call__(self, series, dtypes):
        """
        Build the output tileable for ``series``.

        Parameters
        ----------
        series : Series
            Input series holding the JSON strings.
        dtypes : tuple or Series-like
            A ``(name, dtype)`` pair when the output type is a series,
            otherwise the dtypes of the generated dataframe columns.

        Returns
        -------
        DataFrame or Series
            A series sharing the shape and index of the input, or a dataframe
            with one column per entry of ``dtypes`` and the input's index.
        """
        if self._output_types[0] == OutputType.series:
            name, dtype = dtypes
            return self.new_series(
                [series],
                shape=series.shape,
                index_value=series.index_value,
                name=name,
                dtype=dtype,
            )

        # Normalize before touching ``.index`` so that dtypes given in any form
        # ``make_dtypes`` accepts (not only a pandas Series) can be used, the
        # same way the flatmap operator normalizes its dtypes first.
        # NOTE(review): assumes make_dtypes returns a pandas Series for
        # dict/list input -- confirm against ..utils.make_dtypes.
        dtypes = make_dtypes(dtypes)
        return self.new_dataframe(
            [series],
            shape=(series.shape[0], len(dtypes)),
            index_value=series.index_value,
            columns_value=parse_index(dtypes.index, store_data=True),
            dtypes=dtypes,
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def series_flatjson(
    series,
    query_paths: List[str],
    dtypes=None,
    dtype=None,
    name: str = None,
) -> DataFrame:
    """
    Flatten the JSON objects of a series into a DataFrame or Series by JSON query.

    Parameters
    ----------
    series : Series
        The series of JSON strings.

    query_paths : List[str] or str
        The JSON query paths, one for each generated column. A single string
        is treated as a one-element list. The path format should follow
        [RFC9535](https://datatracker.ietf.org/doc/rfc9535/).

    dtypes : Series, default None
        Specify dtypes of returned DataFrame. Can't work with dtype.

    dtype : numpy.dtype, default None
        Specify dtype of returned Series. Can't work with dtypes.

    name : str, default None
        Specify name of the returned Series.

    Returns
    -------
    DataFrame or Series
        A DataFrame when ``dtypes`` is specified, a Series when ``dtype`` is
        specified.

    Raises
    ------
    ValueError
        If both or neither of ``dtypes`` and ``dtype`` are given, if ``dtype``
        is given with more than one query path, or if ``dtypes`` and
        ``query_paths`` differ in length.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> import pandas as pd
    >>> s = md.Series(
    ...     [
    ...         '{"age": 24, "gender": "male", "graduated": false}',
    ...         '{"age": 25, "gender": "female", "graduated": true}',
    ...     ]
    ... )
    >>> s.execute()
    0     {"age": 24, "gender": "male", "graduated": false}
    1    {"age": 25, "gender": "female", "graduated": true}
    dtype: object

    >>> df = s.mf.flatjson(
    ...     ["$.age", "$.gender", "$.graduated"],
    ...     dtypes=pd.Series(["int32", "object", "bool"], index=["age", "gender", "graduated"]),
    ... )
    >>> df.execute()
       age  gender  graduated
    0   24    male      False
    1   25  female       True

    >>> s2 = s.mf.flatjson("$.age", name="age", dtype="int32")
    >>> s2.execute()
    0    24
    1    25
    Name: age, dtype: int32
    """
    if isinstance(query_paths, str):
        query_paths = [query_paths]

    want_series = dtype is not None
    want_dataframe = dtypes is not None

    # Exactly one of dtype / dtypes has to be supplied.
    if want_series and want_dataframe:
        raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
    if not (want_series or want_dataframe):
        raise ValueError("dtypes or dtype should be specified")

    if want_series:
        if len(query_paths) != 1:
            raise ValueError("query_paths should have only one path if dtype is set")
        output_type = OutputType.series
        # The operator expects a (name, dtype) pair for series output.
        call_dtypes = (name, dtype)
    else:
        if len(dtypes) != len(query_paths):
            raise ValueError("query_paths and dtypes should have same length")
        output_type = OutputType.dataframe
        call_dtypes = dtypes

    op = SeriesFlatJSONOperator(query_paths=query_paths, _output_types=[output_type])
    return op(series, call_dtypes)
|
|
@@ -17,17 +17,17 @@ from typing import Callable
|
|
|
17
17
|
import numpy as np
|
|
18
18
|
import pandas as pd
|
|
19
19
|
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
-
from maxframe.dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
|
|
24
|
-
from maxframe.dataframe.utils import make_dtypes, parse_index
|
|
25
|
-
from maxframe.serialization.serializables import (
|
|
20
|
+
from ... import opcodes
|
|
21
|
+
from ...core import OutputType
|
|
22
|
+
from ...serialization.serializables import (
|
|
26
23
|
BoolField,
|
|
27
24
|
DictField,
|
|
28
25
|
FunctionField,
|
|
29
26
|
TupleField,
|
|
30
27
|
)
|
|
28
|
+
from ..core import DataFrame
|
|
29
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
30
|
+
from ..utils import gen_unknown_index_value, make_dtypes, parse_index
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -41,22 +41,12 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
41
41
|
def __init__(self, output_types=None, **kw):
|
|
42
42
|
super().__init__(_output_types=output_types, **kw)
|
|
43
43
|
|
|
44
|
-
@staticmethod
|
|
45
|
-
def _gen_flattening_index_value(index_value, *args) -> IndexValue:
|
|
46
|
-
pd_index = index_value.to_pandas()
|
|
47
|
-
if not isinstance(pd_index, pd.MultiIndex):
|
|
48
|
-
# for func return multi rows, will copy indexes
|
|
49
|
-
return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
|
|
50
|
-
# multi index will keep the same level and types
|
|
51
|
-
return parse_index(
|
|
52
|
-
pd.MultiIndex.from_arrays([c[:0] for c in pd_index.levels]), *args
|
|
53
|
-
)
|
|
54
|
-
|
|
55
44
|
def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
|
|
56
45
|
dtypes = make_dtypes(dtypes)
|
|
57
|
-
index_value =
|
|
46
|
+
index_value = gen_unknown_index_value(
|
|
58
47
|
df.index_value,
|
|
59
48
|
(df.key, df.index_value.key, self.func),
|
|
49
|
+
normalize_range_index=True,
|
|
60
50
|
)
|
|
61
51
|
return self.new_dataframe(
|
|
62
52
|
[df],
|
|
@@ -67,9 +57,10 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
67
57
|
)
|
|
68
58
|
|
|
69
59
|
def _call_series_or_index(self, series, dtypes=None):
|
|
70
|
-
index_value =
|
|
60
|
+
index_value = gen_unknown_index_value(
|
|
71
61
|
series.index_value,
|
|
72
62
|
(series.key, series.index_value.key, self.func),
|
|
63
|
+
normalize_range_index=True,
|
|
73
64
|
)
|
|
74
65
|
|
|
75
66
|
if self.output_types[0] == OutputType.series:
|
|
@@ -114,9 +105,6 @@ def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwa
|
|
|
114
105
|
|
|
115
106
|
Parameters
|
|
116
107
|
----------
|
|
117
|
-
dataframe : DataFrame
|
|
118
|
-
The DataFrame to which the function will be applied.
|
|
119
|
-
|
|
120
108
|
func : Callable
|
|
121
109
|
Function to apply to each row of the DataFrame. It should accept a Series (or an array if `raw=True`)
|
|
122
110
|
representing a row and return a list or iterable of values.
|
|
@@ -143,8 +131,8 @@ def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwa
|
|
|
143
131
|
|
|
144
132
|
Notes
|
|
145
133
|
-----
|
|
146
|
-
The
|
|
147
|
-
|
|
134
|
+
The ``func`` must return an iterable of values for each input row. The index of the resulting DataFrame will be
|
|
135
|
+
repeated based on the number of output rows generated by `func`.
|
|
148
136
|
|
|
149
137
|
Examples
|
|
150
138
|
--------
|
|
@@ -226,12 +214,9 @@ def series_flatmap(
|
|
|
226
214
|
|
|
227
215
|
Parameters
|
|
228
216
|
----------
|
|
229
|
-
series : Series
|
|
230
|
-
The series to which the function will be applied.
|
|
231
|
-
|
|
232
217
|
func : Callable
|
|
233
218
|
Function to apply to each element of the Series. It should accept a scalar value
|
|
234
|
-
(or an array if
|
|
219
|
+
(or an array if ``raw=True``) and return a list or iterable of values.
|
|
235
220
|
|
|
236
221
|
dtypes : Series, default None
|
|
237
222
|
Specify dtypes of returned DataFrame. Can't work with dtype.
|
|
@@ -243,10 +228,10 @@ def series_flatmap(
|
|
|
243
228
|
Specify name of the returned Series.
|
|
244
229
|
|
|
245
230
|
args : tuple
|
|
246
|
-
Positional arguments to pass to
|
|
231
|
+
Positional arguments to pass to ``func``.
|
|
247
232
|
|
|
248
233
|
**kwargs
|
|
249
|
-
Additional keyword arguments to pass as keywords arguments to
|
|
234
|
+
Additional keyword arguments to pass as keywords arguments to ``func``.
|
|
250
235
|
|
|
251
236
|
Returns
|
|
252
237
|
-------
|
|
@@ -255,9 +240,11 @@ def series_flatmap(
|
|
|
255
240
|
|
|
256
241
|
Notes
|
|
257
242
|
-----
|
|
258
|
-
The
|
|
259
|
-
`flatmap` will return a DataFrame, if
|
|
260
|
-
|
|
243
|
+
The ``func`` must return an iterable of values for each input element. If ``dtypes`` is specified,
|
|
244
|
+
`flatmap` will return a DataFrame, if ``dtype`` and ``name`` is specified, a Series will be returned.
|
|
245
|
+
|
|
246
|
+
The index of the resulting DataFrame/Series will be repeated based on the number of output rows generated
|
|
247
|
+
by ``func``.
|
|
261
248
|
|
|
262
249
|
Examples
|
|
263
250
|
--------
|
|
@@ -275,11 +262,7 @@ def series_flatmap(
|
|
|
275
262
|
>>> def generate_values_array(x):
|
|
276
263
|
... return [x * 2, x * 3]
|
|
277
264
|
|
|
278
|
-
|
|
279
|
-
... yield pd.Series([x * 2, x * 4])
|
|
280
|
-
... yield pd.Series([x * 3, x * 5])
|
|
281
|
-
|
|
282
|
-
Specify `dtype` with a function which returns list to return more than one elements as a Series:
|
|
265
|
+
Specify ``dtype`` with a function which returns list to return more elements as a Series:
|
|
283
266
|
|
|
284
267
|
>>> df['A'].mf.flatmap(generate_values_array, dtype="int", name="C").execute()
|
|
285
268
|
0 2
|
|
@@ -290,7 +273,12 @@ def series_flatmap(
|
|
|
290
273
|
2 9
|
|
291
274
|
Name: C, dtype: int64
|
|
292
275
|
|
|
293
|
-
Specify
|
|
276
|
+
Specify ``dtypes`` to return multi columns as a DataFrame:
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
>>> def generate_values_in_generator(x):
|
|
280
|
+
... yield pd.Series([x * 2, x * 4])
|
|
281
|
+
... yield pd.Series([x * 3, x * 5])
|
|
294
282
|
|
|
295
283
|
>>> df['A'].mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}).execute()
|
|
296
284
|
A B
|
|
@@ -302,7 +290,7 @@ def series_flatmap(
|
|
|
302
290
|
2 9 15
|
|
303
291
|
"""
|
|
304
292
|
|
|
305
|
-
if dtypes and dtype:
|
|
293
|
+
if dtypes is not None and dtype is not None:
|
|
306
294
|
raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
|
|
307
295
|
|
|
308
296
|
dtypes = (name, dtype) if dtype is not None else dtypes
|
|
@@ -38,7 +38,7 @@ class DataFrameReshuffle(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
38
38
|
else:
|
|
39
39
|
idx_value = df.index_value
|
|
40
40
|
if isinstance(idx_value.value, IndexValue.RangeIndex):
|
|
41
|
-
idx_value = parse_index(pd.
|
|
41
|
+
idx_value = parse_index(pd.RangeIndex(1))
|
|
42
42
|
params = df.params
|
|
43
43
|
params["index_value"] = idx_value
|
|
44
44
|
self._output_types = get_output_types(df)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import pytest
|
|
17
|
+
|
|
18
|
+
from ....udf import MarkedFunction
|
|
19
|
+
from ... import DataFrame
|
|
20
|
+
from ...core import DATAFRAME_TYPE, SERIES_TYPE
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture
def df1():
    """Three-row frame with columns a, b, c built from a column mapping."""
    column_data = {label: [1, 2, 3] for label in ("a", "b", "c")}
    return DataFrame(column_data)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pytest.fixture
def df2():
    """Three-row frame with columns a, b, c built from a list of rows."""
    rows = [[1, 2, 3] for _ in range(3)]
    return DataFrame(rows, columns=["a", "b", "c"])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.fixture
def df3():
    """Three-row frame with columns a, b, c and a two-level index named A/B."""
    rows = [[1, 2, 3] for _ in range(3)]
    multi_index = pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"])
    return DataFrame(rows, columns=["a", "b", "c"], index=multi_index)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
    """Check the output kind, index and dtypes of apply_chunk when they are not given."""
    # dataframe -> dataframe, rows filtered: a new index carrying the same level names
    filtered = df3.mf.apply_chunk(
        lambda data: data.query("A > 1"), batch_rows=2, output_type="dataframe"
    )
    assert isinstance(filtered, DATAFRAME_TYPE)
    assert df3.index_value.key != filtered.index_value.key
    src_names = df3.index_value.to_pandas().names
    assert src_names == filtered.index_value.to_pandas().names

    # dataframe -> dataframe, identity: the index value object is reused
    unchanged = df1.mf.apply_chunk(
        lambda data: data, batch_rows=2, output_type="dataframe"
    )
    assert isinstance(unchanged, DATAFRAME_TYPE)
    assert unchanged.index_value is df1.index_value

    # dataframe -> dataframe, ufunc with positional arguments
    shifted = df1.mf.apply_chunk(
        np.add, batch_rows=2, args=(2,), output_type="dataframe"
    )
    assert isinstance(shifted, DATAFRAME_TYPE)
    assert shifted.index_value is df1.index_value
    assert shifted.dtypes.equals(df1.dtypes)
    assert shifted.shape == df1.shape

    # dataframe -> series, ufunc returning a series
    reduced = df1.mf.apply_chunk(np.sum, batch_rows=2)
    assert isinstance(reduced, SERIES_TYPE)
    assert reduced.index_value is not df1.index_value

    # series -> series, identity
    same_series = df3.a.mf.apply_chunk(
        lambda data: data, batch_rows=2, output_type="series"
    )
    assert isinstance(same_series, SERIES_TYPE)
    assert df3.a.index_value is same_series.index_value

    # series -> series, reduction with explicit dtype and name
    summed = df3.a.mf.apply_chunk(
        np.sum, batch_rows=2, output_type="series", dtype=np.int64, name="sum"
    )
    assert isinstance(summed, SERIES_TYPE)
    assert isinstance(summed.index_value.to_pandas(), pd.RangeIndex)

    # general functions taking both positional and keyword arguments
    def scale(data, param, k):
        return data * param * k

    scaled = df2.mf.apply_chunk(
        scale, batch_rows=3, output_type="dataframe", args=(4,), k=1
    )
    assert scaled.index_value is df2.index_value
    assert scaled.dtypes.equals(df2.dtypes)

    # functions marked with resources / python requirements
    from ....udf import with_python_requirements, with_resources

    @with_resources("empty.txt")
    @with_python_requirements("numpy")
    def process(data, k):
        return data

    marked = df1.mf.apply_chunk(process, batch_rows=3, output_type="dataframe", k=1)
    assert marked.index_value is df1.index_value
    assert marked.dtypes.equals(df1.dtypes)
    stored_func = marked.op.func
    assert isinstance(stored_func, MarkedFunction)
    assert stored_func is not process
    assert stored_func.resources is process.resources
    assert stored_func.pythonpacks is process.pythonpacks

    # series -> dataframe, plain function is stored as-is
    def func_series_ret_series(data):
        return pd.DataFrame([data, data])

    widened = df3.a.mf.apply_chunk(
        func_series_ret_series, batch_rows=2, output_type="dataframe"
    )
    assert isinstance(widened, DATAFRAME_TYPE)
    assert widened.op.func is func_series_ret_series
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_apply_test(df1):
    """apply_chunk on a series with a positional argument yields a series."""

    def process(x, param):
        return x * param

    column = df1.a
    outcome = column.mf.apply_chunk(
        process, batch_rows=2, output_type="series", args=(5,)
    )
    assert isinstance(outcome, SERIES_TYPE)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_apply_chunk(df1):
    """Check shapes and index keys of apply_chunk when dtypes/dtype are supplied."""
    keys = [1, 2]

    def f(x, keys):
        return [1, 0] if x["a"] in keys else [0, 1]

    # dataframe -> dataframe with explicit dtypes: unknown row count, new index
    two_int_columns = pd.Series(["int64", "int64"])
    as_frame = df1[["a"]].mf.apply_chunk(
        f,
        output_type="dataframe",
        dtypes=two_int_columns,
        batch_rows=5,
        keys=keys,
    )
    assert as_frame.shape == (np.nan, 2)
    assert df1.index_value.key != as_frame.index_value.key

    # dataframe return series: the index key is carried over
    as_series = df1.mf.apply_chunk(
        lambda x: x.a,
        output_type="series",
        dtype="int64",
        batch_rows=5,
    )
    assert as_series.shape == (np.nan,)
    assert df1.index_value.key == as_series.index_value.key
    assert df1.a.index_value.key == as_series.index_value.key

    # return dataframe with given dtypes
    side_by_side = df1.a.mf.apply_chunk(
        lambda x: pd.concat([x, x], axis=1),
        output_type="dataframe",
        dtypes=pd.Series(["int64", "int64"]),
        batch_rows=5,
    )
    assert side_by_side.shape == (np.nan, 2)
    assert df1.a.index_value.key != side_by_side.index_value.key

    # return series but as dataframe
    stacked = df1.a.mf.apply_chunk(
        lambda x: pd.concat([x, x], axis=0),
        output_type="dataframe",
        dtypes={"c": np.int_},
        batch_rows=5,
    )
    assert stacked.shape == (np.nan, 1)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def test_apply_chunk_exception(df1):
    """Invalid batch_rows values are rejected with the matching exception type."""
    # (expected exception, offending batch_rows): negative value, then wrong type
    invalid_cases = [(ValueError, -1), (TypeError, object())]
    for expected_error, bad_batch_rows in invalid_cases:
        with pytest.raises(expected_error):
            df1.mf.apply_chunk(
                lambda data: data, batch_rows=bad_batch_rows, output_type="dataframe"
            )
|
|
@@ -16,8 +16,9 @@ import pandas as pd
|
|
|
16
16
|
import pytest
|
|
17
17
|
|
|
18
18
|
from .... import dataframe as md
|
|
19
|
+
from ....tests.utils import assert_mf_index_dtype
|
|
19
20
|
from ... import DataFrame
|
|
20
|
-
from ...core import IndexValue
|
|
21
|
+
from ...core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue
|
|
21
22
|
from ..reshuffle import DataFrameReshuffle
|
|
22
23
|
|
|
23
24
|
|
|
@@ -32,7 +33,7 @@ def test_reshuffle():
|
|
|
32
33
|
|
|
33
34
|
r = mdf.mf.reshuffle()
|
|
34
35
|
assert isinstance(r.op, DataFrameReshuffle)
|
|
35
|
-
|
|
36
|
+
assert_mf_index_dtype(r.index_value.value, np.int64)
|
|
36
37
|
|
|
37
38
|
r = mdf.mf.reshuffle(ignore_index=True)
|
|
38
39
|
assert isinstance(r.op, DataFrameReshuffle)
|
|
@@ -97,3 +98,46 @@ def test_flatmap(df1, df2, df3):
|
|
|
97
98
|
assert apply_s.shape == (np.nan, 2)
|
|
98
99
|
assert df3.index_value.key != apply_s.index_value.key
|
|
99
100
|
assert df3.key != apply_s.index_value.key
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_flatjson():
    """Check output metadata and argument validation of ``Series.mf.flatjson``."""
    # The fixture is a well-formed JSON document (double-quoted keys, single
    # pair of braces) so it stays usable if this test is ever executed.
    s1 = md.Series(['{"a": 1, "b": false}'], index=[1])

    # several paths with dtypes -> dataframe that keeps the input's index
    df1 = s1.mf.flatjson(
        ["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
    )
    assert df1.shape == (1, 2)
    assert df1.index_value.key == s1.index_value.key
    assert isinstance(df1, DATAFRAME_TYPE)
    assert list(df1.dtypes) == [np.dtype("int32"), np.dtype("bool")]
    assert list(df1.dtypes.index) == ["a", "b"]

    # a single path with dtypes still yields a dataframe
    df2 = s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32"], index=["a"]))
    assert df2.shape == (1, 1)
    assert df2.index_value.key == s1.index_value.key
    assert isinstance(df2, DATAFRAME_TYPE)
    assert list(df2.dtypes) == [np.dtype("int32")]
    assert list(df2.dtypes.index) == ["a"]

    # a single path with dtype -> named series
    s2 = s1.mf.flatjson("$.a", dtype="int32", name="a")
    assert s2.shape == (1,)
    assert s2.index_value.key == s1.index_value.key
    assert isinstance(s2, SERIES_TYPE)
    assert s2.dtype == np.dtype("int32")
    assert s2.name == "a"

    # query_paths and dtypes differ in length: no paths
    with pytest.raises(ValueError):
        s1.mf.flatjson([], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
    # ... fewer paths than dtypes
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
    # ... more paths than dtypes
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a", "$.b"], dtypes=pd.Series(["bool"], index=["b"]))
    # dtype only allows exactly one query path
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a", "$.b"], dtype="int32", name="a")
    # dtype and dtypes are mutually exclusive
    with pytest.raises(ValueError):
        s1.mf.flatjson(
            ["$.a"],
            dtype="int32",
            dtypes=pd.Series(["int32"], index=["a"]),
        )
    # one of dtype / dtypes is required
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a"])
|
|
@@ -55,6 +55,7 @@ def _install():
|
|
|
55
55
|
setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw))
|
|
56
56
|
setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw))
|
|
57
57
|
setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw))
|
|
58
|
+
setattr(cls, "median", lambda groupby, **kw: agg(groupby, "median", **kw))
|
|
58
59
|
|
|
59
60
|
setattr(cls, "apply", groupby_apply)
|
|
60
61
|
setattr(cls, "transform", groupby_transform)
|
|
@@ -28,7 +28,13 @@ from ...serialization.serializables import (
|
|
|
28
28
|
)
|
|
29
29
|
from ...utils import get_func_token, quiet_stdio, tokenize
|
|
30
30
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
31
|
-
from ..utils import
|
|
31
|
+
from ..utils import (
|
|
32
|
+
copy_func_scheduling_hints,
|
|
33
|
+
make_dtype,
|
|
34
|
+
make_dtypes,
|
|
35
|
+
parse_index,
|
|
36
|
+
validate_output_types,
|
|
37
|
+
)
|
|
32
38
|
|
|
33
39
|
|
|
34
40
|
class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin):
|
|
@@ -56,6 +62,8 @@ class GroupByApply(
|
|
|
56
62
|
|
|
57
63
|
def __init__(self, output_types=None, **kw):
|
|
58
64
|
super().__init__(_output_types=output_types, **kw)
|
|
65
|
+
if hasattr(self, "func"):
|
|
66
|
+
copy_func_scheduling_hints(self.func, self)
|
|
59
67
|
|
|
60
68
|
def _update_key(self):
|
|
61
69
|
values = [v for v in self._values_ if v is not self.func] + [
|
|
@@ -28,7 +28,7 @@ from ..utils import build_df, build_series, parse_index
|
|
|
28
28
|
|
|
29
29
|
cudf = lazy_import("cudf")
|
|
30
30
|
|
|
31
|
-
_GROUP_KEYS_NO_DEFAULT = pd_release_version
|
|
31
|
+
_GROUP_KEYS_NO_DEFAULT = pd_release_version[:2] == (1, 5)
|
|
32
32
|
_default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True
|
|
33
33
|
|
|
34
34
|
|
|
@@ -35,12 +35,15 @@ class GroupByFillOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
35
35
|
func_name = getattr(self, "_func_name")
|
|
36
36
|
|
|
37
37
|
if func_name == "fillna":
|
|
38
|
+
kw = {}
|
|
39
|
+
if self.axis is not None:
|
|
40
|
+
kw["axis"] = self.axis
|
|
38
41
|
result_df = mock_groupby.fillna(
|
|
39
42
|
value=self.value,
|
|
40
43
|
method=self.method,
|
|
41
|
-
axis=self.axis,
|
|
42
44
|
limit=self.limit,
|
|
43
45
|
downcast=self.downcast,
|
|
46
|
+
**kw,
|
|
44
47
|
)
|
|
45
48
|
else:
|
|
46
49
|
result_df = getattr(mock_groupby, func_name)(limit=self.limit)
|
|
@@ -88,5 +88,11 @@ def df_groupby_getitem(df_groupby, item):
|
|
|
88
88
|
if df_groupby.selection:
|
|
89
89
|
raise IndexError(f"Column(s) {df_groupby.selection!r} already selected")
|
|
90
90
|
|
|
91
|
+
if (
|
|
92
|
+
isinstance(item, tuple)
|
|
93
|
+
and item not in df_groupby.dtypes
|
|
94
|
+
and item not in df_groupby.index.names
|
|
95
|
+
):
|
|
96
|
+
item = list(item)
|
|
91
97
|
op = GroupByIndex(selection=item, output_types=output_types)
|
|
92
98
|
return op(df_groupby)
|
|
@@ -230,7 +230,7 @@ def test_groupby_transform():
|
|
|
230
230
|
assert r.op._op_type_ == opcodes.TRANSFORM
|
|
231
231
|
assert r.op.output_types[0] == OutputType.dataframe
|
|
232
232
|
|
|
233
|
-
r = mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
|
|
233
|
+
r = mdf[list("abde")].groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
|
|
234
234
|
assert r.shape == (np.nan, 6)
|
|
235
235
|
assert r.op._op_type_ == opcodes.TRANSFORM
|
|
236
236
|
assert r.op.output_types[0] == OutputType.dataframe
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import logging
|
|
16
|
+
|
|
15
17
|
import numpy as np
|
|
16
18
|
import pandas as pd
|
|
17
19
|
|
|
@@ -20,7 +22,9 @@ from ...core import OutputType
|
|
|
20
22
|
from ...serialization.serializables import AnyField, BoolField, DictField, TupleField
|
|
21
23
|
from ...utils import quiet_stdio
|
|
22
24
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
23
|
-
from ..utils import parse_index
|
|
25
|
+
from ..utils import copy_func_scheduling_hints, parse_index
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -35,6 +39,8 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
35
39
|
|
|
36
40
|
def __init__(self, output_types=None, **kw):
|
|
37
41
|
super().__init__(_output_types=output_types, **kw)
|
|
42
|
+
if hasattr(self, "func"):
|
|
43
|
+
copy_func_scheduling_hints(self.func, self)
|
|
38
44
|
|
|
39
45
|
def _infer_df_func_returns(self, in_groupby, dtypes, index):
|
|
40
46
|
index_value, output_types, new_dtypes = None, None, None
|
|
@@ -65,7 +71,7 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
65
71
|
output_types = [OutputType.series]
|
|
66
72
|
new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype)
|
|
67
73
|
except: # noqa: E722 # nosec
|
|
68
|
-
|
|
74
|
+
logger.info("Exception raised while inferring df_func", exc_info=True)
|
|
69
75
|
|
|
70
76
|
self.output_types = output_types if not self.output_types else self.output_types
|
|
71
77
|
dtypes = new_dtypes if dtypes is None else dtypes
|