maxframe 1.0.0rc4__cp39-cp39-win_amd64.whl → 1.1.1__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (88) hide show
  1. maxframe/_utils.cp39-win_amd64.pyd +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List
16
+
17
+ from ... import opcodes
18
+ from ...core import OutputType
19
+ from ...serialization.serializables import ListField
20
+ from ...serialization.serializables.field_type import FieldTypes
21
+ from ..core import DataFrame
22
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
23
+ from ..utils import make_dtypes, parse_index
24
+
25
+
26
class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator that flattens a series of JSON strings using JSON query paths.

    The kind of object produced (Series or DataFrame) is decided by the
    first entry of the operator's output types.
    """

    _op_type_ = opcodes.FLATJSON

    # JSON query paths carried by the operator
    query_paths = ListField("query_paths", field_type=FieldTypes.string, default=None)

    def __call__(self, series, dtypes):
        """
        Create the output object for ``series``.

        Parameters
        ----------
        series : Series
            Input series. Its index value is reused for the output, and its
            shape determines the output row count.

        dtypes : tuple or Series
            A ``(name, dtype)`` pair when the output type is a series,
            otherwise the dtypes of the output DataFrame whose index holds
            the column names.

        Returns
        -------
        DataFrame or Series
            Output object sharing the index value of ``series``.
        """
        wants_series = self._output_types[0] == OutputType.series
        if not wants_series:
            # one output column per dtype entry; row count follows the input
            return self.new_dataframe(
                [series],
                shape=(series.shape[0], len(dtypes)),
                index_value=series.index_value,
                columns_value=parse_index(dtypes.index, store_data=True),
                dtypes=make_dtypes(dtypes),
            )

        out_name, out_dtype = dtypes
        return self.new_series(
            [series],
            shape=series.shape,
            index_value=series.index_value,
            name=out_name,
            dtype=out_dtype,
        )
48
+
49
+
50
def series_flatjson(
    series,
    query_paths: List[str],
    dtypes=None,
    dtype=None,
    name: str = None,
) -> DataFrame:
    """
    Flatten JSON objects in the series into a DataFrame or Series according
    to JSON query paths.

    Parameters
    ----------
    series : Series
        The series of json strings.

    query_paths: List[str] or str
        The JSON query paths for each generated column. The path format should follow
        [RFC9535](https://datatracker.ietf.org/doc/rfc9535/). A single string
        is treated as a list with one path.

    dtypes : Series, default None
        Specify dtypes of returned DataFrame. Can't work with dtype.

    dtype : numpy.dtype, default None
        Specify dtype of returned Series. Can't work with dtypes.

    name : str, default None
        Specify name of the returned Series.

    Returns
    -------
    DataFrame or Series
        Result of DataFrame when dtypes specified, else Series.

    Raises
    ------
    ValueError
        If both ``dtypes`` and ``dtype`` are specified, if neither of them is
        specified, if ``dtype`` is given with more than one query path, or if
        the lengths of ``dtypes`` and ``query_paths`` differ.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> import pandas as pd
    >>> s = md.Series(
    ...     [
    ...         '{"age": 24, "gender": "male", "graduated": false}',
    ...         '{"age": 25, "gender": "female", "graduated": true}',
    ...     ]
    ... )
    >>> s.execute()
    0     {"age": 24, "gender": "male", "graduated": false}
    1    {"age": 25, "gender": "female", "graduated": true}
    dtype: object

    >>> df = s.mf.flatjson(
    ...     ["$.age", "$.gender", "$.graduated"],
    ...     dtypes=pd.Series(["int32", "object", "bool"], index=["age", "gender", "graduated"]),
    ... )
    >>> df.execute()
       age  gender  graduated
    0   24    male      False
    1   25  female       True

    >>> s2 = s.mf.flatjson("$.age", name="age", dtype="int32")
    >>> s2.execute()
    0    24
    1    25
    Name: age, dtype: int32
    """
    # a bare string is shorthand for a single query path
    if isinstance(query_paths, str):
        query_paths = [query_paths]
    if dtypes is not None and dtype is not None:
        raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
    if dtype is not None:
        # a Series output can only hold the result of one query path
        if len(query_paths) != 1:
            raise ValueError("query_paths should have only one path if dtype is set")
        output_type = OutputType.series
    elif dtypes is not None:
        # every query path must map to exactly one output column
        if len(dtypes) != len(query_paths):
            raise ValueError("query_paths and dtypes should have same length")
        output_type = OutputType.dataframe
    else:
        raise ValueError("dtypes or dtype should be specified")

    # the operator expects a (name, dtype) pair when building a Series
    dtypes = (name, dtype) if dtype is not None else dtypes
    return SeriesFlatJSONOperator(query_paths=query_paths, _output_types=[output_type])(
        series, dtypes
    )
@@ -17,17 +17,17 @@ from typing import Callable
17
17
  import numpy as np
18
18
  import pandas as pd
19
19
 
20
- from maxframe import opcodes
21
- from maxframe.core import OutputType
22
- from maxframe.dataframe.core import DataFrame, IndexValue
23
- from maxframe.dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
24
- from maxframe.dataframe.utils import make_dtypes, parse_index
25
- from maxframe.serialization.serializables import (
20
+ from ... import opcodes
21
+ from ...core import OutputType
22
+ from ...serialization.serializables import (
26
23
  BoolField,
27
24
  DictField,
28
25
  FunctionField,
29
26
  TupleField,
30
27
  )
28
+ from ..core import DataFrame
29
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
30
+ from ..utils import gen_unknown_index_value, make_dtypes, parse_index
31
31
 
32
32
 
33
33
  class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
@@ -41,22 +41,12 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
41
41
  def __init__(self, output_types=None, **kw):
42
42
  super().__init__(_output_types=output_types, **kw)
43
43
 
44
- @staticmethod
45
- def _gen_flattening_index_value(index_value, *args) -> IndexValue:
46
- pd_index = index_value.to_pandas()
47
- if not isinstance(pd_index, pd.MultiIndex):
48
- # for func return multi rows, will copy indexes
49
- return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
50
- # multi index will keep the same level and types
51
- return parse_index(
52
- pd.MultiIndex.from_arrays([c[:0] for c in pd_index.levels]), *args
53
- )
54
-
55
44
  def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
56
45
  dtypes = make_dtypes(dtypes)
57
- index_value = self._gen_flattening_index_value(
46
+ index_value = gen_unknown_index_value(
58
47
  df.index_value,
59
48
  (df.key, df.index_value.key, self.func),
49
+ normalize_range_index=True,
60
50
  )
61
51
  return self.new_dataframe(
62
52
  [df],
@@ -67,9 +57,10 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
67
57
  )
68
58
 
69
59
  def _call_series_or_index(self, series, dtypes=None):
70
- index_value = self._gen_flattening_index_value(
60
+ index_value = gen_unknown_index_value(
71
61
  series.index_value,
72
62
  (series.key, series.index_value.key, self.func),
63
+ normalize_range_index=True,
73
64
  )
74
65
 
75
66
  if self.output_types[0] == OutputType.series:
@@ -114,9 +105,6 @@ def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwa
114
105
 
115
106
  Parameters
116
107
  ----------
117
- dataframe : DataFrame
118
- The DataFrame to which the function will be applied.
119
-
120
108
  func : Callable
121
109
  Function to apply to each row of the DataFrame. It should accept a Series (or an array if `raw=True`)
122
110
  representing a row and return a list or iterable of values.
@@ -143,8 +131,8 @@ def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwa
143
131
 
144
132
  Notes
145
133
  -----
146
- The `func` must return an iterable of values for each input row. The index of the resulting DataFrame will be
147
- repeated based on the number of output rows generated by `func`.
134
+ The ``func`` must return an iterable of values for each input row. The index of the resulting DataFrame will be
135
+ repeated based on the number of output rows generated by ``func``.
148
136
 
149
137
  Examples
150
138
  --------
@@ -226,12 +214,9 @@ def series_flatmap(
226
214
 
227
215
  Parameters
228
216
  ----------
229
- series : Series
230
- The series to which the function will be applied.
231
-
232
217
  func : Callable
233
218
  Function to apply to each element of the Series. It should accept a scalar value
234
- (or an array if `raw=True`) and return a list or iterable of values.
219
+ (or an array if ``raw=True``) and return a list or iterable of values.
235
220
 
236
221
  dtypes : Series, default None
237
222
  Specify dtypes of returned DataFrame. Can't work with dtype.
@@ -243,10 +228,10 @@ def series_flatmap(
243
228
  Specify name of the returned Series.
244
229
 
245
230
  args : tuple
246
- Positional arguments to pass to `func`.
231
+ Positional arguments to pass to ``func``.
247
232
 
248
233
  **kwargs
249
- Additional keyword arguments to pass as keywords arguments to `func`.
234
+ Additional keyword arguments to pass to ``func``.
250
235
 
251
236
  Returns
252
237
  -------
@@ -255,9 +240,11 @@ def series_flatmap(
255
240
 
256
241
  Notes
257
242
  -----
258
- The `func` must return an iterable of values for each input element. If `dtypes` is specified,
259
- `flatmap` will return a DataFrame, if `dtype` and `name` is specified, a Series will be returned. The index of
260
- the resulting DataFrame/Series will be repeated based on the number of output rows generated by `func`.
243
+ The ``func`` must return an iterable of values for each input element. If ``dtypes`` is specified,
244
+ `flatmap` will return a DataFrame, if ``dtype`` and ``name`` is specified, a Series will be returned.
245
+
246
+ The index of the resulting DataFrame/Series will be repeated based on the number of output rows generated
247
+ by ``func``.
261
248
 
262
249
  Examples
263
250
  --------
@@ -275,11 +262,7 @@ def series_flatmap(
275
262
  >>> def generate_values_array(x):
276
263
  ... return [x * 2, x * 3]
277
264
 
278
- >>> def generate_values_in_generator(x):
279
- ... yield pd.Series([x * 2, x * 4])
280
- ... yield pd.Series([x * 3, x * 5])
281
-
282
- Specify `dtype` with a function which returns list to return more than one elements as a Series:
265
+ Specify ``dtype`` with a function which returns list to return more elements as a Series:
283
266
 
284
267
  >>> df['A'].mf.flatmap(generate_values_array, dtype="int", name="C").execute()
285
268
  0 2
@@ -290,7 +273,12 @@ def series_flatmap(
290
273
  2 9
291
274
  Name: C, dtype: int64
292
275
 
293
- Specify `dtypes` to return multi columns as a DataFrame:
276
+ Specify ``dtypes`` to return multi columns as a DataFrame:
277
+
278
+
279
+ >>> def generate_values_in_generator(x):
280
+ ... yield pd.Series([x * 2, x * 4])
281
+ ... yield pd.Series([x * 3, x * 5])
294
282
 
295
283
  >>> df['A'].mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}).execute()
296
284
  A B
@@ -302,7 +290,7 @@ def series_flatmap(
302
290
  2 9 15
303
291
  """
304
292
 
305
- if dtypes and dtype:
293
+ if dtypes is not None and dtype is not None:
306
294
  raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
307
295
 
308
296
  dtypes = (name, dtype) if dtype is not None else dtypes
@@ -38,7 +38,7 @@ class DataFrameReshuffle(DataFrameOperator, DataFrameOperatorMixin):
38
38
  else:
39
39
  idx_value = df.index_value
40
40
  if isinstance(idx_value.value, IndexValue.RangeIndex):
41
- idx_value = parse_index(pd.Int64Index([0]))
41
+ idx_value = parse_index(pd.RangeIndex(1))
42
42
  params = df.params
43
43
  params["index_value"] = idx_value
44
44
  self._output_types = get_output_types(df)
@@ -0,0 +1,186 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pytest
17
+
18
+ from ....udf import MarkedFunction
19
+ from ... import DataFrame
20
+ from ...core import DATAFRAME_TYPE, SERIES_TYPE
21
+
22
+
23
@pytest.fixture
def df1():
    """DataFrame with columns ``a``, ``b`` and ``c``, each holding 1, 2, 3."""
    column_data = {column: [1, 2, 3] for column in ("a", "b", "c")}
    return DataFrame(column_data)
26
+
27
+
28
@pytest.fixture
def df2():
    """DataFrame built from three identical rows ``[1, 2, 3]`` with columns a, b, c."""
    rows = [[1, 2, 3] for _ in range(3)]
    return DataFrame(rows, columns=["a", "b", "c"])
31
+
32
+
33
@pytest.fixture
def df3():
    """DataFrame of three identical rows indexed by a two-level MultiIndex (A, B)."""
    multi_index = pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"])
    rows = [[1, 2, 3] for _ in range(3)]
    return DataFrame(rows, columns=["a", "b", "c"], index=multi_index)
40
+
41
+
42
def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
    """Check output type, dtypes and index metadata when they are not all given."""
    # dataframe -> dataframe filter: index key changes, level names are kept
    filtered = df3.mf.apply_chunk(
        lambda data: data.query("A > 1"), batch_rows=2, output_type="dataframe"
    )
    assert isinstance(filtered, DATAFRAME_TYPE)
    assert df3.index_value.key != filtered.index_value.key
    assert df3.index_value.to_pandas().names == filtered.index_value.to_pandas().names

    # dataframe -> dataframe keep same: the very same index value is reused
    unchanged = df1.mf.apply_chunk(
        lambda data: data, batch_rows=2, output_type="dataframe"
    )
    assert isinstance(unchanged, DATAFRAME_TYPE)
    assert unchanged.index_value is df1.index_value

    # dataframe -> dataframe ufunc with arguments
    added = df1.mf.apply_chunk(
        np.add, batch_rows=2, args=(2,), output_type="dataframe"
    )
    assert isinstance(added, DATAFRAME_TYPE)
    assert added.index_value is df1.index_value
    assert added.dtypes.equals(df1.dtypes)
    assert added.shape == df1.shape

    # dataframe -> series ufunc return series
    summed = df1.mf.apply_chunk(np.sum, batch_rows=2)
    assert isinstance(summed, SERIES_TYPE)
    assert summed.index_value is not df1.index_value

    # series -> series
    same_series = df3.a.mf.apply_chunk(
        lambda data: data, batch_rows=2, output_type="series"
    )
    assert isinstance(same_series, SERIES_TYPE)
    assert df3.a.index_value is same_series.index_value

    summed_series = df3.a.mf.apply_chunk(
        np.sum, batch_rows=2, output_type="series", dtype=np.int64, name="sum"
    )
    assert isinstance(summed_series, SERIES_TYPE)
    assert isinstance(summed_series.index_value.to_pandas(), pd.RangeIndex)

    # general functions
    def process(data, param, k):
        return data * param * k

    scaled = df2.mf.apply_chunk(
        process, batch_rows=3, output_type="dataframe", args=(4,), k=1
    )
    assert scaled.index_value is df2.index_value
    assert scaled.dtypes.equals(df2.dtypes)

    # mark functions
    from ....udf import with_python_requirements, with_resources

    @with_resources("empty.txt")
    @with_python_requirements("numpy")
    def process(data, k):
        return data

    marked = df1.mf.apply_chunk(process, batch_rows=3, output_type="dataframe", k=1)
    assert marked.index_value is df1.index_value
    assert marked.dtypes.equals(df1.dtypes)
    # the operator holds a distinct MarkedFunction sharing the same hints
    assert isinstance(marked.op.func, MarkedFunction)
    assert marked.op.func is not process
    assert marked.op.func.resources is process.resources
    assert marked.op.func.pythonpacks is process.pythonpacks

    def func_series_ret_series(data):
        return pd.DataFrame([data, data])

    stacked = df3.a.mf.apply_chunk(
        func_series_ret_series, batch_rows=2, output_type="dataframe"
    )
    assert isinstance(stacked, DATAFRAME_TYPE)
    assert stacked.op.func is func_series_ret_series
117
+
118
+
119
def test_apply_test(df1):
    """apply_chunk on a series with positional args yields a series object."""

    def process(x, param):
        return x * param

    series_result = df1.a.mf.apply_chunk(
        process, batch_rows=2, output_type="series", args=(5,)
    )
    assert isinstance(series_result, SERIES_TYPE)
127
+
128
+
129
def test_apply_chunk(df1):
    """Check shapes and index keys when dtypes / dtype are given explicitly."""
    keys = [1, 2]

    def f(x, keys):
        return [1, 0] if x["a"] in keys else [0, 1]

    two_int_columns = pd.Series(["int64", "int64"])
    encoded = df1[["a"]].mf.apply_chunk(
        f,
        output_type="dataframe",
        dtypes=two_int_columns,
        batch_rows=5,
        keys=keys,
    )
    assert encoded.shape == (np.nan, 2)
    assert df1.index_value.key != encoded.index_value.key

    # dataframe return series
    picked = df1.mf.apply_chunk(
        lambda x: x.a,
        output_type="series",
        dtype="int64",
        batch_rows=5,
    )
    assert picked.shape == (np.nan,)
    assert df1.index_value.key == picked.index_value.key
    assert df1.a.index_value.key == picked.index_value.key

    # return dataframe with given dtypes
    widened = df1.a.mf.apply_chunk(
        lambda x: pd.concat([x, x], axis=1),
        output_type="dataframe",
        dtypes=pd.Series(["int64", "int64"]),
        batch_rows=5,
    )
    assert widened.shape == (np.nan, 2)
    assert df1.a.index_value.key != widened.index_value.key

    # return series but as dataframe
    lengthened = df1.a.mf.apply_chunk(
        lambda x: pd.concat([x, x], axis=0),
        output_type="dataframe",
        dtypes={"c": np.int_},
        batch_rows=5,
    )
    assert lengthened.shape == (np.nan, 1)
177
+
178
+
179
def test_apply_chunk_exception(df1):
    """Invalid ``batch_rows`` values are rejected with the expected errors."""
    # (expected exception, offending batch_rows value)
    invalid_cases = [
        (ValueError, -1),
        (TypeError, object()),
    ]
    for expected_error, bad_batch_rows in invalid_cases:
        with pytest.raises(expected_error):
            df1.mf.apply_chunk(
                lambda data: data, batch_rows=bad_batch_rows, output_type="dataframe"
            )
@@ -16,8 +16,9 @@ import pandas as pd
16
16
  import pytest
17
17
 
18
18
  from .... import dataframe as md
19
+ from ....tests.utils import assert_mf_index_dtype
19
20
  from ... import DataFrame
20
- from ...core import IndexValue
21
+ from ...core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue
21
22
  from ..reshuffle import DataFrameReshuffle
22
23
 
23
24
 
@@ -32,7 +33,7 @@ def test_reshuffle():
32
33
 
33
34
  r = mdf.mf.reshuffle()
34
35
  assert isinstance(r.op, DataFrameReshuffle)
35
- assert isinstance(r.index_value.value, IndexValue.Int64Index)
36
+ assert_mf_index_dtype(r.index_value.value, np.int64)
36
37
 
37
38
  r = mdf.mf.reshuffle(ignore_index=True)
38
39
  assert isinstance(r.op, DataFrameReshuffle)
@@ -97,3 +98,46 @@ def test_flatmap(df1, df2, df3):
97
98
  assert apply_s.shape == (np.nan, 2)
98
99
  assert df3.index_value.key != apply_s.index_value.key
99
100
  assert df3.key != apply_s.index_value.key
101
+
102
+
103
def test_flatjson():
    """Check output metadata and argument validation of ``Series.mf.flatjson``."""
    # only metadata is inspected here, but keep the sample a valid JSON document
    s1 = md.Series(['{"a": 1, "b": false}'], index=[1])

    # several query paths with dtypes -> DataFrame sharing the input index
    df1 = s1.mf.flatjson(
        ["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
    )
    assert df1.shape == (1, 2)
    assert df1.index_value.key == s1.index_value.key
    assert isinstance(df1, DATAFRAME_TYPE)
    assert list(df1.dtypes) == [np.dtype("int32"), np.dtype("bool")]
    assert list(df1.dtypes.index) == ["a", "b"]

    # a single query path with dtypes still produces a DataFrame
    df2 = s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32"], index=["a"]))
    assert df2.shape == (1, 1)
    assert df2.index_value.key == s1.index_value.key
    assert isinstance(df2, DATAFRAME_TYPE)
    assert list(df2.dtypes) == [np.dtype("int32")]
    assert list(df2.dtypes.index) == ["a"]

    # dtype + name -> Series
    s2 = s1.mf.flatjson("$.a", dtype="int32", name="a")
    assert s2.shape == (1,)
    assert s2.index_value.key == s1.index_value.key
    assert isinstance(s2, SERIES_TYPE)
    assert s2.dtype == np.dtype("int32")
    assert s2.name == "a"

    # query_paths / dtypes length mismatches
    with pytest.raises(ValueError):
        s1.mf.flatjson([], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a", "$.b"], dtypes=pd.Series(["bool"], index=["b"]))
    # dtype only supports a single query path
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a", "$.b"], dtype="int32", name="a")
    # dtype and dtypes are mutually exclusive
    with pytest.raises(ValueError):
        s1.mf.flatjson(
            ["$.a"],
            dtype="int32",
            dtypes=pd.Series(["int32"], index=["a"]),
        )
    # one of dtype / dtypes is required
    with pytest.raises(ValueError):
        s1.mf.flatjson(["$.a"])
@@ -55,6 +55,7 @@ def _install():
55
55
  setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw))
56
56
  setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw))
57
57
  setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw))
58
+ setattr(cls, "median", lambda groupby, **kw: agg(groupby, "median", **kw))
58
59
 
59
60
  setattr(cls, "apply", groupby_apply)
60
61
  setattr(cls, "transform", groupby_transform)
@@ -79,6 +79,7 @@ _agg_functions = {
79
79
  "kurt": lambda x, bias=False: x.kurt(bias=bias),
80
80
  "kurtosis": lambda x, bias=False: x.kurtosis(bias=bias),
81
81
  "nunique": lambda x: x.nunique(),
82
+ "median": lambda x: x.median(),
82
83
  }
83
84
  _series_col_name = "col_name"
84
85
 
@@ -28,7 +28,13 @@ from ...serialization.serializables import (
28
28
  )
29
29
  from ...utils import get_func_token, quiet_stdio, tokenize
30
30
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
31
- from ..utils import make_dtype, make_dtypes, parse_index, validate_output_types
31
+ from ..utils import (
32
+ copy_func_scheduling_hints,
33
+ make_dtype,
34
+ make_dtypes,
35
+ parse_index,
36
+ validate_output_types,
37
+ )
32
38
 
33
39
 
34
40
  class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin):
@@ -56,6 +62,8 @@ class GroupByApply(
56
62
 
57
63
  def __init__(self, output_types=None, **kw):
58
64
  super().__init__(_output_types=output_types, **kw)
65
+ if hasattr(self, "func"):
66
+ copy_func_scheduling_hints(self.func, self)
59
67
 
60
68
  def _update_key(self):
61
69
  values = [v for v in self._values_ if v is not self.func] + [
@@ -28,7 +28,7 @@ from ..utils import build_df, build_series, parse_index
28
28
 
29
29
  cudf = lazy_import("cudf")
30
30
 
31
- _GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
31
+ _GROUP_KEYS_NO_DEFAULT = pd_release_version[:2] == (1, 5)
32
32
  _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True
33
33
 
34
34
 
@@ -35,12 +35,15 @@ class GroupByFillOperator(DataFrameOperator, DataFrameOperatorMixin):
35
35
  func_name = getattr(self, "_func_name")
36
36
 
37
37
  if func_name == "fillna":
38
+ kw = {}
39
+ if self.axis is not None:
40
+ kw["axis"] = self.axis
38
41
  result_df = mock_groupby.fillna(
39
42
  value=self.value,
40
43
  method=self.method,
41
- axis=self.axis,
42
44
  limit=self.limit,
43
45
  downcast=self.downcast,
46
+ **kw,
44
47
  )
45
48
  else:
46
49
  result_df = getattr(mock_groupby, func_name)(limit=self.limit)
@@ -88,5 +88,11 @@ def df_groupby_getitem(df_groupby, item):
88
88
  if df_groupby.selection:
89
89
  raise IndexError(f"Column(s) {df_groupby.selection!r} already selected")
90
90
 
91
+ if (
92
+ isinstance(item, tuple)
93
+ and item not in df_groupby.dtypes
94
+ and item not in df_groupby.index.names
95
+ ):
96
+ item = list(item)
91
97
  op = GroupByIndex(selection=item, output_types=output_types)
92
98
  return op(df_groupby)
@@ -230,7 +230,7 @@ def test_groupby_transform():
230
230
  assert r.op._op_type_ == opcodes.TRANSFORM
231
231
  assert r.op.output_types[0] == OutputType.dataframe
232
232
 
233
- r = mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
233
+ r = mdf[list("abde")].groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
234
234
  assert r.shape == (np.nan, 6)
235
235
  assert r.op._op_type_ == opcodes.TRANSFORM
236
236
  assert r.op.output_types[0] == OutputType.dataframe
@@ -12,6 +12,8 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import logging
16
+
15
17
  import numpy as np
16
18
  import pandas as pd
17
19
 
@@ -20,7 +22,9 @@ from ...core import OutputType
20
22
  from ...serialization.serializables import AnyField, BoolField, DictField, TupleField
21
23
  from ...utils import quiet_stdio
22
24
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
23
- from ..utils import parse_index
25
+ from ..utils import copy_func_scheduling_hints, parse_index
26
+
27
+ logger = logging.getLogger(__name__)
24
28
 
25
29
 
26
30
  class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
@@ -35,6 +39,8 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
35
39
 
36
40
  def __init__(self, output_types=None, **kw):
37
41
  super().__init__(_output_types=output_types, **kw)
42
+ if hasattr(self, "func"):
43
+ copy_func_scheduling_hints(self.func, self)
38
44
 
39
45
  def _infer_df_func_returns(self, in_groupby, dtypes, index):
40
46
  index_value, output_types, new_dtypes = None, None, None
@@ -65,7 +71,7 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
65
71
  output_types = [OutputType.series]
66
72
  new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype)
67
73
  except: # noqa: E722 # nosec
68
- pass
74
+ logger.info("Exception raised while inferring df_func", exc_info=True)
69
75
 
70
76
  self.output_types = output_types if not self.output_types else self.output_types
71
77
  dtypes = new_dtypes if dtypes is None else dtypes