maxframe 0.1.0b5-cp311-cp311-win32.whl → 1.0.0-cp311-cp311-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (203)
  1. maxframe/_utils.cp311-win32.pyd +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cp311-win32.pyd +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cp311-win32.pyd +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cp311-win32.pyd +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import List
+
+ from ... import opcodes
+ from ...core import OutputType
+ from ...serialization.serializables import ListField
+ from ...serialization.serializables.field_type import FieldTypes
+ from ..core import DataFrame
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
+ from ..utils import make_dtypes, parse_index
+
+
+ class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
+     _op_type_ = opcodes.FLATJSON
+
+     query_paths = ListField("query_paths", field_type=FieldTypes.string, default=None)
+
+     def __call__(self, series, dtypes):
+         if self._output_types[0] == OutputType.series:
+             name, dtype = dtypes
+             return self.new_series(
+                 [series],
+                 shape=series.shape,
+                 index_value=series.index_value,
+                 name=name,
+                 dtype=dtype,
+             )
+         return self.new_dataframe(
+             [series],
+             shape=(series.shape[0], len(dtypes)),
+             index_value=series.index_value,
+             columns_value=parse_index(dtypes.index, store_data=True),
+             dtypes=make_dtypes(dtypes),
+         )
+
+
+ def series_flatjson(
+     series,
+     query_paths: List[str],
+     dtypes=None,
+     dtype=None,
+     name: str = None,
+ ) -> DataFrame:
+     """
+     Flatten JSON objects in the series into a DataFrame according to JSON queries.
+
+     Parameters
+     ----------
+     series : Series
+         The series of json strings.
+
+     query_paths : List[str] or str
+         The JSON query paths for each generated column. The path format should follow
+         [RFC9535](https://datatracker.ietf.org/doc/rfc9535/).
+
+     dtypes : Series, default None
+         Specify dtypes of returned DataFrame. Can't work with dtype.
+
+     dtype : numpy.dtype, default None
+         Specify dtype of returned Series. Can't work with dtypes.
+
+     name : str, default None
+         Specify name of the returned Series.
+
+     Returns
+     -------
+     DataFrame or Series
+         A DataFrame when dtypes is specified, otherwise a Series.
+
+     Examples
+     --------
+     >>> import maxframe.dataframe as md
+     >>> import pandas as pd
+     >>> s = md.Series(
+     ...     [
+     ...         '{"age": 24, "gender": "male", "graduated": false}',
+     ...         '{"age": 25, "gender": "female", "graduated": true}',
+     ...     ]
+     ... )
+     >>> s.execute()
+     0    {"age": 24, "gender": "male", "graduated": false}
+     1    {"age": 25, "gender": "female", "graduated": true}
+     dtype: object
+
+     >>> df = s.mf.flatjson(
+     ...     ["$.age", "$.gender", "$.graduated"],
+     ...     dtypes=pd.Series(["int32", "object", "bool"], index=["age", "gender", "graduated"]),
+     ... )
+     >>> df.execute()
+        age  gender  graduated
+     0   24    male      False
+     1   25  female       True
+
+     >>> s2 = s.mf.flatjson("$.age", name="age", dtype="int32")
+     >>> s2.execute()
+     0    24
+     1    25
+     Name: age, dtype: int32
+     """
+     if isinstance(query_paths, str):
+         query_paths = [query_paths]
+     if dtypes is not None and dtype is not None:
+         raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
+     if dtype is not None:
+         if len(query_paths) != 1:
+             raise ValueError("query_paths should have only one path if dtype is set")
+         output_type = OutputType.series
+     elif dtypes is not None:
+         if len(dtypes) != len(query_paths):
+             raise ValueError("query_paths and dtypes should have same length")
+         output_type = OutputType.dataframe
+     else:
+         raise ValueError("dtypes or dtype should be specified")
+
+     dtypes = (name, dtype) if dtype is not None else dtypes
+     return SeriesFlatJSONOperator(query_paths=query_paths, _output_types=[output_type])(
+         series, dtypes
+     )
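
For reference, a minimal pandas-only sketch of the flatjson semantics documented above. It handles only trivial `$.field` paths (the real API accepts full RFC 9535 query paths), and the helper name `flatjson_local` is hypothetical, not part of maxframe.

    import json

    import pandas as pd

    def flatjson_local(s: pd.Series, query_paths, dtypes: pd.Series) -> pd.DataFrame:
        # Only bare "$.field" paths are handled in this sketch.
        fields = [p.removeprefix("$.") for p in query_paths]
        records = [json.loads(v) for v in s]
        data = {
            name: [record.get(field) for record in records]
            for name, field in zip(dtypes.index, fields)
        }
        # Keep the input index and cast each column to the requested dtype.
        return pd.DataFrame(data, index=s.index).astype(dtypes.to_dict())

    s = pd.Series(
        [
            '{"age": 24, "gender": "male", "graduated": false}',
            '{"age": 25, "gender": "female", "graduated": true}',
        ]
    )
    dtypes = pd.Series(["int32", "bool"], index=["age", "graduated"])
    print(flatjson_local(s, ["$.age", "$.graduated"], dtypes))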
@@ -0,0 +1,317 @@
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Callable
+
+ import numpy as np
+ import pandas as pd
+
+ from ... import opcodes
+ from ...core import OutputType
+ from ...serialization.serializables import (
+     BoolField,
+     DictField,
+     FunctionField,
+     TupleField,
+ )
+ from ..core import DataFrame
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
+ from ..utils import gen_unknown_index_value, make_dtypes, parse_index
+
+
+ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
+     _op_type_ = opcodes.FLATMAP
+
+     func = FunctionField("func")
+     raw = BoolField("raw", default=False)
+     args = TupleField("args", default=())
+     kwargs = DictField("kwargs", default={})
+
+     def __init__(self, output_types=None, **kw):
+         super().__init__(_output_types=output_types, **kw)
+
+     def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
+         dtypes = make_dtypes(dtypes)
+         index_value = gen_unknown_index_value(
+             df.index_value,
+             (df.key, df.index_value.key, self.func),
+             normalize_range_index=True,
+         )
+         return self.new_dataframe(
+             [df],
+             shape=(np.nan, len(dtypes)),
+             index_value=index_value,
+             columns_value=parse_index(dtypes.index, store_data=True),
+             dtypes=dtypes,
+         )
+
+     def _call_series_or_index(self, series, dtypes=None):
+         index_value = gen_unknown_index_value(
+             series.index_value,
+             (series.key, series.index_value.key, self.func),
+             normalize_range_index=True,
+         )
+
+         if self.output_types[0] == OutputType.series:
+             name, dtype = dtypes
+             return self.new_series(
+                 [series],
+                 dtype=dtype,
+                 shape=(np.nan,),
+                 index_value=index_value,
+                 name=name,
+             )
+
+         dtypes = make_dtypes(dtypes)
+         columns_value = parse_index(dtypes.index, store_data=True)
+         return self.new_dataframe(
+             [series],
+             shape=(np.nan, len(dtypes)),
+             index_value=index_value,
+             columns_value=columns_value,
+             dtypes=dtypes,
+         )
+
+     def __call__(
+         self,
+         df_or_series,
+         dtypes=None,
+         output_type=None,
+     ):
+         if df_or_series.op.output_types[0] == OutputType.dataframe:
+             return self._call_dataframe(df_or_series, dtypes=dtypes)
+         else:
+             return self._call_series_or_index(df_or_series, dtypes=dtypes)
+
+
+ def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwargs):
+     """
+     Apply the given function to each row and then flatten results. Use this method if your transformation returns
+     multiple rows for each input row.
+
+     This function applies a transformation to each row of the DataFrame, where the transformation can return zero
+     or multiple values, effectively flattening Python generators, list-like collections, and DataFrames.
+
+     Parameters
+     ----------
+     dataframe : DataFrame
+         The DataFrame to which the function will be applied.
+
+     func : Callable
+         Function to apply to each row of the DataFrame. It should accept a Series (or an array if `raw=True`)
+         representing a row and return a list or iterable of values.
+
+     dtypes : Series, dict or list
+         Specify dtypes of returned DataFrame.
+
+     raw : bool, default False
+         Determines if the row is passed as a Series or as a numpy array:
+
+         * ``False`` : passes each row as a Series to the function.
+         * ``True`` : the passed function will receive numpy array objects instead.
+
+     args : tuple
+         Positional arguments to pass to `func`.
+
+     **kwargs
+         Additional keyword arguments to pass as keyword arguments to `func`.
+
+     Returns
+     -------
+     DataFrame
+         A DataFrame with the specified `dtypes`.
+
+     Notes
+     -----
+     The `func` must return an iterable of values for each input row. The index of the resulting DataFrame will be
+     repeated based on the number of output rows generated by `func`.
+
+     Examples
+     --------
+     >>> import numpy as np
+     >>> import maxframe.dataframe as md
+     >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+     >>> df.execute()
+        A  B
+     0  1  4
+     1  2  5
+     2  3  6
+
+     Define a function that takes a number and returns a list of two numbers:
+
+     >>> def generate_values_array(row):
+     ...     return [row['A'] * 2, row['B'] * 3]
+
+     Define a function that takes a row and returns two rows with two columns:
+
+     >>> def generate_values_in_generator(row):
+     ...     yield [row[0] * 2, row[1] * 4]
+     ...     yield [row[0] * 3, row[1] * 5]
+
+     Which is equivalent to the following function returning a DataFrame:
+
+     >>> def generate_values_in_dataframe(row):
+     ...     return pd.DataFrame([[row[0] * 2, row[1] * 4], [row[0] * 3, row[1] * 5]])
+
+     Specify `dtypes` so that the flattened result is returned as a DataFrame:
+
+     >>> df.mf.flatmap(generate_values_array, dtypes=pd.Series({'A': 'int'})).execute()
+         A
+     0   2
+     0  12
+     1   4
+     1  15
+     2   6
+     2  18
+
+     Specify ``raw=True`` to pass each input row as an array:
+
+     >>> df.mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}, raw=True).execute()
+        A   B
+     0  2  16
+     0  3  20
+     1  4  20
+     1  6  25
+     2  6  24
+     2  9  30
+     """
+     if dtypes is None or len(dtypes) == 0:
+         raise TypeError(
+             "Cannot determine {dtypes} by calculating with enumerate data, "
+             "please specify it as arguments"
+         )
+
+     if not isinstance(func, Callable):
+         raise TypeError("function must be a callable object")
+
+     output_types = [OutputType.dataframe]
+     op = DataFrameFlatMapOperator(
+         func=func, raw=raw, output_types=output_types, args=args, kwargs=kwargs
+     )
+     return op(
+         dataframe,
+         dtypes=dtypes,
+     )
+
+
+ def series_flatmap(
+     series, func: Callable, dtypes=None, dtype=None, name=None, args=(), **kwargs
+ ):
+     """
+     Apply the given function to each row and then flatten results. Use this method if your transformation returns
+     multiple rows for each input row.
+
+     This function applies a transformation to each element of the Series, where the transformation can return zero
+     or multiple values, effectively flattening Python generators, list-like collections, and DataFrames.
+
+     Parameters
+     ----------
+     series : Series
+         The series to which the function will be applied.
+
+     func : Callable
+         Function to apply to each element of the Series. It should accept a scalar value
+         and return a list or iterable of values.
+
+     dtypes : Series, default None
+         Specify dtypes of returned DataFrame. Can't work with dtype.
+
+     dtype : numpy.dtype, default None
+         Specify dtype of returned Series. Can't work with dtypes.
+
+     name : str, default None
+         Specify name of the returned Series.
+
+     args : tuple
+         Positional arguments to pass to `func`.
+
+     **kwargs
+         Additional keyword arguments to pass as keyword arguments to `func`.
+
+     Returns
+     -------
+     DataFrame or Series
+         A DataFrame when dtypes is specified, otherwise a Series.
+
+     Notes
+     -----
+     The `func` must return an iterable of values for each input element. If `dtypes` is specified,
+     `flatmap` will return a DataFrame; if `dtype` and `name` are specified, a Series will be returned. The index of
+     the resulting DataFrame/Series will be repeated based on the number of output rows generated by `func`.
+
+     Examples
+     --------
+     >>> import numpy as np
+     >>> import maxframe.dataframe as md
+     >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+     >>> df.execute()
+        A  B
+     0  1  4
+     1  2  5
+     2  3  6
+
+     Define a function that takes a number and returns a list of two numbers:
+
+     >>> def generate_values_array(x):
+     ...     return [x * 2, x * 3]
+
+     >>> def generate_values_in_generator(x):
+     ...     yield pd.Series([x * 2, x * 4])
+     ...     yield pd.Series([x * 3, x * 5])
+
+     Specify `dtype` with a function that returns a list, producing multiple elements as a Series:
+
+     >>> df['A'].mf.flatmap(generate_values_array, dtype="int", name="C").execute()
+     0    2
+     0    3
+     1    4
+     1    6
+     2    6
+     2    9
+     Name: C, dtype: int64
+
+     Specify `dtypes` to return multiple columns as a DataFrame:
+
+     >>> df['A'].mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}).execute()
+        A   B
+     0  2   4
+     0  3   5
+     1  4   8
+     1  6  10
+     2  6  12
+     2  9  15
+     """
+
+     if dtypes is not None and dtype is not None:
+         raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
+
+     dtypes = (name, dtype) if dtype is not None else dtypes
+     if dtypes is None:
+         raise TypeError(
+             "Cannot determine {dtypes} or {dtype} by calculating with enumerate data, "
+             "please specify it as arguments"
+         )
+
+     if not isinstance(func, Callable):
+         raise TypeError("function must be a callable object")
+
+     output_type = OutputType.series if dtype is not None else OutputType.dataframe
+
+     op = DataFrameFlatMapOperator(
+         func=func, raw=False, output_types=[output_type], args=args, kwargs=kwargs
+     )
+     return op(
+         series,
+         dtypes=dtypes,
+     )
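
Similarly, a minimal pandas-only sketch of the flatmap behaviour documented above: `func` may yield zero or more row-like values per input row, and the input index label is repeated for every emitted row. The helper name `flatmap_local` is hypothetical, not maxframe's implementation.

    import pandas as pd

    def flatmap_local(df: pd.DataFrame, func, dtypes: dict) -> pd.DataFrame:
        # Collect every value yielded for a row and repeat that row's index label.
        index, rows = [], []
        for label, row in df.iterrows():
            for out_row in func(row):
                index.append(label)
                rows.append(list(out_row))
        return pd.DataFrame(rows, columns=list(dtypes), index=index).astype(dtypes)

    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    def two_rows(row):
        # Two output rows per input row, mirroring the docstring example.
        yield [row["A"] * 2, row["B"] * 4]
        yield [row["A"] * 3, row["B"] * 5]

    print(flatmap_local(df, two_rows, {"A": "int64", "B": "int64"}))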
@@ -38,7 +38,7 @@ class DataFrameReshuffle(DataFrameOperator, DataFrameOperatorMixin):
          else:
              idx_value = df.index_value
              if isinstance(idx_value.value, IndexValue.RangeIndex):
-                 idx_value = parse_index(pd.Int64Index([0]))
+                 idx_value = parse_index(pd.RangeIndex(1))
          params = df.params
          params["index_value"] = idx_value
          self._output_types = get_output_types(df)
@@ -11,12 +11,14 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
+ import numpy as np
  import pandas as pd
  import pytest

  from .... import dataframe as md
- from ...core import IndexValue
+ from ....tests.utils import assert_mf_index_dtype
+ from ... import DataFrame
+ from ...core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue
  from ..reshuffle import DataFrameReshuffle


@@ -31,8 +33,111 @@ def test_reshuffle():

      r = mdf.mf.reshuffle()
      assert isinstance(r.op, DataFrameReshuffle)
-     assert isinstance(r.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(r.index_value.value, np.int64)

      r = mdf.mf.reshuffle(ignore_index=True)
      assert isinstance(r.op, DataFrameReshuffle)
      assert isinstance(r.index_value.value, IndexValue.RangeIndex)
+
+
+ @pytest.fixture
+ def df1():
+     return DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+
+
+ @pytest.fixture
+ def df2():
+     return DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"])
+
+
+ @pytest.fixture
+ def df3():
+     return DataFrame(
+         [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
+         columns=["a", "b", "c"],
+         index=pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"]),
+     )
+
+
+ def test_flatmap(df1, df2, df3):
+     def f(x, keys):
+         if x["a"] in keys:
+             yield [1, 0]
+             yield [0, 1]
+
+     apply_df = df1[["a"]].mf.flatmap(
+         f,
+         dtypes={"a": "int64", "b": "int64"},
+     )
+     assert apply_df.shape == (np.nan, 2)
+     assert df1.index_value.key != apply_df.index_value.key
+     assert isinstance(df1.index_value.to_pandas(), pd.RangeIndex)
+     assert not isinstance(apply_df.index_value.to_pandas(), pd.RangeIndex)
+     apply_df = df2[["a"]].mf.flatmap(
+         f,
+         dtypes=pd.Series(["int64", "int64"]),
+     )
+     assert apply_df.shape == (np.nan, 2)
+     assert df2.index_value.key != apply_df.index_value.key
+     with pytest.raises(TypeError):
+         apply_s = df3["a"].mf.flatmap(
+             f,
+         )
+     apply_s = df3["a"].mf.flatmap(
+         f,
+         dtype="int64",
+     )
+     assert apply_s.shape == (np.nan,)
+     assert df3.index_value.key != apply_s.index_value.key
+     assert df3.key != apply_s.index_value.key
+     apply_s = df3["a"].mf.flatmap(
+         f,
+         output_type="dataframe",
+         dtypes=["int64", "int64"],
+     )
+     assert apply_s.shape == (np.nan, 2)
+     assert df3.index_value.key != apply_s.index_value.key
+     assert df3.key != apply_s.index_value.key
+
+
+ def test_flatjson():
+     s1 = md.Series(["{{'a': 1, 'b': false}}"], index=[1])
+     df1 = s1.mf.flatjson(
+         ["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
+     )
+     assert df1.shape == (1, 2)
+     assert df1.index_value.key == s1.index_value.key
+     assert isinstance(df1, DATAFRAME_TYPE)
+     assert list(df1.dtypes) == [np.dtype("int32"), np.dtype("bool")]
+     assert list(df1.dtypes.index) == ["a", "b"]
+
+     df2 = s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32"], index=["a"]))
+     assert df2.shape == (1, 1)
+     assert df2.index_value.key == s1.index_value.key
+     assert isinstance(df2, DATAFRAME_TYPE)
+     assert list(df2.dtypes) == [np.dtype("int32")]
+     assert list(df2.dtypes.index) == ["a"]
+
+     s2 = s1.mf.flatjson("$.a", dtype="int32", name="a")
+     assert s2.shape == (1,)
+     assert s2.index_value.key == s1.index_value.key
+     assert isinstance(s2, SERIES_TYPE)
+     assert s2.dtype == np.dtype("int32")
+     assert s2.name == "a"
+
+     with pytest.raises(ValueError):
+         s1.mf.flatjson([], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+     with pytest.raises(ValueError):
+         s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+     with pytest.raises(ValueError):
+         s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+     with pytest.raises(ValueError):
+         s1.mf.flatjson(["$.a", "$.b"], dtypes=pd.Series(["bool"], index=["b"]))
+     with pytest.raises(ValueError):
+         s1.mf.flatjson(
+             ["$.a"],
+             dtype="int32",
+             dtypes=pd.Series(["int32"], index=["a"]),
+         )
+     with pytest.raises(ValueError):
+         s1.mf.flatjson(["$.a"])
@@ -28,7 +28,7 @@ from ..utils import build_df, build_series, parse_index

  cudf = lazy_import("cudf")

- _GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
+ _GROUP_KEYS_NO_DEFAULT = pd_release_version[:2] == (1, 5)
  _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True


@@ -59,7 +59,6 @@ class GroupByCumReductionOperator(DataFrameOperatorMixin, DataFrameOperator):
          out_dtypes = self._calc_out_dtypes(groupby)

          kw = in_df.params.copy()
-         kw["index_value"] = parse_index(pd.RangeIndex(-1), groupby.key)
          if self.output_types[0] == OutputType.dataframe:
              kw.update(
                  dict(
@@ -35,12 +35,15 @@ class GroupByFillOperator(DataFrameOperator, DataFrameOperatorMixin):
          func_name = getattr(self, "_func_name")

          if func_name == "fillna":
+             kw = {}
+             if self.axis is not None:
+                 kw["axis"] = self.axis
              result_df = mock_groupby.fillna(
                  value=self.value,
                  method=self.method,
-                 axis=self.axis,
                  limit=self.limit,
                  downcast=self.downcast,
+                 **kw,
              )
          else:
              result_df = getattr(mock_groupby, func_name)(limit=self.limit)
@@ -88,5 +88,11 @@ def df_groupby_getitem(df_groupby, item):
      if df_groupby.selection:
          raise IndexError(f"Column(s) {df_groupby.selection!r} already selected")

+     if (
+         isinstance(item, tuple)
+         and item not in df_groupby.dtypes
+         and item not in df_groupby.index.names
+     ):
+         item = list(item)
      op = GroupByIndex(selection=item, output_types=output_types)
      return op(df_groupby)
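
For context on the getitem change above: selecting several columns from a pandas GroupBy is expressed with a list of labels, so a tuple that is not itself a column or index name is normalized to a list. A small pandas-only illustration (plain pandas, not maxframe code):

    import pandas as pd

    df = pd.DataFrame({"k": [1, 1, 2], "a": [1, 2, 3], "b": [4, 5, 6]})

    # Multi-column selection on a GroupBy takes a list of labels.
    by_list = df.groupby("k")[["a", "b"]].sum()

    # A tuple like ("a", "b") is not a column label here, so treating it as
    # the list ["a", "b"] yields the same multi-column selection.
    by_tuple = df.groupby("k")[list(("a", "b"))].sum()

    assert by_list.equals(by_tuple)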
@@ -230,7 +230,7 @@ def test_groupby_transform():
      assert r.op._op_type_ == opcodes.TRANSFORM
      assert r.op.output_types[0] == OutputType.dataframe

-     r = mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
+     r = mdf[list("abde")].groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
      assert r.shape == (np.nan, 6)
      assert r.op._op_type_ == opcodes.TRANSFORM
      assert r.op.output_types[0] == OutputType.dataframe
@@ -282,14 +282,17 @@ def test_groupby_cum():
          r = getattr(mdf.groupby("b"), fun)()
          assert r.op.output_types[0] == OutputType.dataframe
          assert r.shape == (len(df1), 2)
+         assert r.index_value.key == mdf.index_value.key

          r = getattr(mdf.groupby("b"), fun)(axis=1)
          assert r.op.output_types[0] == OutputType.dataframe
          assert r.shape == (len(df1), 3)
+         assert r.index_value.key == mdf.index_value.key

      r = mdf.groupby("b").cumcount()
      assert r.op.output_types[0] == OutputType.series
      assert r.shape == (len(df1),)
+     assert r.index_value.key == mdf.index_value.key

      series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6])
      ms1 = md.Series(series1, chunk_size=3)
@@ -298,6 +301,7 @@ def test_groupby_cum():
          r = getattr(ms1.groupby(lambda x: x % 2), fun)()
          assert r.op.output_types[0] == OutputType.series
          assert r.shape == (len(series1),)
+         assert r.index_value.key == ms1.index_value.key


  def test_groupby_fill():
@@ -12,6 +12,8 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import logging
+
  import numpy as np
  import pandas as pd

@@ -22,6 +24,8 @@ from ...utils import quiet_stdio
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
  from ..utils import parse_index

+ logger = logging.getLogger(__name__)
+

  class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
      _op_type_ = opcodes.TRANSFORM
@@ -65,7 +69,7 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
              output_types = [OutputType.series]
              new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype)
          except:  # noqa: E722  # nosec
-             pass
+             logger.info("Exception raised while inferring df_func", exc_info=True)

          self.output_types = output_types if not self.output_types else self.output_types
          dtypes = new_dtypes if dtypes is None else dtypes
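
The last hunk replaces a silent `pass` during best-effort dtype inference with a logged message. A minimal sketch of the pattern, assuming a hypothetical `infer_or_none` helper; only the standard-library `logging` usage is taken from the hunk above:

    import logging

    logger = logging.getLogger(__name__)

    def infer_or_none(func, sample):
        # Best-effort inference: failures are recorded with a traceback via
        # exc_info=True instead of being silently swallowed.
        try:
            return func(sample)
        except Exception:
            logger.info("Exception raised while inferring df_func", exc_info=True)
            return None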