maxframe 2.2.0__cp310-cp310-macosx_10_9_universal2.whl → 2.3.0rc1__cp310-cp310-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic.
- maxframe/_utils.cpython-310-darwin.so +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cpython-310-darwin.so +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cpython-310-darwin.so +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/misc/infer_dtypes.py
ADDED
@@ -0,0 +1,251 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ... import opcodes
+from ...serialization.serializables import AnyField, StringField
+from ..core import DATAFRAME_TYPE, SERIES_TYPE
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+
+
+class DataFrameInferDtypes(DataFrameOperator, DataFrameOperatorMixin):
+    _op_type_ = opcodes.DATAFRAME_INFER_DTYPES
+
+    infer_method = StringField("infer_method")
+    infer_kwargs = AnyField("infer_kwargs")
+
+    infer_stage = StringField("infer_stage", default=None)
+
+    def __init__(self, output_types=None, **kw):
+        super().__init__(_output_types=output_types, **kw)
+
+    def __call__(self, df):
+        if isinstance(df, DATAFRAME_TYPE):
+            return self.new_dataframe(
+                [df],
+                shape=df.shape,
+                dtypes=None,
+                index_value=df.index_value,
+                columns_value=df.columns_value,
+            )
+        else:
+            assert isinstance(df, SERIES_TYPE)
+            return self.new_series(
+                [df],
+                shape=df.shape,
+                dtype=None,
+                name=df.name,
+                index_value=df.index_value,
+            )
+
+
+def convert_dtypes(
+    df_or_series,
+    infer_objects=True,
+    convert_string=True,
+    convert_integer=True,
+    convert_boolean=True,
+    convert_floating=True,
+    dtype_backend="numpy",
+):
+    """
+    Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
+
+    Parameters
+    ----------
+    infer_objects : bool, default True
+        Whether object dtypes should be converted to the best possible types.
+    convert_string : bool, default True
+        Whether object dtypes should be converted to ``StringDtype()``.
+    convert_integer : bool, default True
+        Whether, if possible, conversion can be done to integer extension types.
+    convert_boolean : bool, defaults True
+        Whether object dtypes should be converted to ``BooleanDtypes()``.
+    convert_floating : bool, defaults True
+        Whether, if possible, conversion can be done to floating extension types.
+        If `convert_integer` is also True, preference will be give to integer
+        dtypes if the floats can be faithfully casted to integers.
+
+    Returns
+    -------
+    Series or DataFrame
+        Copy of input object with new dtype.
+
+    See Also
+    --------
+    infer_objects : Infer dtypes of objects.
+    to_datetime : Convert argument to datetime.
+    to_timedelta : Convert argument to timedelta.
+    to_numeric : Convert argument to a numeric type.
+
+    Notes
+    -----
+    By default, ``convert_dtypes`` will attempt to convert a Series (or each
+    Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
+    ``convert_string``, ``convert_integer``, ``convert_boolean`` and
+    ``convert_boolean``, it is possible to turn off individual conversions
+    to ``StringDtype``, the integer extension types, ``BooleanDtype``
+    or floating extension types, respectively.
+
+    For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
+    rules as during normal Series/DataFrame construction. Then, if possible,
+    convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
+    or floating extension type, otherwise leave as ``object``.
+
+    If the dtype is integer, convert to an appropriate integer extension type.
+
+    If the dtype is numeric, and consists of all integers, convert to an
+    appropriate integer extension type. Otherwise, convert to an
+    appropriate floating extension type.
+
+    .. versionchanged:: 1.2
+        Starting with pandas 1.2, this method also converts float columns
+        to the nullable floating extension type.
+
+    In the future, as new dtypes are added that support ``pd.NA``, the results
+    of this method will change to support those new dtypes.
+
+    Examples
+    --------
+    >>> import maxframe.tensor as mt
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame(
+    ...     {
+    ...         "a": md.Series([1, 2, 3], dtype=mt.dtype("int32")),
+    ...         "b": md.Series(["x", "y", "z"], dtype=mt.dtype("O")),
+    ...         "c": md.Series([True, False, mt.nan], dtype=mt.dtype("O")),
+    ...         "d": md.Series(["h", "i", mt.nan], dtype=mt.dtype("O")),
+    ...         "e": md.Series([10, mt.nan, 20], dtype=mt.dtype("float")),
+    ...         "f": md.Series([mt.nan, 100.5, 200], dtype=mt.dtype("float")),
+    ...     }
+    ... )
+
+    Start with a DataFrame with default dtypes.
+
+    >>> df.execute()
+       a  b      c    d     e      f
+    0  1  x   True    h  10.0    NaN
+    1  2  y  False    i   NaN  100.5
+    2  3  z    NaN  NaN  20.0  200.0
+
+    >>> df.dtypes.execute()
+    a      int32
+    b     object
+    c     object
+    d     object
+    e    float64
+    f    float64
+    dtype: object
+
+    Convert the DataFrame to use best possible dtypes.
+
+    >>> dfn = df.convert_dtypes()
+    >>> dfn.execute()
+       a  b      c     d     e      f
+    0  1  x   True     h    10   <NA>
+    1  2  y  False     i  <NA>  100.5
+    2  3  z   <NA>  <NA>    20  200.0
+
+    >>> dfn.dtypes.execute()
+    a      Int32
+    b     string
+    c    boolean
+    d     string
+    e      Int64
+    f    Float64
+    dtype: object
+
+    Start with a Series of strings and missing data represented by ``np.nan``.
+
+    >>> s = md.Series(["a", "b", mt.nan])
+    >>> s.execute()
+    0      a
+    1      b
+    2    NaN
+    dtype: object
+
+    Obtain a Series with dtype ``StringDtype``.
+
+    >>> s.convert_dtypes().execute()
+    0       a
+    1       b
+    2    <NA>
+    dtype: string
+    """
+    dtype_backend = "numpy" if dtype_backend == "numpy_nullable" else dtype_backend
+    op = DataFrameInferDtypes(
+        infer_method="convert_dtypes",
+        infer_kwargs=dict(
+            infer_objects=infer_objects,
+            convert_string=convert_string,
+            convert_integer=convert_integer,
+            convert_boolean=convert_boolean,
+            convert_floating=convert_floating,
+            dtype_backend=dtype_backend,
+        ),
+    )
+    return op(df_or_series)
+
+
+def infer_objects(df_or_series, copy=True):
+    """
+    Attempt to infer better dtypes for object columns.
+
+    Attempts soft conversion of object-dtyped
+    columns, leaving non-object and unconvertible
+    columns unchanged. The inference rules are the
+    same as during normal Series/DataFrame construction.
+
+    Returns
+    -------
+    converted : same type as input object
+
+    See Also
+    --------
+    to_datetime : Convert argument to datetime.
+    to_timedelta : Convert argument to timedelta.
+    to_numeric : Convert argument to numeric type.
+    convert_dtypes : Convert argument to best possible dtype.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame({"A": ["a", 1, 2, 3]})
+    >>> df = df.iloc[1:]
+    >>> df.execute()
+       A
+    1  1
+    2  2
+    3  3
+
+    >>> df.dtypes.execute()
+    A    object
+    dtype: object
+
+    >>> df.infer_objects().dtypes.execute()
+    A    int64
+    dtype: object
+    """
+    if (isinstance(df_or_series, SERIES_TYPE) and df_or_series.dtype != "O") or (
+        isinstance(df_or_series, DATAFRAME_TYPE)
+        and all(dt != "O" for dt in df_or_series.dtypes)
+    ):
+        # no objects to cast
+        return df_or_series
+
+    _ = copy  # in MaxFrame data are immutable, thus ignore the parameter
+    op = DataFrameInferDtypes(
+        infer_method="infer_objects",
+        infer_kwargs={},
+    )
+    return op(df_or_series)
maxframe/dataframe/misc/map.py
CHANGED
@@ -21,8 +21,8 @@ import pandas as pd
 from ... import opcodes
 from ...core import EntityData, OutputType
 from ...serialization.serializables import AnyField, KeyField, StringField
-from ...udf import BuiltinFunction, MarkedFunction
-from ...utils import quiet_stdio
+from ...udf import BuiltinFunction, MarkedFunction, ODPSFunction
+from ...utils import make_dtype, quiet_stdio
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_series, copy_func_scheduling_hints
@@ -40,6 +40,7 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
         if not self.output_types:
             self.output_types = [OutputType.series]
         if hasattr(self, "arg"):
+            self.arg = ODPSFunction.wrap(self.arg)
             copy_func_scheduling_hints(self.arg, self)

     @classmethod
@@ -55,25 +56,34 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
         ) and not isinstance(self.arg, BuiltinFunction)

     def __call__(self, series, dtype, skip_infer=False):
-        if dtype is
-
-
+        if dtype is not None:
+            dtype = make_dtype(dtype)
+        else:
+            # obtain dtype from existing hints
+            if isinstance(self.arg, ODPSFunction):
+                if self.arg.result_dtype is not None:
+                    dtype = self.arg.result_dtype
+            elif callable(self.arg):
                 # arg is a function, try to inspect the signature
                 sig = inspect.signature(self.arg)
                 return_type = sig.return_annotation
                 if return_type is not inspect._empty:
-
-
-
-
-
-
-
-
-
-
-
-
+                    dtype = np.dtype(return_type)
+
+        err_prefix = None
+        if dtype is None and not skip_infer:
+            inferred_dtype = None
+            if callable(self.arg):
+                try:
+                    with quiet_stdio():
+                        # try to infer dtype by calling the function
+                        inferred_dtype = (
+                            build_series(series)
+                            .map(self.arg, na_action=self.na_action)
+                            .dtype
+                        )
+                except:  # noqa: E722  # nosec
+                    pass
             else:
                 if isinstance(self.arg, MutableMapping):
                     inferred_dtype = pd.Series(self.arg).dtype
@@ -86,13 +96,16 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
                     # but for int, due to the nan which may occur,
                     # we cannot infer the dtype
                     dtype = inferred_dtype
+                else:
+                    err_prefix = "int type may not be exact"
             else:
                 dtype = inferred_dtype

         if dtype is None:
             if not skip_infer:
+                err_prefix = err_prefix or "cannot infer dtype"
                 raise ValueError(
-                    "
+                    f"{err_prefix}, it needs to be specified manually for `map`"
                 )
         else:
             dtype = np.int64 if dtype is int else dtype
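
To summarise the resolution order implemented in __call__ above: an explicit dtype wins, then a result dtype carried by an ODPSFunction, then a return-type annotation, and finally a trial run on a mock series. The sketch below mirrors the new test_map_with_functions test further down in this diff; it is illustrative only, and the ODPSFunction constructor arguments are taken from that test rather than from documented API.

import numpy as np
import maxframe.dataframe as md
from maxframe.udf import ODPSFunction

s = md.Series([1, 2, 3])

# explicit dtype, skipping inference entirely
s.map(lambda v: v, dtype="float64", skip_infer=True)

# an ODPS UDF carrying its own result dtype
s.map(ODPSFunction("test_odps_udf", dtype=np.float64))

# a return-type annotation is converted via np.dtype(...)
def as_int(v) -> int:
    return v

s.map(as_int)             # dtype resolves to int64

# otherwise the callable is trial-run on a mock series to infer the dtype
s.map(lambda v: v * 1.0)  # dtype resolves to float64
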
maxframe/dataframe/misc/repeat.py
ADDED
@@ -0,0 +1,159 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+import numpy as np
+from pandas.api.types import is_list_like
+
+from ... import opcodes
+from ...core import ENTITY_TYPE, EntityData, get_output_types
+from ...serialization.serializables import AnyField
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+from ..utils import parse_index, validate_axis
+
+
+class DataFrameRepeat(DataFrameOperator, DataFrameOperatorMixin):
+    _op_type_ = opcodes.REPEAT
+
+    repeats = AnyField("repeats", default=None)
+
+    def __init__(self, output_types=None, **kw):
+        super().__init__(_output_types=output_types, **kw)
+
+    @classmethod
+    def _set_inputs(cls, op: "DataFrameRepeat", inputs: List[EntityData]):
+        super()._set_inputs(op, inputs)
+        if isinstance(op.repeats, ENTITY_TYPE):
+            op.repeats = inputs[1]
+
+    def __call__(self, obj, repeats):
+        self._output_types = get_output_types(obj)
+        test_index = obj.index_value.to_pandas()[:0]
+
+        params = obj.params
+        params["index_value"] = parse_index(test_index, obj, type(self), self.repeats)
+        params["shape"] = (np.nan,)
+
+        inputs = [obj]
+        if isinstance(repeats, ENTITY_TYPE):
+            inputs.append(repeats)
+        return self.new_tileable(inputs, **params)
+
+
+def _repeat(obj, repeats, axis=None):
+    from ...tensor.datasource import tensor
+
+    axis = validate_axis(axis or 0, obj)
+    if is_list_like(repeats):
+        repeats = tensor(repeats)
+    op = DataFrameRepeat(repeats=repeats, axis=axis)
+    return op(obj, repeats)
+
+
+def series_repeat(obj, repeats, axis=None):
+    """
+    Repeat elements of a Series.
+
+    Returns a new Series where each element of the current Series
+    is repeated consecutively a given number of times.
+
+    Parameters
+    ----------
+    repeats : int or array of ints
+        The number of repetitions for each element. This should be a
+        non-negative integer. Repeating 0 times will return an empty
+        Series.
+    axis : None
+        Must be ``None``. Has no effect but is accepted for compatibility
+        with numpy.
+
+    Returns
+    -------
+    Series
+        Newly created Series with repeated elements.
+
+    See Also
+    --------
+    Index.repeat : Equivalent function for Index.
+    numpy.repeat : Similar method for :class:`numpy.ndarray`.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> s = md.Series(['a', 'b', 'c'])
+    >>> s.execute()
+    0    a
+    1    b
+    2    c
+    dtype: object
+    >>> s.repeat(2).execute()
+    0    a
+    0    a
+    1    b
+    1    b
+    2    c
+    2    c
+    dtype: object
+    >>> s.repeat([1, 2, 3]).execute()
+    0    a
+    1    b
+    1    b
+    2    c
+    2    c
+    2    c
+    dtype: object
+    """
+    return _repeat(obj, repeats, axis=axis)
+
+
+def index_repeat(obj, repeats, axis=None):
+    """
+    Repeat elements of an Index.
+
+    Returns a new Index where each element of the current Index
+    is repeated consecutively a given number of times.
+
+    Parameters
+    ----------
+    repeats : int or array of ints
+        The number of repetitions for each element. This should be a
+        non-negative integer. Repeating 0 times will return an empty
+        Index.
+    axis : None
+        Must be ``None``. Has no effect but is accepted for compatibility
+        with numpy.
+
+    Returns
+    -------
+    repeated_index : Index
+        Newly created Index with repeated elements.
+
+    See Also
+    --------
+    Series.repeat : Equivalent function for Series.
+    numpy.repeat : Similar method for :class:`numpy.ndarray`.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> idx = md.Index(['a', 'b', 'c'])
+    >>> idx.execute()
+    Index(['a', 'b', 'c'], dtype='object')
+    >>> idx.repeat(2).execute()
+    Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')
+    >>> idx.repeat([1, 2, 3]).execute()
+    Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')
+    """
+    return _repeat(obj, repeats, axis=axis)
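
A brief usage sketch for the repeat support added above, following the docstrings; note that _repeat wraps list-like repeats into a tensor, so tensor-valued counts are accepted as well. Illustrative only, not verified against this release.

import maxframe.dataframe as md
import maxframe.tensor as mt

s = md.Series(["a", "b", "c"])
s.repeat(2)                     # scalar count
s.repeat([1, 2, 3])             # list-like counts, wrapped into a tensor internally
s.repeat(mt.tensor([1, 2, 3]))  # an existing tensor of counts is also accepted

idx = md.Index(["a", "b", "c"])
idx.repeat(2)                   # equivalent Index variant
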
maxframe/dataframe/misc/tests/test_misc.py
CHANGED
@@ -22,7 +22,7 @@ from .... import opcodes
 from ....core import OutputType
 from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
-from ....udf import with_running_options
+from ....udf import ODPSFunction, with_running_options
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
 from ...arithmetic import DataFrameGreater, DataFrameLess
@@ -613,3 +613,37 @@ def test_pivot_table():
     t = df.pivot_table(index=["A", "B"], columns="C", aggfunc="sum")
     assert isinstance(t.op, DataFramePivotTable)
     assert t.shape == (np.nan, np.nan)
+
+
+def test_map_with_functions():
+    raw = pd.Series([1, 2, 3], name="s_name")
+    series = from_pandas_series(raw, chunk_size=2)
+
+    # inferred type may not be exact
+    def fn1(val):
+        return val
+
+    with pytest.raises(ValueError, match="int type"):
+        series.map(fn1)
+    mapped = series.map(fn1, dtype="float64", skip_infer=True)
+    assert mapped.dtype == np.dtype("float64")
+
+    # test when type infer is valid
+    def fn2(val):
+        return val * 1.0
+
+    mapped = series.map(fn2)
+    assert mapped.dtype == np.dtype("float64")
+
+    # test function with type annotations
+    def fn3(val) -> int:
+        return val
+
+    mapped = series.map(fn3)
+    assert mapped.dtype == np.dtype("int64")
+
+    # test odps function
+    odps_func = ODPSFunction("test_odps_udf", dtype=np.float64)
+    mapped = series.map(odps_func)
+    assert isinstance(mapped.op.arg, ODPSFunction)
+    assert mapped.dtype == np.dtype("float64")
maxframe/dataframe/missing/checkna.py
CHANGED
@@ -22,6 +22,7 @@ from ... import tensor as mt
 from ...core import ENTITY_TYPE, OutputType
 from ...serialization.serializables import BoolField
 from ...tensor.core import TENSOR_TYPE
+from ...utils import get_pd_option
 from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE, MultiIndex
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -138,7 +139,7 @@ def isna(obj):
     2     True
     dtype: bool
     """
-    use_inf_as_na =
+    use_inf_as_na = get_pd_option("mode.use_inf_as_na", False)
     if isinstance(obj, MultiIndex):
         raise NotImplementedError("isna is not defined for MultiIndex")
     elif isinstance(obj, ENTITY_TYPE):
@@ -213,7 +214,7 @@ def notna(obj):
     2    False
     dtype: bool
     """
-    use_inf_as_na =
+    use_inf_as_na = get_pd_option("mode.use_inf_as_na", False)
    if isinstance(obj, MultiIndex):
         raise NotImplementedError("isna is not defined for MultiIndex")
     elif isinstance(obj, ENTITY_TYPE):
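
The two hunks above route the use_inf_as_na lookup through the new get_pd_option helper, which falls back to the given default when the pandas option is unavailable (the option is deprecated or removed in newer pandas). A hedged sketch of the intended effect, not verified against this release:

import pandas as pd
import maxframe.dataframe as md

s = md.Series([1.0, float("inf"), None])
md.isna(s).execute()   # only None/NaN are treated as missing by default

# on pandas versions that still expose the option, inf can also count as NA
try:
    with pd.option_context("mode.use_inf_as_na", True):
        md.isna(s).execute()
except Exception:
    pass  # option gone in newer pandas; get_pd_option then returns the default (False)
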
maxframe/dataframe/reduction/__init__.py
CHANGED
@@ -17,7 +17,7 @@ from .all import DataFrameAll
 from .any import DataFrameAny
 from .argmax import DataFrameArgMax
 from .argmin import DataFrameArgMin
-from .core import CustomReduction
+from .core import CustomReduction, NamedAgg
 from .count import DataFrameCount
 from .cummax import DataFrameCummax
 from .cummin import DataFrameCummin
@@ -31,6 +31,7 @@ from .max import DataFrameMax
 from .mean import DataFrameMean
 from .median import DataFrameMedian
 from .min import DataFrameMin
+from .mode import DataFrameMode
 from .nunique import DataFrameNunique
 from .prod import DataFrameProd
 from .reduction_size import DataFrameSize
@@ -47,8 +48,8 @@ def _install():
     from .aggregation import aggregate
     from .all import all_dataframe, all_index, all_series
     from .any import any_dataframe, any_index, any_series
-    from .argmax import
-    from .argmin import
+    from .argmax import argmax_series_index
+    from .argmin import argmin_series_index
     from .count import count_dataframe, count_series
     from .cov import cov_dataframe, cov_series
     from .cummax import cummax
@@ -62,6 +63,7 @@ def _install():
     from .mean import mean_dataframe, mean_series
     from .median import median_dataframe, median_series
     from .min import min_dataframe, min_index, min_series
+    from .mode import mode_dataframe, mode_series
     from .nunique import nunique_dataframe, nunique_series
     from .prod import prod_dataframe, prod_series
     from .reduction_size import size_dataframe, size_series
@@ -76,8 +78,8 @@ def _install():
         ("aggregate", aggregate, aggregate),
         ("all", all_series, all_dataframe),
         ("any", any_series, any_dataframe),
-        ("argmax",
-        ("argmin",
+        ("argmax", argmax_series_index, None),
+        ("argmin", argmin_series_index, None),
         ("count", count_series, count_dataframe),
         ("cov", cov_series, cov_dataframe),
         ("cummax", cummax, cummax),
@@ -92,6 +94,7 @@ def _install():
         ("mean", mean_series, mean_dataframe),
         ("median", median_series, median_dataframe),
         ("min", min_series, min_dataframe),
+        ("mode", mode_series, mode_dataframe),
         ("nunique", nunique_series, nunique_dataframe),
         ("prod", prod_series, prod_dataframe),
         ("product", prod_series, prod_dataframe),
@@ -118,6 +121,8 @@ def _install():
         setattr(t, "any", any_index)
         setattr(t, "min", min_index)
         setattr(t, "max", max_index)
+        setattr(t, "argmin", argmin_series_index)
+        setattr(t, "argmax", argmax_series_index)


 _install()
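
The registrations above expose mode on Series and DataFrame, and argmax/argmin on Series and Index objects (the DataFrame slot for argmax/argmin is left as None). A minimal, unverified usage sketch, expected to follow pandas semantics:

import maxframe.dataframe as md

s = md.Series([1, 2, 2, 3])
s.mode().execute()      # most frequent value(s)
s.argmax().execute()    # positional index of the maximum
s.argmin().execute()    # positional index of the minimum

df = md.DataFrame({"a": [1, 1, 2], "b": [3, 3, 3]})
df.mode().execute()     # column-wise modes

idx = md.Index([1, 5, 3])
idx.argmax().execute()  # argmin/argmax are now also attached to Index types
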
maxframe/dataframe/reduction/aggregation.py
CHANGED
@@ -38,7 +38,7 @@ from ...serialization.serializables import (
 )
 from ...typing_ import TileableType
 from ...udf import BuiltinFunction
-from ...utils import lazy_import, pd_release_version
+from ...utils import get_pd_option, lazy_import, pd_release_version
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_df, build_empty_df, build_series, parse_index, validate_axis
 from .core import (
@@ -92,8 +92,8 @@ class DataFrameAggregate(DataFrameOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.AGGREGATE

     raw_func = AnyField("raw_func")
-    raw_func_kw = DictField("raw_func_kw")
-    func = AnyField("func")
+    raw_func_kw = DictField("raw_func_kw", default=None)
+    func = AnyField("func", default=None)
     func_rename = ListField("func_rename", default=None)
     axis = AnyField("axis", default=0)
     numeric_only = BoolField("numeric_only", default=None)
@@ -199,7 +199,7 @@ class DataFrameAggregate(DataFrameOperator, DataFrameOperatorMixin):
         normalize_reduction_funcs(self, ndim=df.ndim)
         compile_reduction_funcs(self, df)
         if output_type is None or dtypes is None:
-            with enter_mode(kernel=False, build=False):
+            with enter_mode(kernel=False, build=False, mock=True):
                 dtypes, index = self._calc_result_shape(df)
         else:
             self.output_types = [output_type]
@@ -231,7 +231,7 @@ class DataFrameAggregate(DataFrameOperator, DataFrameOperatorMixin):
             return self.new_series(
                 [df],
                 shape=new_shape,
-                dtype=dtypes[0],
+                dtype=dtypes.iloc[0],
                 name=dtypes.index[0],
                 index_value=new_index,
             )
@@ -456,7 +456,7 @@ def aggregate(df, func=None, axis=0, **kw):
     min    1
     """
     axis = validate_axis(axis, df)
-    use_inf_as_na = kw.pop("_use_inf_as_na",
+    use_inf_as_na = kw.pop("_use_inf_as_na", get_pd_option("mode.use_inf_as_na", False))
     if func == "unique":
         # workaround for direct call of unique function which
         # returns a tensor directly