maxframe 2.2.0__cp39-cp39-macosx_10_9_universal2.whl → 2.3.0rc1__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-39-darwin.so +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cpython-39-darwin.so +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cpython-39-darwin.so +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -40,7 +40,7 @@ class DataFrameArgMax(DataFrameReduction, DataFrameReductionMixin):
|
|
|
40
40
|
return ReductionCallable(func_name=func_name, kwargs=kw)
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def argmax_series(series, axis=0, skipna=True):
|
|
43
|
+
def argmax_series_index(series_or_index, axis=0, skipna=True, *args, **kwargs):
|
|
44
44
|
"""
|
|
45
45
|
Return int position of the smallest value in the Series.
|
|
46
46
|
|
|
@@ -65,7 +65,7 @@ def argmax_series(series, axis=0, skipna=True):
|
|
|
65
65
|
--------
|
|
66
66
|
Series.argmin : Return position of the minimum value.
|
|
67
67
|
Series.argmax : Return position of the maximum value.
|
|
68
|
-
|
|
68
|
+
maxframe.tensor.argmax : Equivalent method for tensors.
|
|
69
69
|
Series.idxmax : Return index label of the maximum values.
|
|
70
70
|
Series.idxmin : Return index label of the minimum values.
|
|
71
71
|
|
|
@@ -92,9 +92,12 @@ def argmax_series(series, axis=0, skipna=True):
|
|
|
92
92
|
the minimum cereal calories is the first element,
|
|
93
93
|
since series is zero-indexed.
|
|
94
94
|
"""
|
|
95
|
-
|
|
95
|
+
# args not implemented, just ignore
|
|
96
|
+
_ = args, kwargs
|
|
97
|
+
|
|
98
|
+
validate_axis(axis, series_or_index)
|
|
96
99
|
op = DataFrameArgMax(
|
|
97
100
|
dropna=skipna,
|
|
98
101
|
output_types=[OutputType.scalar],
|
|
99
102
|
)
|
|
100
|
-
return op(series)
|
|
103
|
+
return op(series_or_index)
|
|
@@ -40,7 +40,7 @@ class DataFrameArgMin(DataFrameReduction, DataFrameReductionMixin):
|
|
|
40
40
|
return ReductionCallable(func_name=func_name, kwargs=kw)
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def argmin_series(series, axis=0, skipna=True):
|
|
43
|
+
def argmin_series_index(series_or_index, axis=0, skipna=True, *args, **kwargs):
|
|
44
44
|
"""
|
|
45
45
|
Return int position of the smallest value in the Series.
|
|
46
46
|
|
|
@@ -65,7 +65,7 @@ def argmin_series(series, axis=0, skipna=True):
|
|
|
65
65
|
--------
|
|
66
66
|
Series.argmin : Return position of the minimum value.
|
|
67
67
|
Series.argmax : Return position of the maximum value.
|
|
68
|
-
|
|
68
|
+
maxframe.tensor.argmin : Equivalent method for tensors.
|
|
69
69
|
Series.idxmax : Return index label of the maximum values.
|
|
70
70
|
Series.idxmin : Return index label of the minimum values.
|
|
71
71
|
|
|
@@ -92,9 +92,12 @@ def argmin_series(series, axis=0, skipna=True):
|
|
|
92
92
|
the minimum cereal calories is the first element,
|
|
93
93
|
since series is zero-indexed.
|
|
94
94
|
"""
|
|
95
|
-
|
|
95
|
+
# args not implemented, just ignore
|
|
96
|
+
_ = args, kwargs
|
|
97
|
+
|
|
98
|
+
validate_axis(axis, series_or_index)
|
|
96
99
|
op = DataFrameArgMin(
|
|
97
100
|
dropna=skipna,
|
|
98
101
|
output_types=[OutputType.scalar],
|
|
99
102
|
)
|
|
100
|
-
return op(series)
|
|
103
|
+
return op(series_or_index)
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import functools
|
|
16
16
|
import inspect
|
|
17
|
-
from collections import OrderedDict
|
|
17
|
+
from collections import OrderedDict, namedtuple
|
|
18
18
|
from typing import Any, Dict, List, NamedTuple, Optional, Tuple
|
|
19
19
|
|
|
20
20
|
import msgpack
|
|
@@ -32,7 +32,7 @@ from ...serialization.serializables import (
|
|
|
32
32
|
StringField,
|
|
33
33
|
)
|
|
34
34
|
from ...typing_ import TileableType
|
|
35
|
-
from ...utils import get_item_if_scalar, pd_release_version, tokenize
|
|
35
|
+
from ...utils import get_item_if_scalar, get_pd_option, pd_release_version, tokenize
|
|
36
36
|
from ..operators import DATAFRAME_TYPE, DataFrameOperator, DataFrameOperatorMixin
|
|
37
37
|
from ..utils import (
|
|
38
38
|
build_df,
|
|
@@ -52,6 +52,8 @@ _reduce_bool_as_object = pd_release_version[:2] != (1, 2)
|
|
|
52
52
|
|
|
53
53
|
_idx_reduction_without_numeric_only = pd_release_version[:2] < (1, 5)
|
|
54
54
|
|
|
55
|
+
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
|
|
56
|
+
|
|
55
57
|
|
|
56
58
|
class DataFrameReduction(DataFrameOperator):
|
|
57
59
|
_legacy_name = "DataFrameReductionOperator" # since v2.2.0
|
|
@@ -70,7 +72,7 @@ class DataFrameReduction(DataFrameOperator):
|
|
|
70
72
|
|
|
71
73
|
def __init__(self, gpu=None, sparse=None, output_types=None, **kw):
|
|
72
74
|
kw["use_inf_as_na"] = kw.pop(
|
|
73
|
-
"use_inf_as_na",
|
|
75
|
+
"use_inf_as_na", get_pd_option("mode.use_inf_as_na", False)
|
|
74
76
|
)
|
|
75
77
|
super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
|
|
76
78
|
|
|
@@ -104,7 +106,7 @@ class DataFrameCumReduction(DataFrameOperator):
|
|
|
104
106
|
|
|
105
107
|
def __init__(self, gpu=None, sparse=None, output_types=None, **kw):
|
|
106
108
|
kw["use_inf_as_na"] = kw.pop(
|
|
107
|
-
"use_inf_as_na",
|
|
109
|
+
"use_inf_as_na", get_pd_option("mode.use_inf_as_na", False)
|
|
108
110
|
)
|
|
109
111
|
super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
|
|
110
112
|
|
|
@@ -300,10 +302,13 @@ class DataFrameReductionMixin(DataFrameOperatorMixin):
|
|
|
300
302
|
|
|
301
303
|
if func_name == "custom_reduction":
|
|
302
304
|
empty_series = build_series(series, ensure_string=True)
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
305
|
+
custom_reduction_obj = getattr(self, "custom_reduction")
|
|
306
|
+
result_dtype = getattr(custom_reduction_obj, "result_dtype", None)
|
|
307
|
+
if result_dtype is None:
|
|
308
|
+
result_scalar = custom_reduction_obj.__call_agg__(empty_series)
|
|
309
|
+
if hasattr(result_scalar, "to_pandas"): # pragma: no cover
|
|
310
|
+
result_scalar = result_scalar.to_pandas()
|
|
311
|
+
result_dtype = pd.Series(result_scalar).dtype
|
|
307
312
|
else:
|
|
308
313
|
result_dtype = _get_series_reduction_dtype(
|
|
309
314
|
series.dtype,
|
|
@@ -378,6 +383,10 @@ class CustomReduction:
|
|
|
378
383
|
def __name__(self):
|
|
379
384
|
return self.name
|
|
380
385
|
|
|
386
|
+
@property
|
|
387
|
+
def result_dtype(self):
|
|
388
|
+
return None
|
|
389
|
+
|
|
381
390
|
def __call__(self, value):
|
|
382
391
|
if isinstance(value, ENTITY_TYPE):
|
|
383
392
|
from .custom_reduction import build_custom_reduction_result
|
|
@@ -512,7 +521,7 @@ class ReductionCompiler:
|
|
|
512
521
|
def _check_function_valid(cls, func):
|
|
513
522
|
if isinstance(func, functools.partial):
|
|
514
523
|
return cls._check_function_valid(func.func)
|
|
515
|
-
elif
|
|
524
|
+
elif not hasattr(func, "__code__"):
|
|
516
525
|
return
|
|
517
526
|
|
|
518
527
|
func_code = func.__code__
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
from ... import opcodes
|
|
18
|
+
from ...core import OutputType, get_output_types
|
|
19
|
+
from ...serialization.serializables import BoolField, Int32Field
|
|
20
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
21
|
+
from ..utils import parse_index, validate_axis
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DataFrameMode(DataFrameOperator, DataFrameOperatorMixin):
|
|
25
|
+
_op_type_ = opcodes.MODE
|
|
26
|
+
|
|
27
|
+
axis = Int32Field("axis", default=None)
|
|
28
|
+
numeric_only = BoolField("numeric_only", default=False)
|
|
29
|
+
dropna = BoolField("dropna", default=True)
|
|
30
|
+
combine_size = Int32Field("combine_size", default=None)
|
|
31
|
+
|
|
32
|
+
def __call__(self, in_obj):
|
|
33
|
+
self._output_types = get_output_types(in_obj)
|
|
34
|
+
params = in_obj.params
|
|
35
|
+
shape = list(in_obj.shape)
|
|
36
|
+
shape[self.axis] = np.nan
|
|
37
|
+
params["shape"] = tuple(shape)
|
|
38
|
+
|
|
39
|
+
if self.axis == 0:
|
|
40
|
+
pd_idx = in_obj.index_value.to_pandas()[:0]
|
|
41
|
+
params["index_value"] = parse_index(pd_idx)
|
|
42
|
+
else:
|
|
43
|
+
pd_idx = in_obj.columns_value.to_pandas()[:0]
|
|
44
|
+
params["columns_value"] = parse_index(pd_idx)
|
|
45
|
+
params["dtypes"] = None
|
|
46
|
+
return self.new_tileable([in_obj], **params)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def mode_dataframe(df, axis=0, numeric_only=False, dropna=True, combine_size=None):
|
|
50
|
+
"""
|
|
51
|
+
Get the mode(s) of each element along the selected axis.
|
|
52
|
+
The mode of a set of values is the value that appears most often.
|
|
53
|
+
It can be multiple values.
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
axis : {0 or 'index', 1 or 'columns'}, default 0
|
|
57
|
+
The axis to iterate over while searching for the mode:
|
|
58
|
+
* 0 or 'index' : get mode of each column
|
|
59
|
+
* 1 or 'columns' : get mode of each row.
|
|
60
|
+
numeric_only : bool, default False
|
|
61
|
+
If True, only apply to numeric columns.
|
|
62
|
+
dropna : bool, default True
|
|
63
|
+
Don't consider counts of NaN/NaT.
|
|
64
|
+
Returns
|
|
65
|
+
-------
|
|
66
|
+
DataFrame
|
|
67
|
+
The modes of each column or row.
|
|
68
|
+
See Also
|
|
69
|
+
--------
|
|
70
|
+
Series.mode : Return the highest frequency value in a Series.
|
|
71
|
+
Series.value_counts : Return the counts of values in a Series.
|
|
72
|
+
Examples
|
|
73
|
+
--------
|
|
74
|
+
>>> import maxframe.tensor as mt
|
|
75
|
+
>>> import maxframe.dataframe as md
|
|
76
|
+
>>> df = md.DataFrame([('bird', 2, 2),
|
|
77
|
+
... ('mammal', 4, mt.nan),
|
|
78
|
+
... ('arthropod', 8, 0),
|
|
79
|
+
... ('bird', 2, mt.nan)],
|
|
80
|
+
... index=('falcon', 'horse', 'spider', 'ostrich'),
|
|
81
|
+
... columns=('species', 'legs', 'wings'))
|
|
82
|
+
>>> df.execute()
|
|
83
|
+
species legs wings
|
|
84
|
+
falcon bird 2 2.0
|
|
85
|
+
horse mammal 4 NaN
|
|
86
|
+
spider arthropod 8 0.0
|
|
87
|
+
ostrich bird 2 NaN
|
|
88
|
+
By default, missing values are not considered, and the mode of wings
|
|
89
|
+
are both 0 and 2. Because the resulting DataFrame has two rows,
|
|
90
|
+
the second row of ``species`` and ``legs`` contains ``NaN``.
|
|
91
|
+
>>> df.mode().execute()
|
|
92
|
+
species legs wings
|
|
93
|
+
0 bird 2.0 0.0
|
|
94
|
+
1 NaN NaN 2.0
|
|
95
|
+
Setting ``dropna=False`` ``NaN`` values are considered and they can be
|
|
96
|
+
the mode (like for wings).
|
|
97
|
+
>>> df.mode(dropna=False).execute()
|
|
98
|
+
species legs wings
|
|
99
|
+
0 bird 2 NaN
|
|
100
|
+
Setting ``numeric_only=True``, only the mode of numeric columns is
|
|
101
|
+
computed, and columns of other types are ignored.
|
|
102
|
+
>>> df.mode(numeric_only=True).execute()
|
|
103
|
+
legs wings
|
|
104
|
+
0 2.0 0.0
|
|
105
|
+
1 NaN 2.0
|
|
106
|
+
To compute the mode over columns and not rows, use the axis parameter:
|
|
107
|
+
>>> df.mode(axis='columns', numeric_only=True).execute()
|
|
108
|
+
0 1
|
|
109
|
+
falcon 2.0 NaN
|
|
110
|
+
horse 4.0 NaN
|
|
111
|
+
spider 0.0 8.0
|
|
112
|
+
ostrich 2.0 NaN
|
|
113
|
+
"""
|
|
114
|
+
op = DataFrameMode(
|
|
115
|
+
axis=validate_axis(axis),
|
|
116
|
+
numeric_only=numeric_only,
|
|
117
|
+
dropna=dropna,
|
|
118
|
+
combine_size=combine_size,
|
|
119
|
+
output_types=[OutputType.dataframe],
|
|
120
|
+
)
|
|
121
|
+
return op(df)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def mode_series(series, dropna=True, combine_size=None):
|
|
125
|
+
"""
|
|
126
|
+
Return the mode(s) of the Series.
|
|
127
|
+
The mode is the value that appears most often. There can be multiple modes.
|
|
128
|
+
Always returns Series even if only one value is returned.
|
|
129
|
+
Parameters
|
|
130
|
+
----------
|
|
131
|
+
dropna : bool, default True
|
|
132
|
+
Don't consider counts of NaN/NaT.
|
|
133
|
+
Returns
|
|
134
|
+
-------
|
|
135
|
+
Series
|
|
136
|
+
Modes of the Series in sorted order.
|
|
137
|
+
"""
|
|
138
|
+
op = DataFrameMode(
|
|
139
|
+
axis=0,
|
|
140
|
+
dropna=dropna,
|
|
141
|
+
combine_size=combine_size,
|
|
142
|
+
output_types=[OutputType.series],
|
|
143
|
+
)
|
|
144
|
+
return op(series)
|
|
@@ -20,8 +20,9 @@ except ImportError: # pragma: no cover
|
|
|
20
20
|
from ... import opcodes
|
|
21
21
|
from ...config import options
|
|
22
22
|
from ...core import OutputType
|
|
23
|
-
from ...serialization.serializables import BoolField
|
|
23
|
+
from ...serialization.serializables import BoolField, StringField
|
|
24
24
|
from ...utils import lazy_import
|
|
25
|
+
from ..utils import validate_dtype_backend
|
|
25
26
|
from .core import DataFrameReduction, DataFrameReductionMixin, ReductionCallable
|
|
26
27
|
|
|
27
28
|
cudf = lazy_import("cudf")
|
|
@@ -32,7 +33,13 @@ class DataFrameNunique(DataFrameReduction, DataFrameReductionMixin):
|
|
|
32
33
|
_func_name = "nunique"
|
|
33
34
|
|
|
34
35
|
dropna = BoolField("dropna", default=None)
|
|
35
|
-
|
|
36
|
+
dtype_backend = StringField(
|
|
37
|
+
"dtype_backend", on_deserialize=validate_dtype_backend, default=None
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def __init__(self, dtype_backend=None, **kw):
|
|
41
|
+
dtype_backend = validate_dtype_backend(dtype_backend)
|
|
42
|
+
super().__init__(dtype_backend=dtype_backend, **kw)
|
|
36
43
|
|
|
37
44
|
@property
|
|
38
45
|
def is_atomic(self):
|
|
@@ -137,6 +144,6 @@ def nunique_series(series, dropna=True):
|
|
|
137
144
|
op = DataFrameNunique(
|
|
138
145
|
dropna=dropna,
|
|
139
146
|
output_types=[OutputType.scalar],
|
|
140
|
-
|
|
147
|
+
dtype_backend=options.dataframe.dtype_backend,
|
|
141
148
|
)
|
|
142
149
|
return op(series)
|
|
@@ -26,6 +26,7 @@ from .... import dataframe as md
|
|
|
26
26
|
from ....lib.dtypes_extension import ArrowDtype
|
|
27
27
|
from ....tensor import Tensor
|
|
28
28
|
from ....tests.utils import assert_mf_index_dtype
|
|
29
|
+
from ....udf import ODPSFunction
|
|
29
30
|
from ...core import DataFrame, IndexValue, OutputType, Series
|
|
30
31
|
from ...datasource.dataframe import from_pandas as from_pandas_df
|
|
31
32
|
from ...datasource.series import from_pandas as from_pandas_series
|
|
@@ -527,3 +528,14 @@ def test_custom_aggregation():
|
|
|
527
528
|
assert result.agg_funcs[0].agg_func_name == "custom_reduction"
|
|
528
529
|
assert isinstance(result.agg_funcs[0].custom_reduction, MockReduction2)
|
|
529
530
|
assert result.agg_funcs[0].output_limit == 2
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def test_aggregation_with_odps_function():
|
|
534
|
+
odps_func = ODPSFunction("test_odps_udaf", dtype=np.float64)
|
|
535
|
+
for ndim in [1, 2]:
|
|
536
|
+
compiler = ReductionCompiler()
|
|
537
|
+
compiler.add_function(odps_func, ndim=ndim)
|
|
538
|
+
result = compiler.compile()
|
|
539
|
+
assert result.agg_funcs[0].map_func_name == "custom_reduction"
|
|
540
|
+
assert result.agg_funcs[0].agg_func_name == "custom_reduction"
|
|
541
|
+
assert isinstance(result.agg_funcs[0].custom_reduction, ODPSFunction)
|
|
@@ -12,21 +12,24 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
from .rank import DataFrameRank
|
|
15
16
|
from .sort_index import DataFrameSortIndex
|
|
16
17
|
from .sort_values import DataFrameSortValues
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
def _install():
|
|
20
|
-
from ..core import DATAFRAME_TYPE, SERIES_TYPE
|
|
21
|
-
from .argsort import series_argsort
|
|
21
|
+
from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
|
|
22
|
+
from .argsort import index_argsort, series_argsort
|
|
22
23
|
from .nlargest import df_nlargest, series_nlargest
|
|
23
24
|
from .nsmallest import df_nsmallest, series_nsmallest
|
|
25
|
+
from .rank import rank
|
|
24
26
|
from .sort_index import sort_index
|
|
25
27
|
from .sort_values import dataframe_sort_values, series_sort_values
|
|
26
28
|
|
|
27
29
|
for cls in DATAFRAME_TYPE:
|
|
28
30
|
setattr(cls, "nlargest", df_nlargest)
|
|
29
31
|
setattr(cls, "nsmallest", df_nsmallest)
|
|
32
|
+
setattr(cls, "rank", rank)
|
|
30
33
|
setattr(cls, "sort_values", dataframe_sort_values)
|
|
31
34
|
setattr(cls, "sort_index", sort_index)
|
|
32
35
|
|
|
@@ -34,9 +37,13 @@ def _install():
|
|
|
34
37
|
setattr(cls, "argsort", series_argsort)
|
|
35
38
|
setattr(cls, "nlargest", series_nlargest)
|
|
36
39
|
setattr(cls, "nsmallest", series_nsmallest)
|
|
40
|
+
setattr(cls, "rank", rank)
|
|
37
41
|
setattr(cls, "sort_values", series_sort_values)
|
|
38
42
|
setattr(cls, "sort_index", sort_index)
|
|
39
43
|
|
|
44
|
+
for cls in INDEX_TYPE:
|
|
45
|
+
setattr(cls, "argsort", index_argsort)
|
|
46
|
+
|
|
40
47
|
|
|
41
48
|
_install()
|
|
42
49
|
del _install
|
|
@@ -40,7 +40,7 @@ def series_argsort(series, axis=0, kind="quicksort", order=None, stable=None):
|
|
|
40
40
|
|
|
41
41
|
See Also
|
|
42
42
|
--------
|
|
43
|
-
|
|
43
|
+
maxframe.tensor.argsort : Returns the indices that would sort this array.
|
|
44
44
|
|
|
45
45
|
Examples
|
|
46
46
|
--------
|
|
@@ -60,3 +60,9 @@ def series_argsort(series, axis=0, kind="quicksort", order=None, stable=None):
|
|
|
60
60
|
axis = 0
|
|
61
61
|
t = mt.argsort(series.to_tensor(), axis=axis, kind=kind)
|
|
62
62
|
return series_from_tensor(t, index=series.index)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def index_argsort(index, *args, **kwargs):
|
|
66
|
+
from ... import tensor as mt
|
|
67
|
+
|
|
68
|
+
return mt.argsort(index.to_tensor(), *args, **kwargs)
|
maxframe/dataframe/sort/core.py
CHANGED
|
@@ -32,6 +32,6 @@ class DataFrameSortOperator(DataFrameOperator):
|
|
|
32
32
|
na_position = StringField("na_position")
|
|
33
33
|
ignore_index = BoolField("ignore_index")
|
|
34
34
|
parallel_kind = StringField("parallel_kind")
|
|
35
|
-
psrs_kinds = ListField("psrs_kinds", FieldTypes.string)
|
|
35
|
+
psrs_kinds = ListField("psrs_kinds", FieldTypes.string, default=None)
|
|
36
36
|
nrows = Int64Field("nrows", default=None)
|
|
37
37
|
keep_kind = StringField("keep_kind", default="head")
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
from ...serialization.serializables import BoolField, StringField
|
|
19
|
+
from ..operators import DataFrameOperatorMixin
|
|
20
|
+
from .core import DataFrameSortOperator
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DataFrameRank(DataFrameSortOperator, DataFrameOperatorMixin):
|
|
24
|
+
method = StringField("method", default=None)
|
|
25
|
+
numeric_only = BoolField("numeric_only", default=None)
|
|
26
|
+
pct = BoolField("pct", default=False)
|
|
27
|
+
|
|
28
|
+
@property
|
|
29
|
+
def na_option(self):
|
|
30
|
+
return self.na_position
|
|
31
|
+
|
|
32
|
+
def __call__(self, df_obj):
|
|
33
|
+
params = df_obj.params
|
|
34
|
+
if df_obj.ndim == 2: # dataframe
|
|
35
|
+
if self.numeric_only:
|
|
36
|
+
sel_df = df_obj.select_dtypes(include=[np.number])
|
|
37
|
+
cols = sel_df.dtypes.index
|
|
38
|
+
else:
|
|
39
|
+
cols = df_obj.dtypes.index
|
|
40
|
+
params["dtypes"] = pd.Series([np.dtype(float)] * len(cols), index=cols)
|
|
41
|
+
return self.new_dataframe([df_obj], **params)
|
|
42
|
+
else:
|
|
43
|
+
params["dtypes"] = np.dtype(float)
|
|
44
|
+
return self.new_series([df_obj], **params)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def rank(
|
|
48
|
+
df,
|
|
49
|
+
axis=0,
|
|
50
|
+
method="average",
|
|
51
|
+
numeric_only=False,
|
|
52
|
+
na_option="keep",
|
|
53
|
+
ascending=True,
|
|
54
|
+
pct=False,
|
|
55
|
+
):
|
|
56
|
+
"""
|
|
57
|
+
Compute numerical data ranks (1 through n) along axis.
|
|
58
|
+
|
|
59
|
+
By default, equal values are assigned a rank that is the average of the
|
|
60
|
+
ranks of those values.
|
|
61
|
+
|
|
62
|
+
Parameters
|
|
63
|
+
----------
|
|
64
|
+
axis : {0 or 'index', 1 or 'columns'}, default 0
|
|
65
|
+
Index to direct ranking.
|
|
66
|
+
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
|
|
67
|
+
How to rank the group of records that have the same value (i.e. ties):
|
|
68
|
+
|
|
69
|
+
* average: average rank of the group
|
|
70
|
+
* min: lowest rank in the group
|
|
71
|
+
* max: highest rank in the group
|
|
72
|
+
* first: ranks assigned in order they appear in the array
|
|
73
|
+
* dense: like 'min', but rank always increases by 1 between groups.
|
|
74
|
+
|
|
75
|
+
numeric_only : bool, optional
|
|
76
|
+
For DataFrame objects, rank only numeric columns if set to True.
|
|
77
|
+
na_option : {'keep', 'top', 'bottom'}, default 'keep'
|
|
78
|
+
How to rank NaN values:
|
|
79
|
+
|
|
80
|
+
* keep: assign NaN rank to NaN values
|
|
81
|
+
* top: assign lowest rank to NaN values
|
|
82
|
+
* bottom: assign highest rank to NaN values
|
|
83
|
+
|
|
84
|
+
ascending : bool, default True
|
|
85
|
+
Whether or not the elements should be ranked in ascending order.
|
|
86
|
+
pct : bool, default False
|
|
87
|
+
Whether or not to display the returned rankings in percentile
|
|
88
|
+
form.
|
|
89
|
+
|
|
90
|
+
Returns
|
|
91
|
+
-------
|
|
92
|
+
same type as caller
|
|
93
|
+
Return a Series or DataFrame with data ranks as values.
|
|
94
|
+
|
|
95
|
+
See Also
|
|
96
|
+
--------
|
|
97
|
+
core.groupby.GroupBy.rank : Rank of values within each group.
|
|
98
|
+
|
|
99
|
+
Examples
|
|
100
|
+
--------
|
|
101
|
+
>>> import maxframe.tensor as mt
|
|
102
|
+
>>> import maxframe.dataframe as md
|
|
103
|
+
>>> df = md.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
|
|
104
|
+
... 'spider', 'snake'],
|
|
105
|
+
... 'Number_legs': [4, 2, 4, 8, mt.nan]})
|
|
106
|
+
>>> df.execute()
|
|
107
|
+
Animal Number_legs
|
|
108
|
+
0 cat 4.0
|
|
109
|
+
1 penguin 2.0
|
|
110
|
+
2 dog 4.0
|
|
111
|
+
3 spider 8.0
|
|
112
|
+
4 snake NaN
|
|
113
|
+
|
|
114
|
+
The following example shows how the method behaves with the above
|
|
115
|
+
parameters:
|
|
116
|
+
|
|
117
|
+
* default_rank: this is the default behaviour obtained without using
|
|
118
|
+
any parameter.
|
|
119
|
+
* max_rank: setting ``method = 'max'`` the records that have the
|
|
120
|
+
same values are ranked using the highest rank (e.g.: since 'cat'
|
|
121
|
+
and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.)
|
|
122
|
+
* NA_bottom: choosing ``na_option = 'bottom'``, if there are records
|
|
123
|
+
with NaN values they are placed at the bottom of the ranking.
|
|
124
|
+
* pct_rank: when setting ``pct = True``, the ranking is expressed as
|
|
125
|
+
percentile rank.
|
|
126
|
+
|
|
127
|
+
>>> df['default_rank'] = df['Number_legs'].rank()
|
|
128
|
+
>>> df['max_rank'] = df['Number_legs'].rank(method='max')
|
|
129
|
+
>>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
|
|
130
|
+
>>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
|
|
131
|
+
>>> df.execute()
|
|
132
|
+
Animal Number_legs default_rank max_rank NA_bottom pct_rank
|
|
133
|
+
0 cat 4.0 2.5 3.0 2.5 0.625
|
|
134
|
+
1 penguin 2.0 1.0 1.0 1.0 0.250
|
|
135
|
+
2 dog 4.0 2.5 3.0 2.5 0.625
|
|
136
|
+
3 spider 8.0 4.0 4.0 4.0 1.000
|
|
137
|
+
4 snake NaN NaN NaN 5.0 NaN
|
|
138
|
+
"""
|
|
139
|
+
op = DataFrameRank(
|
|
140
|
+
axis=axis,
|
|
141
|
+
method=method,
|
|
142
|
+
numeric_only=numeric_only,
|
|
143
|
+
na_position=na_option,
|
|
144
|
+
ascending=ascending,
|
|
145
|
+
pct=pct,
|
|
146
|
+
)
|
|
147
|
+
return op(df)
|
|
@@ -11,3 +11,22 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _install():
|
|
17
|
+
from ..core import DATAFRAME_TYPE, SERIES_TYPE
|
|
18
|
+
from .at_time import at_time
|
|
19
|
+
from .between_time import between_time
|
|
20
|
+
from .to_datetime import to_datetime # noqa
|
|
21
|
+
|
|
22
|
+
for t in SERIES_TYPE:
|
|
23
|
+
t.at_time = at_time
|
|
24
|
+
t.between_time = between_time
|
|
25
|
+
|
|
26
|
+
for t in DATAFRAME_TYPE:
|
|
27
|
+
t.at_time = at_time
|
|
28
|
+
t.between_time = between_time
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
_install()
|
|
32
|
+
del _install
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def at_time(df_or_series, time, axis=0):
|
|
17
|
+
"""
|
|
18
|
+
Select values at particular time of day (e.g., 9:30AM).
|
|
19
|
+
|
|
20
|
+
Parameters
|
|
21
|
+
----------
|
|
22
|
+
time : datetime.time or str
|
|
23
|
+
The values to select.
|
|
24
|
+
axis : {0 or 'index', 1 or 'columns'}, default 0
|
|
25
|
+
For `Series` this parameter is unused and defaults to 0.
|
|
26
|
+
|
|
27
|
+
Returns
|
|
28
|
+
-------
|
|
29
|
+
Series or DataFrame
|
|
30
|
+
|
|
31
|
+
Raises
|
|
32
|
+
------
|
|
33
|
+
TypeError
|
|
34
|
+
If the index is not a :class:`DatetimeIndex`
|
|
35
|
+
|
|
36
|
+
See Also
|
|
37
|
+
--------
|
|
38
|
+
between_time : Select values between particular times of the day.
|
|
39
|
+
first : Select initial periods of time series based on a date offset.
|
|
40
|
+
last : Select final periods of time series based on a date offset.
|
|
41
|
+
DatetimeIndex.indexer_at_time : Get just the index locations for
|
|
42
|
+
values at particular time of the day.
|
|
43
|
+
|
|
44
|
+
Examples
|
|
45
|
+
--------
|
|
46
|
+
>>> import maxframe.dataframe as md
|
|
47
|
+
>>> i = md.date_range('2018-04-09', periods=4, freq='12h')
|
|
48
|
+
>>> ts = md.DataFrame({'A': [1, 2, 3, 4]}, index=i)
|
|
49
|
+
>>> ts.execute()
|
|
50
|
+
A
|
|
51
|
+
2018-04-09 00:00:00 1
|
|
52
|
+
2018-04-09 12:00:00 2
|
|
53
|
+
2018-04-10 00:00:00 3
|
|
54
|
+
2018-04-10 12:00:00 4
|
|
55
|
+
|
|
56
|
+
>>> ts.at_time('12:00').execute()
|
|
57
|
+
A
|
|
58
|
+
2018-04-09 12:00:00 2
|
|
59
|
+
2018-04-10 12:00:00 4
|
|
60
|
+
"""
|
|
61
|
+
return df_or_series.between_time(time, time, inclusive="both", axis=axis)
|