maxframe 2.2.0__cp312-cp312-win_amd64.whl → 2.3.0rc1__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (114)
  1. maxframe/_utils.cp312-win_amd64.pyd +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/core.cp312-win_amd64.pyd +0 -0
  16. maxframe/core/graph/entity.py +7 -1
  17. maxframe/core/mode.py +6 -1
  18. maxframe/dataframe/__init__.py +2 -2
  19. maxframe/dataframe/arithmetic/__init__.py +4 -0
  20. maxframe/dataframe/arithmetic/maximum.py +33 -0
  21. maxframe/dataframe/arithmetic/minimum.py +33 -0
  22. maxframe/dataframe/core.py +98 -106
  23. maxframe/dataframe/datasource/core.py +6 -0
  24. maxframe/dataframe/datasource/direct.py +57 -0
  25. maxframe/dataframe/datasource/read_csv.py +19 -11
  26. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  27. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  28. maxframe/dataframe/datasource/read_parquet.py +38 -39
  29. maxframe/dataframe/datastore/__init__.py +6 -0
  30. maxframe/dataframe/datastore/direct.py +268 -0
  31. maxframe/dataframe/datastore/to_odps.py +6 -0
  32. maxframe/dataframe/extensions/flatjson.py +2 -1
  33. maxframe/dataframe/groupby/__init__.py +5 -1
  34. maxframe/dataframe/groupby/aggregation.py +10 -6
  35. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  36. maxframe/dataframe/groupby/core.py +20 -4
  37. maxframe/dataframe/indexing/__init__.py +2 -1
  38. maxframe/dataframe/indexing/insert.py +45 -17
  39. maxframe/dataframe/merge/__init__.py +3 -0
  40. maxframe/dataframe/merge/combine.py +244 -0
  41. maxframe/dataframe/misc/__init__.py +14 -3
  42. maxframe/dataframe/misc/check_unique.py +41 -10
  43. maxframe/dataframe/misc/drop.py +31 -0
  44. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  45. maxframe/dataframe/misc/map.py +31 -18
  46. maxframe/dataframe/misc/repeat.py +159 -0
  47. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  48. maxframe/dataframe/missing/checkna.py +3 -2
  49. maxframe/dataframe/reduction/__init__.py +10 -5
  50. maxframe/dataframe/reduction/aggregation.py +6 -6
  51. maxframe/dataframe/reduction/argmax.py +7 -4
  52. maxframe/dataframe/reduction/argmin.py +7 -4
  53. maxframe/dataframe/reduction/core.py +18 -9
  54. maxframe/dataframe/reduction/mode.py +144 -0
  55. maxframe/dataframe/reduction/nunique.py +10 -3
  56. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  57. maxframe/dataframe/sort/__init__.py +9 -2
  58. maxframe/dataframe/sort/argsort.py +7 -1
  59. maxframe/dataframe/sort/core.py +1 -1
  60. maxframe/dataframe/sort/rank.py +147 -0
  61. maxframe/dataframe/tseries/__init__.py +19 -0
  62. maxframe/dataframe/tseries/at_time.py +61 -0
  63. maxframe/dataframe/tseries/between_time.py +122 -0
  64. maxframe/dataframe/utils.py +30 -26
  65. maxframe/learn/contrib/llm/core.py +16 -7
  66. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  67. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  68. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  69. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  70. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  71. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  73. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  74. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  75. maxframe/learn/contrib/llm/models/managed.py +76 -11
  76. maxframe/learn/contrib/llm/models/openai.py +72 -0
  77. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  78. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  79. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  80. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  81. maxframe/learn/contrib/llm/text.py +348 -42
  82. maxframe/learn/contrib/models.py +4 -1
  83. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  84. maxframe/learn/contrib/xgboost/core.py +31 -7
  85. maxframe/learn/contrib/xgboost/predict.py +4 -2
  86. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  87. maxframe/learn/contrib/xgboost/train.py +2 -0
  88. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  89. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  90. maxframe/learn/utils/__init__.py +1 -0
  91. maxframe/learn/utils/extmath.py +42 -9
  92. maxframe/learn/utils/odpsio.py +80 -11
  93. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  94. maxframe/lib/mmh3.cp312-win_amd64.pyd +0 -0
  95. maxframe/opcodes.py +9 -1
  96. maxframe/remote/core.py +4 -0
  97. maxframe/serialization/core.cp312-win_amd64.pyd +0 -0
  98. maxframe/serialization/tests/test_serial.py +2 -2
  99. maxframe/tensor/arithmetic/__init__.py +1 -1
  100. maxframe/tensor/arithmetic/core.py +2 -2
  101. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  102. maxframe/tensor/core.py +3 -0
  103. maxframe/tensor/misc/copyto.py +1 -1
  104. maxframe/tests/test_udf.py +61 -0
  105. maxframe/tests/test_utils.py +8 -5
  106. maxframe/udf.py +103 -7
  107. maxframe/utils.py +61 -8
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  109. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
  110. maxframe_client/session/task.py +8 -1
  111. maxframe_client/tests/test_session.py +24 -0
  112. maxframe/dataframe/arrays.py +0 -864
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  114. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
@@ -40,7 +40,7 @@ class DataFrameArgMax(DataFrameReduction, DataFrameReductionMixin):
40
40
  return ReductionCallable(func_name=func_name, kwargs=kw)
41
41
 
42
42
 
43
- def argmax_series(series, axis=0, skipna=True):
43
+ def argmax_series_index(series_or_index, axis=0, skipna=True, *args, **kwargs):
44
44
  """
45
45
  Return int position of the smallest value in the Series.
46
46
 
@@ -65,7 +65,7 @@ def argmax_series(series, axis=0, skipna=True):
65
65
  --------
66
66
  Series.argmin : Return position of the minimum value.
67
67
  Series.argmax : Return position of the maximum value.
68
- numpy.ndarray.argmax : Equivalent method for numpy arrays.
68
+ maxframe.tensor.argmax : Equivalent method for tensors.
69
69
  Series.idxmax : Return index label of the maximum values.
70
70
  Series.idxmin : Return index label of the minimum values.
71
71
 
@@ -92,9 +92,12 @@ def argmax_series(series, axis=0, skipna=True):
92
92
  the minimum cereal calories is the first element,
93
93
  since series is zero-indexed.
94
94
  """
95
- validate_axis(axis, series)
95
+ # args not implemented, just ignore
96
+ _ = args, kwargs
97
+
98
+ validate_axis(axis, series_or_index)
96
99
  op = DataFrameArgMax(
97
100
  dropna=skipna,
98
101
  output_types=[OutputType.scalar],
99
102
  )
100
- return op(series)
103
+ return op(series_or_index)
@@ -40,7 +40,7 @@ class DataFrameArgMin(DataFrameReduction, DataFrameReductionMixin):
40
40
  return ReductionCallable(func_name=func_name, kwargs=kw)
41
41
 
42
42
 
43
- def argmin_series(series, axis=0, skipna=True):
43
+ def argmin_series_index(series_or_index, axis=0, skipna=True, *args, **kwargs):
44
44
  """
45
45
  Return int position of the smallest value in the Series.
46
46
 
@@ -65,7 +65,7 @@ def argmin_series(series, axis=0, skipna=True):
65
65
  --------
66
66
  Series.argmin : Return position of the minimum value.
67
67
  Series.argmax : Return position of the maximum value.
68
- numpy.ndarray.argmin : Equivalent method for numpy arrays.
68
+ maxframe.tensor.argmin : Equivalent method for tensors.
69
69
  Series.idxmax : Return index label of the maximum values.
70
70
  Series.idxmin : Return index label of the minimum values.
71
71
 
@@ -92,9 +92,12 @@ def argmin_series(series, axis=0, skipna=True):
92
92
  the minimum cereal calories is the first element,
93
93
  since series is zero-indexed.
94
94
  """
95
- validate_axis(axis, series)
95
+ # args not implemented, just ignore
96
+ _ = args, kwargs
97
+
98
+ validate_axis(axis, series_or_index)
96
99
  op = DataFrameArgMin(
97
100
  dropna=skipna,
98
101
  output_types=[OutputType.scalar],
99
102
  )
100
- return op(series)
103
+ return op(series_or_index)
@@ -14,7 +14,7 @@
14
14
 
15
15
  import functools
16
16
  import inspect
17
- from collections import OrderedDict
17
+ from collections import OrderedDict, namedtuple
18
18
  from typing import Any, Dict, List, NamedTuple, Optional, Tuple
19
19
 
20
20
  import msgpack
@@ -32,7 +32,7 @@ from ...serialization.serializables import (
32
32
  StringField,
33
33
  )
34
34
  from ...typing_ import TileableType
35
- from ...utils import get_item_if_scalar, pd_release_version, tokenize
35
+ from ...utils import get_item_if_scalar, get_pd_option, pd_release_version, tokenize
36
36
  from ..operators import DATAFRAME_TYPE, DataFrameOperator, DataFrameOperatorMixin
37
37
  from ..utils import (
38
38
  build_df,
@@ -52,6 +52,8 @@ _reduce_bool_as_object = pd_release_version[:2] != (1, 2)
52
52
 
53
53
  _idx_reduction_without_numeric_only = pd_release_version[:2] < (1, 5)
54
54
 
55
+ NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
56
+
55
57
 
56
58
  class DataFrameReduction(DataFrameOperator):
57
59
  _legacy_name = "DataFrameReductionOperator" # since v2.2.0
@@ -70,7 +72,7 @@ class DataFrameReduction(DataFrameOperator):
70
72
 
71
73
  def __init__(self, gpu=None, sparse=None, output_types=None, **kw):
72
74
  kw["use_inf_as_na"] = kw.pop(
73
- "use_inf_as_na", pd.get_option("mode.use_inf_as_na")
75
+ "use_inf_as_na", get_pd_option("mode.use_inf_as_na", False)
74
76
  )
75
77
  super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
76
78
 
@@ -104,7 +106,7 @@ class DataFrameCumReduction(DataFrameOperator):
104
106
 
105
107
  def __init__(self, gpu=None, sparse=None, output_types=None, **kw):
106
108
  kw["use_inf_as_na"] = kw.pop(
107
- "use_inf_as_na", pd.get_option("mode.use_inf_as_na")
109
+ "use_inf_as_na", get_pd_option("mode.use_inf_as_na", False)
108
110
  )
109
111
  super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
110
112
 
@@ -300,10 +302,13 @@ class DataFrameReductionMixin(DataFrameOperatorMixin):
300
302
 
301
303
  if func_name == "custom_reduction":
302
304
  empty_series = build_series(series, ensure_string=True)
303
- result_scalar = getattr(self, "custom_reduction").__call_agg__(empty_series)
304
- if hasattr(result_scalar, "to_pandas"): # pragma: no cover
305
- result_scalar = result_scalar.to_pandas()
306
- result_dtype = pd.Series(result_scalar).dtype
305
+ custom_reduction_obj = getattr(self, "custom_reduction")
306
+ result_dtype = getattr(custom_reduction_obj, "result_dtype", None)
307
+ if result_dtype is None:
308
+ result_scalar = custom_reduction_obj.__call_agg__(empty_series)
309
+ if hasattr(result_scalar, "to_pandas"): # pragma: no cover
310
+ result_scalar = result_scalar.to_pandas()
311
+ result_dtype = pd.Series(result_scalar).dtype
307
312
  else:
308
313
  result_dtype = _get_series_reduction_dtype(
309
314
  series.dtype,
@@ -378,6 +383,10 @@ class CustomReduction:
378
383
  def __name__(self):
379
384
  return self.name
380
385
 
386
+ @property
387
+ def result_dtype(self):
388
+ return None
389
+
381
390
  def __call__(self, value):
382
391
  if isinstance(value, ENTITY_TYPE):
383
392
  from .custom_reduction import build_custom_reduction_result
@@ -512,7 +521,7 @@ class ReductionCompiler:
512
521
  def _check_function_valid(cls, func):
513
522
  if isinstance(func, functools.partial):
514
523
  return cls._check_function_valid(func.func)
515
- elif isinstance(func, (CustomReduction, ReductionCallable)):
524
+ elif not hasattr(func, "__code__"):
516
525
  return
517
526
 
518
527
  func_code = func.__code__
@@ -0,0 +1,144 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+
17
+ from ... import opcodes
18
+ from ...core import OutputType, get_output_types
19
+ from ...serialization.serializables import BoolField, Int32Field
20
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
21
+ from ..utils import parse_index, validate_axis
22
+
23
+
24
+ class DataFrameMode(DataFrameOperator, DataFrameOperatorMixin):
25
+ _op_type_ = opcodes.MODE
26
+
27
+ axis = Int32Field("axis", default=None)
28
+ numeric_only = BoolField("numeric_only", default=False)
29
+ dropna = BoolField("dropna", default=True)
30
+ combine_size = Int32Field("combine_size", default=None)
31
+
32
+ def __call__(self, in_obj):
33
+ self._output_types = get_output_types(in_obj)
34
+ params = in_obj.params
35
+ shape = list(in_obj.shape)
36
+ shape[self.axis] = np.nan
37
+ params["shape"] = tuple(shape)
38
+
39
+ if self.axis == 0:
40
+ pd_idx = in_obj.index_value.to_pandas()[:0]
41
+ params["index_value"] = parse_index(pd_idx)
42
+ else:
43
+ pd_idx = in_obj.columns_value.to_pandas()[:0]
44
+ params["columns_value"] = parse_index(pd_idx)
45
+ params["dtypes"] = None
46
+ return self.new_tileable([in_obj], **params)
47
+
48
+
49
+ def mode_dataframe(df, axis=0, numeric_only=False, dropna=True, combine_size=None):
50
+ """
51
+ Get the mode(s) of each element along the selected axis.
52
+ The mode of a set of values is the value that appears most often.
53
+ It can be multiple values.
54
+ Parameters
55
+ ----------
56
+ axis : {0 or 'index', 1 or 'columns'}, default 0
57
+ The axis to iterate over while searching for the mode:
58
+ * 0 or 'index' : get mode of each column
59
+ * 1 or 'columns' : get mode of each row.
60
+ numeric_only : bool, default False
61
+ If True, only apply to numeric columns.
62
+ dropna : bool, default True
63
+ Don't consider counts of NaN/NaT.
64
+ Returns
65
+ -------
66
+ DataFrame
67
+ The modes of each column or row.
68
+ See Also
69
+ --------
70
+ Series.mode : Return the highest frequency value in a Series.
71
+ Series.value_counts : Return the counts of values in a Series.
72
+ Examples
73
+ --------
74
+ >>> import maxframe.tensor as mt
75
+ >>> import maxframe.dataframe as md
76
+ >>> df = md.DataFrame([('bird', 2, 2),
77
+ ... ('mammal', 4, mt.nan),
78
+ ... ('arthropod', 8, 0),
79
+ ... ('bird', 2, mt.nan)],
80
+ ... index=('falcon', 'horse', 'spider', 'ostrich'),
81
+ ... columns=('species', 'legs', 'wings'))
82
+ >>> df.execute()
83
+ species legs wings
84
+ falcon bird 2 2.0
85
+ horse mammal 4 NaN
86
+ spider arthropod 8 0.0
87
+ ostrich bird 2 NaN
88
+ By default, missing values are not considered, and the mode of wings
89
+ are both 0 and 2. Because the resulting DataFrame has two rows,
90
+ the second row of ``species`` and ``legs`` contains ``NaN``.
91
+ >>> df.mode().execute()
92
+ species legs wings
93
+ 0 bird 2.0 0.0
94
+ 1 NaN NaN 2.0
95
+ Setting ``dropna=False`` ``NaN`` values are considered and they can be
96
+ the mode (like for wings).
97
+ >>> df.mode(dropna=False).execute()
98
+ species legs wings
99
+ 0 bird 2 NaN
100
+ Setting ``numeric_only=True``, only the mode of numeric columns is
101
+ computed, and columns of other types are ignored.
102
+ >>> df.mode(numeric_only=True).execute()
103
+ legs wings
104
+ 0 2.0 0.0
105
+ 1 NaN 2.0
106
+ To compute the mode over columns and not rows, use the axis parameter:
107
+ >>> df.mode(axis='columns', numeric_only=True).execute()
108
+ 0 1
109
+ falcon 2.0 NaN
110
+ horse 4.0 NaN
111
+ spider 0.0 8.0
112
+ ostrich 2.0 NaN
113
+ """
114
+ op = DataFrameMode(
115
+ axis=validate_axis(axis),
116
+ numeric_only=numeric_only,
117
+ dropna=dropna,
118
+ combine_size=combine_size,
119
+ output_types=[OutputType.dataframe],
120
+ )
121
+ return op(df)
122
+
123
+
124
+ def mode_series(series, dropna=True, combine_size=None):
125
+ """
126
+ Return the mode(s) of the Series.
127
+ The mode is the value that appears most often. There can be multiple modes.
128
+ Always returns Series even if only one value is returned.
129
+ Parameters
130
+ ----------
131
+ dropna : bool, default True
132
+ Don't consider counts of NaN/NaT.
133
+ Returns
134
+ -------
135
+ Series
136
+ Modes of the Series in sorted order.
137
+ """
138
+ op = DataFrameMode(
139
+ axis=0,
140
+ dropna=dropna,
141
+ combine_size=combine_size,
142
+ output_types=[OutputType.series],
143
+ )
144
+ return op(series)
@@ -20,8 +20,9 @@ except ImportError: # pragma: no cover
20
20
  from ... import opcodes
21
21
  from ...config import options
22
22
  from ...core import OutputType
23
- from ...serialization.serializables import BoolField
23
+ from ...serialization.serializables import BoolField, StringField
24
24
  from ...utils import lazy_import
25
+ from ..utils import validate_dtype_backend
25
26
  from .core import DataFrameReduction, DataFrameReductionMixin, ReductionCallable
26
27
 
27
28
  cudf = lazy_import("cudf")
@@ -32,7 +33,13 @@ class DataFrameNunique(DataFrameReduction, DataFrameReductionMixin):
32
33
  _func_name = "nunique"
33
34
 
34
35
  dropna = BoolField("dropna", default=None)
35
- use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
36
+ dtype_backend = StringField(
37
+ "dtype_backend", on_deserialize=validate_dtype_backend, default=None
38
+ )
39
+
40
+ def __init__(self, dtype_backend=None, **kw):
41
+ dtype_backend = validate_dtype_backend(dtype_backend)
42
+ super().__init__(dtype_backend=dtype_backend, **kw)
36
43
 
37
44
  @property
38
45
  def is_atomic(self):
@@ -137,6 +144,6 @@ def nunique_series(series, dropna=True):
137
144
  op = DataFrameNunique(
138
145
  dropna=dropna,
139
146
  output_types=[OutputType.scalar],
140
- use_arrow_dtype=options.dataframe.use_arrow_dtype,
147
+ dtype_backend=options.dataframe.dtype_backend,
141
148
  )
142
149
  return op(series)
@@ -26,6 +26,7 @@ from .... import dataframe as md
26
26
  from ....lib.dtypes_extension import ArrowDtype
27
27
  from ....tensor import Tensor
28
28
  from ....tests.utils import assert_mf_index_dtype
29
+ from ....udf import ODPSFunction
29
30
  from ...core import DataFrame, IndexValue, OutputType, Series
30
31
  from ...datasource.dataframe import from_pandas as from_pandas_df
31
32
  from ...datasource.series import from_pandas as from_pandas_series
@@ -527,3 +528,14 @@ def test_custom_aggregation():
527
528
  assert result.agg_funcs[0].agg_func_name == "custom_reduction"
528
529
  assert isinstance(result.agg_funcs[0].custom_reduction, MockReduction2)
529
530
  assert result.agg_funcs[0].output_limit == 2
531
+
532
+
533
+ def test_aggregation_with_odps_function():
534
+ odps_func = ODPSFunction("test_odps_udaf", dtype=np.float64)
535
+ for ndim in [1, 2]:
536
+ compiler = ReductionCompiler()
537
+ compiler.add_function(odps_func, ndim=ndim)
538
+ result = compiler.compile()
539
+ assert result.agg_funcs[0].map_func_name == "custom_reduction"
540
+ assert result.agg_funcs[0].agg_func_name == "custom_reduction"
541
+ assert isinstance(result.agg_funcs[0].custom_reduction, ODPSFunction)
@@ -12,21 +12,24 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from .rank import DataFrameRank
15
16
  from .sort_index import DataFrameSortIndex
16
17
  from .sort_values import DataFrameSortValues
17
18
 
18
19
 
19
20
  def _install():
20
- from ..core import DATAFRAME_TYPE, SERIES_TYPE
21
- from .argsort import series_argsort
21
+ from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
22
+ from .argsort import index_argsort, series_argsort
22
23
  from .nlargest import df_nlargest, series_nlargest
23
24
  from .nsmallest import df_nsmallest, series_nsmallest
25
+ from .rank import rank
24
26
  from .sort_index import sort_index
25
27
  from .sort_values import dataframe_sort_values, series_sort_values
26
28
 
27
29
  for cls in DATAFRAME_TYPE:
28
30
  setattr(cls, "nlargest", df_nlargest)
29
31
  setattr(cls, "nsmallest", df_nsmallest)
32
+ setattr(cls, "rank", rank)
30
33
  setattr(cls, "sort_values", dataframe_sort_values)
31
34
  setattr(cls, "sort_index", sort_index)
32
35
 
@@ -34,9 +37,13 @@ def _install():
34
37
  setattr(cls, "argsort", series_argsort)
35
38
  setattr(cls, "nlargest", series_nlargest)
36
39
  setattr(cls, "nsmallest", series_nsmallest)
40
+ setattr(cls, "rank", rank)
37
41
  setattr(cls, "sort_values", series_sort_values)
38
42
  setattr(cls, "sort_index", sort_index)
39
43
 
44
+ for cls in INDEX_TYPE:
45
+ setattr(cls, "argsort", index_argsort)
46
+
40
47
 
41
48
  _install()
42
49
  del _install
@@ -40,7 +40,7 @@ def series_argsort(series, axis=0, kind="quicksort", order=None, stable=None):
40
40
 
41
41
  See Also
42
42
  --------
43
- numpy.ndarray.argsort : Returns the indices that would sort this array.
43
+ maxframe.tensor.argsort : Returns the indices that would sort this array.
44
44
 
45
45
  Examples
46
46
  --------
@@ -60,3 +60,9 @@ def series_argsort(series, axis=0, kind="quicksort", order=None, stable=None):
60
60
  axis = 0
61
61
  t = mt.argsort(series.to_tensor(), axis=axis, kind=kind)
62
62
  return series_from_tensor(t, index=series.index)
63
+
64
+
65
+ def index_argsort(index, *args, **kwargs):
66
+ from ... import tensor as mt
67
+
68
+ return mt.argsort(index.to_tensor(), *args, **kwargs)
@@ -32,6 +32,6 @@ class DataFrameSortOperator(DataFrameOperator):
32
32
  na_position = StringField("na_position")
33
33
  ignore_index = BoolField("ignore_index")
34
34
  parallel_kind = StringField("parallel_kind")
35
- psrs_kinds = ListField("psrs_kinds", FieldTypes.string)
35
+ psrs_kinds = ListField("psrs_kinds", FieldTypes.string, default=None)
36
36
  nrows = Int64Field("nrows", default=None)
37
37
  keep_kind = StringField("keep_kind", default="head")
@@ -0,0 +1,147 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+
18
+ from ...serialization.serializables import BoolField, StringField
19
+ from ..operators import DataFrameOperatorMixin
20
+ from .core import DataFrameSortOperator
21
+
22
+
23
+ class DataFrameRank(DataFrameSortOperator, DataFrameOperatorMixin):
24
+ method = StringField("method", default=None)
25
+ numeric_only = BoolField("numeric_only", default=None)
26
+ pct = BoolField("pct", default=False)
27
+
28
+ @property
29
+ def na_option(self):
30
+ return self.na_position
31
+
32
+ def __call__(self, df_obj):
33
+ params = df_obj.params
34
+ if df_obj.ndim == 2: # dataframe
35
+ if self.numeric_only:
36
+ sel_df = df_obj.select_dtypes(include=[np.number])
37
+ cols = sel_df.dtypes.index
38
+ else:
39
+ cols = df_obj.dtypes.index
40
+ params["dtypes"] = pd.Series([np.dtype(float)] * len(cols), index=cols)
41
+ return self.new_dataframe([df_obj], **params)
42
+ else:
43
+ params["dtypes"] = np.dtype(float)
44
+ return self.new_series([df_obj], **params)
45
+
46
+
47
+ def rank(
48
+ df,
49
+ axis=0,
50
+ method="average",
51
+ numeric_only=False,
52
+ na_option="keep",
53
+ ascending=True,
54
+ pct=False,
55
+ ):
56
+ """
57
+ Compute numerical data ranks (1 through n) along axis.
58
+
59
+ By default, equal values are assigned a rank that is the average of the
60
+ ranks of those values.
61
+
62
+ Parameters
63
+ ----------
64
+ axis : {0 or 'index', 1 or 'columns'}, default 0
65
+ Index to direct ranking.
66
+ method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
67
+ How to rank the group of records that have the same value (i.e. ties):
68
+
69
+ * average: average rank of the group
70
+ * min: lowest rank in the group
71
+ * max: highest rank in the group
72
+ * first: ranks assigned in order they appear in the array
73
+ * dense: like 'min', but rank always increases by 1 between groups.
74
+
75
+ numeric_only : bool, optional
76
+ For DataFrame objects, rank only numeric columns if set to True.
77
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
78
+ How to rank NaN values:
79
+
80
+ * keep: assign NaN rank to NaN values
81
+ * top: assign lowest rank to NaN values
82
+ * bottom: assign highest rank to NaN values
83
+
84
+ ascending : bool, default True
85
+ Whether or not the elements should be ranked in ascending order.
86
+ pct : bool, default False
87
+ Whether or not to display the returned rankings in percentile
88
+ form.
89
+
90
+ Returns
91
+ -------
92
+ same type as caller
93
+ Return a Series or DataFrame with data ranks as values.
94
+
95
+ See Also
96
+ --------
97
+ core.groupby.GroupBy.rank : Rank of values within each group.
98
+
99
+ Examples
100
+ --------
101
+ >>> import maxframe.tensor as mt
102
+ >>> import maxframe.dataframe as md
103
+ >>> df = md.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
104
+ ... 'spider', 'snake'],
105
+ ... 'Number_legs': [4, 2, 4, 8, mt.nan]})
106
+ >>> df.execute()
107
+ Animal Number_legs
108
+ 0 cat 4.0
109
+ 1 penguin 2.0
110
+ 2 dog 4.0
111
+ 3 spider 8.0
112
+ 4 snake NaN
113
+
114
+ The following example shows how the method behaves with the above
115
+ parameters:
116
+
117
+ * default_rank: this is the default behaviour obtained without using
118
+ any parameter.
119
+ * max_rank: setting ``method = 'max'`` the records that have the
120
+ same values are ranked using the highest rank (e.g.: since 'cat'
121
+ and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.)
122
+ * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
123
+ with NaN values they are placed at the bottom of the ranking.
124
+ * pct_rank: when setting ``pct = True``, the ranking is expressed as
125
+ percentile rank.
126
+
127
+ >>> df['default_rank'] = df['Number_legs'].rank()
128
+ >>> df['max_rank'] = df['Number_legs'].rank(method='max')
129
+ >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
130
+ >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
131
+ >>> df.execute()
132
+ Animal Number_legs default_rank max_rank NA_bottom pct_rank
133
+ 0 cat 4.0 2.5 3.0 2.5 0.625
134
+ 1 penguin 2.0 1.0 1.0 1.0 0.250
135
+ 2 dog 4.0 2.5 3.0 2.5 0.625
136
+ 3 spider 8.0 4.0 4.0 4.0 1.000
137
+ 4 snake NaN NaN NaN 5.0 NaN
138
+ """
139
+ op = DataFrameRank(
140
+ axis=axis,
141
+ method=method,
142
+ numeric_only=numeric_only,
143
+ na_position=na_option,
144
+ ascending=ascending,
145
+ pct=pct,
146
+ )
147
+ return op(df)
@@ -11,3 +11,22 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
15
+
16
+ def _install():
17
+ from ..core import DATAFRAME_TYPE, SERIES_TYPE
18
+ from .at_time import at_time
19
+ from .between_time import between_time
20
+ from .to_datetime import to_datetime # noqa
21
+
22
+ for t in SERIES_TYPE:
23
+ t.at_time = at_time
24
+ t.between_time = between_time
25
+
26
+ for t in DATAFRAME_TYPE:
27
+ t.at_time = at_time
28
+ t.between_time = between_time
29
+
30
+
31
+ _install()
32
+ del _install
@@ -0,0 +1,61 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ def at_time(df_or_series, time, axis=0):
17
+ """
18
+ Select values at particular time of day (e.g., 9:30AM).
19
+
20
+ Parameters
21
+ ----------
22
+ time : datetime.time or str
23
+ The values to select.
24
+ axis : {0 or 'index', 1 or 'columns'}, default 0
25
+ For `Series` this parameter is unused and defaults to 0.
26
+
27
+ Returns
28
+ -------
29
+ Series or DataFrame
30
+
31
+ Raises
32
+ ------
33
+ TypeError
34
+ If the index is not a :class:`DatetimeIndex`
35
+
36
+ See Also
37
+ --------
38
+ between_time : Select values between particular times of the day.
39
+ first : Select initial periods of time series based on a date offset.
40
+ last : Select final periods of time series based on a date offset.
41
+ DatetimeIndex.indexer_at_time : Get just the index locations for
42
+ values at particular time of the day.
43
+
44
+ Examples
45
+ --------
46
+ >>> import maxframe.dataframe as md
47
+ >>> i = md.date_range('2018-04-09', periods=4, freq='12h')
48
+ >>> ts = md.DataFrame({'A': [1, 2, 3, 4]}, index=i)
49
+ >>> ts.execute()
50
+ A
51
+ 2018-04-09 00:00:00 1
52
+ 2018-04-09 12:00:00 2
53
+ 2018-04-10 00:00:00 3
54
+ 2018-04-10 12:00:00 4
55
+
56
+ >>> ts.at_time('12:00').execute()
57
+ A
58
+ 2018-04-09 12:00:00 2
59
+ 2018-04-10 12:00:00 4
60
+ """
61
+ return df_or_series.between_time(time, time, inclusive="both", axis=axis)