maxframe 1.0.0rc4__cp38-cp38-win_amd64.whl → 1.1.1__cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. (The original page linked to more details here; the link target is not preserved in this text.)

Files changed (88)
  1. maxframe/_utils.cp38-win_amd64.pyd +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cp38-win_amd64.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cp38-win_amd64.pyd +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cp38-win_amd64.pyd +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +2 -2
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
25
25
  from ...serialization.serializables import AnyField, KeyField, ListField
26
26
  from ...tensor.datasource import asarray
27
27
  from ...tensor.utils import calc_sliced_size, filter_inputs
28
- from ...utils import is_full_slice, lazy_import
28
+ from ...utils import is_full_slice, lazy_import, pd_release_version
29
29
  from ..core import DATAFRAME_TYPE, IndexValue
30
30
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
31
31
  from ..utils import parse_index
32
32
  from .iloc import DataFrameIlocSetItem
33
33
 
34
34
  cudf = lazy_import("cudf")
35
+ with_slice_locs_kind = pd_release_version < (1, 4, 0)
35
36
 
36
37
 
37
38
  def process_loc_indexes(inp, indexes, fetch_index: bool = True):
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
210
211
  if axis == 1:
211
212
  param["dtypes"] = inp.dtypes
212
213
  elif input_index_value.has_value():
213
- start, end = pd_index.slice_locs(
214
- index.start, index.stop, index.step, kind="loc"
215
- )
214
+ kw = {}
215
+ if with_slice_locs_kind:
216
+ kw["kind"] = "loc"
217
+ start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
216
218
  slc = slice(start, end, index.step)
217
219
  size = calc_sliced_size(inp.shape[axis], slc)
218
220
  param["shape"] = size
@@ -14,7 +14,15 @@
14
14
 
15
15
  from .append import DataFrameAppend, append
16
16
  from .concat import DataFrameConcat, concat
17
- from .merge import DataFrameMerge, DataFrameMergeAlign, join, merge
17
+ from .merge import (
18
+ DataFrameMerge,
19
+ DataFrameMergeAlign,
20
+ DistributedMapJoinHint,
21
+ MapJoinHint,
22
+ SkewJoinHint,
23
+ join,
24
+ merge,
25
+ )
18
26
 
19
27
 
20
28
  def _install():
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+ from typing import List, Union
14
15
 
15
16
  import pandas as pd
16
17
 
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
24
25
  StringField,
25
26
  )
26
27
  from ...utils import lazy_import
28
+ from ..core import DataFrame, Series
27
29
  from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
28
30
  from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
29
31
 
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
55
57
  return self.names
56
58
 
57
59
  @classmethod
58
- def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index):
59
- if isinstance(prev_index, pd.RangeIndex) and isinstance(
60
- cur_index, pd.RangeIndex
61
- ):
62
- # handle RangeIndex that append may generate huge amount of data
63
- # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
64
- # will generate a Int64Index full of data
65
- # for details see GH#1647
66
- prev_stop = prev_index.start + prev_index.size * prev_index.step
67
- cur_start = cur_index.start
68
- if prev_stop == cur_start and prev_index.step == cur_index.step:
69
- # continuous RangeIndex, still return RangeIndex
70
- return prev_index.append(cur_index)
71
- else:
72
- # otherwise, return an empty index
73
- return pd.Index([], dtype=prev_index.dtype)
74
- elif isinstance(prev_index, pd.RangeIndex):
75
- return pd.Index([], prev_index.dtype).append(cur_index)
76
- elif isinstance(cur_index, pd.RangeIndex):
77
- return prev_index.append(pd.Index([], cur_index.dtype))
78
- return prev_index.append(cur_index)
60
+ def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
61
+ concat_index = None
62
+ all_indexes_have_value = all(
63
+ input.index_value.has_value() for input in df_or_series_list
64
+ )
65
+
66
+ def _concat(prev_index: pd.Index, cur_index: pd.Index):
67
+ if prev_index is None:
68
+ return cur_index
69
+
70
+ if (
71
+ all_indexes_have_value
72
+ and isinstance(prev_index, pd.RangeIndex)
73
+ and isinstance(cur_index, pd.RangeIndex)
74
+ ):
75
+ # handle RangeIndex that append may generate huge amount of data
76
+ # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
77
+ # will generate a Int64Index full of data
78
+ # for details see GH#1647
79
+ prev_stop = prev_index.start + prev_index.size * prev_index.step
80
+ cur_start = cur_index.start
81
+ if prev_stop == cur_start and prev_index.step == cur_index.step:
82
+ # continuous RangeIndex, still return RangeIndex
83
+ return prev_index.append(cur_index)
84
+ else:
85
+ # otherwise, return an empty index
86
+ return pd.Index([], dtype=prev_index.dtype)
87
+ elif isinstance(prev_index, pd.RangeIndex):
88
+ return pd.Index([], prev_index.dtype).append(cur_index)
89
+ elif isinstance(cur_index, pd.RangeIndex):
90
+ return prev_index.append(pd.Index([], cur_index.dtype))
91
+ return prev_index.append(cur_index)
92
+
93
+ for input in df_or_series_list:
94
+ concat_index = _concat(concat_index, input.index_value.to_pandas())
95
+
96
+ return concat_index
79
97
 
80
98
  def _call_series(self, objs):
81
99
  if self.axis == 0:
82
100
  row_length = 0
83
- index = None
84
101
  for series in objs:
85
- if index is None:
86
- index = series.index_value.to_pandas()
87
- else:
88
- index = self._concat_index(index, series.index_value.to_pandas())
89
102
  row_length += series.shape[0]
90
103
  if self.ignore_index: # pragma: no cover
91
104
  index_value = parse_index(pd.RangeIndex(row_length))
92
105
  else:
106
+ index = self._concat_index(objs)
93
107
  index_value = parse_index(index, objs)
94
108
  obj_names = {obj.name for obj in objs}
95
109
  return self.new_series(
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
130
144
  def _call_dataframes(self, objs):
131
145
  if self.axis == 0:
132
146
  row_length = 0
133
- index = None
134
147
  empty_dfs = []
135
148
  for df in objs:
136
- if index is None:
137
- index = df.index_value.to_pandas()
138
- else:
139
- index = self._concat_index(index, df.index_value.to_pandas())
140
149
  row_length += df.shape[0]
141
150
  if df.ndim == 2:
142
151
  empty_dfs.append(build_empty_df(df.dtypes))
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
153
162
  if self.ignore_index: # pragma: no cover
154
163
  index_value = parse_index(pd.RangeIndex(row_length))
155
164
  else:
165
+ index = self._concat_index(objs)
156
166
  index_value = parse_index(index, objs)
157
167
 
158
168
  new_objs = []
@@ -353,7 +353,7 @@ def merge(
353
353
  df: Union[DataFrame, Series],
354
354
  right: Union[DataFrame, Series],
355
355
  how: str = "inner",
356
- on: str = None,
356
+ on: Union[str, List[str]] = None,
357
357
  left_on: str = None,
358
358
  right_on: str = None,
359
359
  left_index: bool = False,
@@ -16,6 +16,7 @@ import numpy as np
16
16
  import pandas as pd
17
17
  import pytest
18
18
 
19
+ from ....tests.utils import assert_mf_index_dtype
19
20
  from ...core import IndexValue
20
21
  from ...datasource.dataframe import from_pandas
21
22
  from .. import DataFrameMerge, concat
@@ -161,7 +162,7 @@ def test_append():
161
162
  adf = mdf1.append(mdf2)
162
163
 
163
164
  assert adf.shape == (20, 4)
164
- assert isinstance(adf.index_value.value, IndexValue.Int64Index)
165
+ assert_mf_index_dtype(adf.index_value.value, np.int64)
165
166
 
166
167
  mdf1 = from_pandas(df1, chunk_size=3)
167
168
  mdf2 = from_pandas(df2, chunk_size=3)
@@ -181,6 +182,7 @@ def test_concat():
181
182
  r = concat([mdf1, mdf2], axis="index")
182
183
 
183
184
  assert r.shape == (20, 4)
185
+ assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
184
186
  pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
185
187
 
186
188
  df3 = pd.DataFrame(
@@ -35,6 +35,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
35
35
  from ..utils import (
36
36
  build_df,
37
37
  build_series,
38
+ copy_func_scheduling_hints,
38
39
  make_dtype,
39
40
  make_dtypes,
40
41
  pack_func_args,
@@ -79,6 +80,8 @@ class ApplyOperator(
79
80
  if output_type:
80
81
  kw["_output_types"] = [output_type]
81
82
  super().__init__(**kw)
83
+ if hasattr(self, "func"):
84
+ copy_func_scheduling_hints(self.func, self)
82
85
 
83
86
  def _update_key(self):
84
87
  values = [v for v in self._values_ if v is not self.func] + [
@@ -43,7 +43,11 @@ class DataFrameDropDuplicates(DuplicateOperand):
43
43
  params["index_value"] = parse_index(pd.RangeIndex(-1))
44
44
  else:
45
45
  params["index_value"] = gen_unknown_index_value(
46
- input_params["index_value"], op.keep, op.subset, type(op).__name__
46
+ input_params["index_value"],
47
+ op.keep,
48
+ op.subset,
49
+ type(op).__name__,
50
+ normalize_range_index=True,
47
51
  )
48
52
  params["shape"] = self._get_shape(input_params["shape"], op)
49
53
  return params
@@ -24,7 +24,7 @@ from ...serialization.serializables import AnyField, KeyField, StringField
24
24
  from ...utils import quiet_stdio
25
25
  from ..core import SERIES_TYPE
26
26
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
27
- from ..utils import build_series
27
+ from ..utils import build_series, copy_func_scheduling_hints
28
28
 
29
29
 
30
30
  class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
@@ -38,6 +38,8 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
38
38
  super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
39
39
  if not self.output_types:
40
40
  self.output_types = [OutputType.series]
41
+ if hasattr(self, "arg"):
42
+ copy_func_scheduling_hints(self.arg, self)
41
43
 
42
44
  def _set_inputs(self, inputs):
43
45
  super()._set_inputs(inputs)
@@ -20,6 +20,7 @@ from .... import opcodes
20
20
  from ....core import OutputType
21
21
  from ....dataframe import DataFrame
22
22
  from ....tensor.core import TENSOR_TYPE
23
+ from ....udf import with_running_options
23
24
  from ... import eval as maxframe_eval
24
25
  from ... import get_dummies, to_numeric
25
26
  from ...arithmetic import DataFrameGreater, DataFrameLess
@@ -65,6 +66,17 @@ def test_transform():
65
66
  assert r.op._op_type_ == opcodes.TRANSFORM
66
67
  assert r.op.output_types[0] == OutputType.dataframe
67
68
 
69
+ def transform_df_with_param(row, param, k):
70
+ assert param == 5
71
+ assert k == "6"
72
+ return row
73
+
74
+ r = df.transform(transform_df_with_param, 1, 5, k="6")
75
+ assert all(v == np.dtype("int64") for v in r.dtypes) is True
76
+ assert r.shape == df.shape
77
+ assert r.op._op_type_ == opcodes.TRANSFORM
78
+ assert r.op.output_types[0] == OutputType.dataframe
79
+
68
80
  r = df.transform(lambda x: list(range(len(x))), axis=1)
69
81
  assert all(v == np.dtype("int64") for v in r.dtypes) is True
70
82
  assert r.shape == df.shape
@@ -349,7 +361,9 @@ def test_drop():
349
361
  def test_drop_duplicates():
350
362
  rs = np.random.RandomState(0)
351
363
  raw = pd.DataFrame(
352
- rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)]
364
+ rs.randint(1000, size=(20, 7)),
365
+ columns=["c" + str(i + 1) for i in range(7)],
366
+ index=pd.Index(range(20), name="idx"),
353
367
  )
354
368
  raw["c7"] = [f"s{j}" for j in range(20)]
355
369
 
@@ -361,6 +375,12 @@ def test_drop_duplicates():
361
375
  with pytest.raises(KeyError):
362
376
  df.drop_duplicates(subset="c8")
363
377
 
378
+ # check index
379
+ distinct_df = df.drop_duplicates()
380
+ assert distinct_df.index_value.name == df.index_value.name
381
+ assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
382
+ assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
383
+
364
384
  s = df["c7"]
365
385
  with pytest.raises(ValueError):
366
386
  s.drop_duplicates(method="unknown")
@@ -436,6 +456,7 @@ def test_apply():
436
456
 
437
457
  keys = [1, 2]
438
458
 
459
+ @with_running_options(engine="spe")
439
460
  def f(x, keys):
440
461
  if x["a"] in keys:
441
462
  return [1, 0]
@@ -451,6 +472,7 @@ def test_apply():
451
472
  keys=keys,
452
473
  )
453
474
  assert apply_df.shape == (3, 2)
475
+ assert apply_df.op.expect_engine == "SPE"
454
476
 
455
477
 
456
478
  def test_pivot_table():
@@ -474,7 +496,7 @@ def test_pivot_table():
474
496
  with pytest.raises(ValueError):
475
497
  df.pivot_table(values=["D", "E"], aggfunc="sum")
476
498
 
477
- t = df.pivot_table(index="A")
499
+ t = df.pivot_table(index=["A", "B", "C"])
478
500
  assert isinstance(t.op, DataFrameGroupByAgg)
479
501
  t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
480
502
  assert isinstance(t.op, DataFrameGroupByAgg)
@@ -27,6 +27,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
27
27
  from ..utils import (
28
28
  build_df,
29
29
  build_series,
30
+ copy_func_scheduling_hints,
30
31
  make_dtypes,
31
32
  pack_func_args,
32
33
  parse_index,
@@ -49,10 +50,12 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
49
50
 
50
51
  def __init__(self, output_types=None, memory_scale=None, **kw):
51
52
  super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
53
+ if hasattr(self, "func"):
54
+ copy_func_scheduling_hints(self.func, self)
52
55
 
53
56
  def _infer_df_func_returns(self, df, dtypes):
54
- packed_funcs = self.get_packed_funcs(df)
55
- test_df = self._build_stub_pandas_obj(df)
57
+ packed_funcs = self.func
58
+ test_df = _build_stub_pandas_obj(df, self.output_types[0])
56
59
  if self.output_types[0] == OutputType.dataframe:
57
60
  try:
58
61
  with np.errstate(all="ignore"), quiet_stdio():
@@ -147,16 +150,18 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
147
150
  index_value=new_index_value,
148
151
  )
149
152
 
150
- def get_packed_funcs(self, df=None) -> Any:
151
- stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
152
- return pack_func_args(stub_df, self.func, *self.args, **self.kwds)
153
153
 
154
- def _build_stub_pandas_obj(self, df) -> Union[DataFrame, Series]:
155
- # TODO: Simulate a dataframe with the corresponding indexes if self.func is
156
- # a dict and axis=1
157
- if self.output_types[0] == OutputType.dataframe:
158
- return build_df(df, fill_value=1, size=1)
159
- return build_series(df, size=1, name=df.name)
154
+ def get_packed_funcs(df, output_type, func, *args, **kwds) -> Any:
155
+ stub_df = _build_stub_pandas_obj(df, output_type)
156
+ return pack_func_args(stub_df, func, *args, **kwds)
157
+
158
+
159
+ def _build_stub_pandas_obj(df, output_type) -> Union[DataFrame, Series]:
160
+ # TODO: Simulate a dataframe with the corresponding indexes if self.func is
161
+ # a dict and axis=1
162
+ if output_type == OutputType.dataframe:
163
+ return build_df(df, fill_value=1, size=1)
164
+ return build_series(df, size=1, name=df.name)
160
165
 
161
166
 
162
167
  def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):
@@ -229,13 +234,15 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
229
234
  1 2 3
230
235
  2 3 4
231
236
  """
237
+ call_agg = kwargs.pop("_call_agg", False)
238
+ func = get_packed_funcs(df, OutputType.dataframe, func, *args, **kwargs)
232
239
  op = TransformOperator(
233
240
  func=func,
234
241
  axis=axis,
235
242
  args=args,
236
243
  kwds=kwargs,
237
244
  output_types=[OutputType.dataframe],
238
- call_agg=kwargs.pop("_call_agg", False),
245
+ call_agg=call_agg,
239
246
  )
240
247
  return op(df, dtypes=dtypes, skip_infer=skip_infer)
241
248
 
@@ -319,6 +326,8 @@ def series_transform(
319
326
  1 2 3
320
327
  2 3 4
321
328
  """
329
+ call_agg = kwargs.pop("_call_agg", False)
330
+ func = get_packed_funcs(series, OutputType.series, func, *args, **kwargs)
322
331
  op = TransformOperator(
323
332
  func=func,
324
333
  axis=axis,
@@ -326,7 +335,7 @@ def series_transform(
326
335
  args=args,
327
336
  kwds=kwargs,
328
337
  output_types=[OutputType.series],
329
- call_agg=kwargs.pop("_call_agg", False),
338
+ call_agg=call_agg,
330
339
  )
331
340
  dtypes = (series.name, dtype) if dtype is not None else None
332
341
  return op(series, dtypes=dtypes, skip_infer=skip_infer)
@@ -25,6 +25,7 @@ from .custom_reduction import DataFrameCustomReduction
25
25
  from .kurtosis import DataFrameKurtosis
26
26
  from .max import DataFrameMax
27
27
  from .mean import DataFrameMean
28
+ from .median import DataFrameMedian
28
29
  from .min import DataFrameMin
29
30
  from .nunique import DataFrameNunique
30
31
  from .prod import DataFrameProd
@@ -50,6 +51,7 @@ def _install():
50
51
  from .kurtosis import kurt_dataframe, kurt_series
51
52
  from .max import max_dataframe, max_index, max_series
52
53
  from .mean import mean_dataframe, mean_series
54
+ from .median import median_dataframe, median_series
53
55
  from .min import min_dataframe, min_index, min_series
54
56
  from .nunique import nunique_dataframe, nunique_series
55
57
  from .prod import prod_dataframe, prod_series
@@ -68,6 +70,7 @@ def _install():
68
70
  ("min", min_series, min_dataframe),
69
71
  ("count", count_series, count_dataframe),
70
72
  ("mean", mean_series, mean_dataframe),
73
+ ("median", median_series, median_dataframe),
71
74
  ("var", var_series, var_dataframe),
72
75
  ("std", std_series, std_dataframe),
73
76
  ("all", all_series, all_dataframe),
@@ -71,6 +71,7 @@ _agg_functions = {
71
71
  "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
72
72
  "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
73
73
  "nunique": lambda x: x.nunique(),
74
+ "median": lambda x, skipna=True: x.median(skipna=skipna),
74
75
  }
75
76
 
76
77
 
@@ -0,0 +1,56 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ... import opcodes
16
+ from ...core import OutputType
17
+ from .core import DataFrameReductionMixin, DataFrameReductionOperator
18
+
19
+
20
+ class DataFrameMedian(DataFrameReductionOperator, DataFrameReductionMixin):
21
+ _op_type_ = opcodes.MEDIAN
22
+ _func_name = "median"
23
+
24
+ @property
25
+ def is_atomic(self):
26
+ return True
27
+
28
+
29
+ def median_series(df, axis=None, skipna=True, level=None, method=None):
30
+ op = DataFrameMedian(
31
+ axis=axis,
32
+ skipna=skipna,
33
+ level=level,
34
+ output_types=[OutputType.scalar if level is not None else OutputType.scalar],
35
+ method=method,
36
+ )
37
+ return op(df)
38
+
39
+
40
+ def median_dataframe(
41
+ df,
42
+ axis=0,
43
+ skipna=True,
44
+ level=None,
45
+ numeric_only=None,
46
+ method=None,
47
+ ):
48
+ op = DataFrameMedian(
49
+ axis=axis,
50
+ skipna=skipna,
51
+ level=level,
52
+ numeric_only=numeric_only,
53
+ output_types=[OutputType.dataframe if level is not None else OutputType.series],
54
+ method=method,
55
+ )
56
+ return op(df)
@@ -23,6 +23,7 @@ import pytest
23
23
 
24
24
  from .... import dataframe as md
25
25
  from ....tensor import Tensor
26
+ from ....tests.utils import assert_mf_index_dtype
26
27
  from ...core import DataFrame, IndexValue, OutputType, Series
27
28
  from ...datasource.dataframe import from_pandas as from_pandas_df
28
29
  from ...datasource.series import from_pandas as from_pandas_series
@@ -38,6 +39,7 @@ from .. import (
38
39
  DataFrameKurtosis,
39
40
  DataFrameMax,
40
41
  DataFrameMean,
42
+ DataFrameMedian,
41
43
  DataFrameMin,
42
44
  DataFrameNunique,
43
45
  DataFrameProd,
@@ -71,6 +73,7 @@ reduction_functions = [
71
73
  ("sem", DataFrameSem, FunctionOptions()),
72
74
  ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
73
75
  ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
76
+ ("median", DataFrameMedian, FunctionOptions()),
74
77
  ]
75
78
 
76
79
 
@@ -111,10 +114,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
111
114
  reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()
112
115
 
113
116
  assert isinstance(reduction_df, Series)
114
- assert isinstance(
115
- reduction_df.index_value._index_value,
116
- (IndexValue.RangeIndex, IndexValue.Int64Index),
117
- )
117
+ assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
118
118
  assert reduction_df.shape == (10,)
119
119
 
120
120
  data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
@@ -210,6 +210,7 @@ def test_dataframe_aggregate():
210
210
  "skew",
211
211
  "kurt",
212
212
  "sem",
213
+ "median",
213
214
  ]
214
215
 
215
216
  df = from_pandas_df(data)
@@ -253,7 +254,7 @@ def test_dataframe_aggregate():
253
254
  assert result.op.output_types[0] == OutputType.dataframe
254
255
  assert result.op.func == agg_funcs
255
256
 
256
- dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]}
257
+ dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std", "median"]}
257
258
  all_cols = set(
258
259
  reduce(
259
260
  operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()]
@@ -268,9 +269,9 @@ def test_dataframe_aggregate():
268
269
  assert result.op.func[2] == dict_fun[2]
269
270
 
270
271
  with pytest.raises(TypeError):
271
- df.agg(sum_0="sum", mean_0="mean")
272
+ df.agg(sum_0="sum", mean_0="mean", median_0="median")
272
273
  with pytest.raises(NotImplementedError):
273
- df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1)
274
+ df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std", "median"]}, axis=1)
274
275
 
275
276
 
276
277
  def test_series_aggregate():
@@ -287,6 +288,7 @@ def test_series_aggregate():
287
288
  "skew",
288
289
  "kurt",
289
290
  "sem",
291
+ "median",
290
292
  ]
291
293
 
292
294
  series = from_pandas_series(data)
@@ -303,6 +305,14 @@ def test_series_aggregate():
303
305
  assert result.shape == ()
304
306
  assert result.op.output_types[0] == OutputType.scalar
305
307
 
308
+ result = series.agg("median")
309
+ assert result.shape == ()
310
+ assert result.op.output_types[0] == OutputType.scalar
311
+
312
+ result = series.median(level=0)
313
+ assert result.shape == (np.nan,)
314
+ assert result.op.output_types[0] == OutputType.series
315
+
306
316
  result = series.agg(agg_funcs)
307
317
  assert result.shape == (len(agg_funcs),)
308
318
  assert list(result.index_value.to_pandas()) == agg_funcs
@@ -81,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
81
81
  store_index_value = False
82
82
  else:
83
83
  q_val = np.asanyarray(self.q)
84
- pd_index = pd.Index(q_val)
84
+ if q_val.ndim == 0:
85
+ pd_index = pd.Index(q_val.reshape(1))
86
+ else:
87
+ pd_index = pd.Index(q_val)
85
88
  name = self.q if q_val.size == 1 else None
86
89
  store_index_value = True
87
90
  tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
@@ -164,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
164
167
  store_index_value = False
165
168
  else:
166
169
  q_val = np.asanyarray(self.q)
167
- index_val = pd.Index(q_val)
170
+ if q_val.ndim == 0:
171
+ index_val = pd.Index(q_val.reshape(1))
172
+ else:
173
+ index_val = pd.Index(q_val)
168
174
  store_index_value = True
169
175
 
170
176
  # get dtype by tensor
@@ -49,7 +49,7 @@ def test_dataframe_quantile():
49
49
 
50
50
  # q = 0.3, axis = 0
51
51
  r = s.quantile(0.3)
52
- e = raw.quantile(0.3)
52
+ e = raw.quantile(0.3, numeric_only=True)
53
53
  assert isinstance(r, Series)
54
54
  assert r.shape == (2,)
55
55
  assert r.dtype == e.dtype
@@ -57,7 +57,7 @@ def test_dataframe_quantile():
57
57
 
58
58
  # q = 0.3, axis = 1
59
59
  r = s.quantile(0.3, axis=1)
60
- e = raw.quantile(0.3, axis=1)
60
+ e = raw.quantile(0.3, numeric_only=True, axis=1)
61
61
  assert isinstance(r, Series)
62
62
  assert r.shape == e.shape
63
63
  assert r.dtype == e.dtype
@@ -65,7 +65,7 @@ def test_dataframe_quantile():
65
65
 
66
66
  # q = [0.3, 0.7], axis = 0
67
67
  r = s.quantile([0.3, 0.7])
68
- e = raw.quantile([0.3, 0.7])
68
+ e = raw.quantile([0.3, 0.7], numeric_only=True)
69
69
  assert isinstance(r, DataFrame)
70
70
  assert r.shape == e.shape
71
71
  pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -74,7 +74,7 @@ def test_dataframe_quantile():
74
74
 
75
75
  # q = [0.3, 0.7], axis = 1
76
76
  r = s.quantile([0.3, 0.7], axis=1)
77
- e = raw.quantile([0.3, 0.7], axis=1)
77
+ e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
78
78
  assert isinstance(r, DataFrame)
79
79
  assert r.shape == e.shape
80
80
  pd.testing.assert_series_equal(r.dtypes, e.dtypes)