maxframe 1.0.0rc3__cp38-cp38-win32.whl → 1.1.0__cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (112) hide show
  1. maxframe/_utils.cp38-win32.pyd +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +16 -1
  4. maxframe/conftest.py +52 -14
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cp38-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/docstring.py +26 -2
  9. maxframe/dataframe/arithmetic/equal.py +4 -2
  10. maxframe/dataframe/arithmetic/greater.py +4 -2
  11. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  12. maxframe/dataframe/arithmetic/less.py +2 -2
  13. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  14. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  16. maxframe/dataframe/core.py +26 -2
  17. maxframe/dataframe/datasource/read_odps_query.py +116 -28
  18. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  19. maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
  20. maxframe/dataframe/datastore/to_odps.py +7 -0
  21. maxframe/dataframe/extensions/__init__.py +8 -0
  22. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  23. maxframe/dataframe/extensions/flatjson.py +131 -0
  24. maxframe/dataframe/extensions/flatmap.py +314 -0
  25. maxframe/dataframe/extensions/reshuffle.py +1 -1
  26. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  27. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  28. maxframe/dataframe/groupby/__init__.py +1 -0
  29. maxframe/dataframe/groupby/aggregation.py +1 -0
  30. maxframe/dataframe/groupby/apply.py +9 -1
  31. maxframe/dataframe/groupby/core.py +1 -1
  32. maxframe/dataframe/groupby/fill.py +4 -1
  33. maxframe/dataframe/groupby/getitem.py +6 -0
  34. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  35. maxframe/dataframe/groupby/transform.py +8 -2
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/loc.py +6 -4
  38. maxframe/dataframe/indexing/rename.py +11 -0
  39. maxframe/dataframe/initializer.py +11 -1
  40. maxframe/dataframe/merge/__init__.py +9 -1
  41. maxframe/dataframe/merge/concat.py +41 -31
  42. maxframe/dataframe/merge/merge.py +1 -1
  43. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  44. maxframe/dataframe/misc/apply.py +3 -0
  45. maxframe/dataframe/misc/drop_duplicates.py +23 -2
  46. maxframe/dataframe/misc/map.py +3 -1
  47. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  48. maxframe/dataframe/misc/transform.py +22 -13
  49. maxframe/dataframe/reduction/__init__.py +3 -0
  50. maxframe/dataframe/reduction/aggregation.py +1 -0
  51. maxframe/dataframe/reduction/median.py +56 -0
  52. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  53. maxframe/dataframe/statistics/quantile.py +8 -2
  54. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  55. maxframe/dataframe/tests/test_initializer.py +33 -2
  56. maxframe/dataframe/tests/test_utils.py +60 -0
  57. maxframe/dataframe/utils.py +110 -7
  58. maxframe/dataframe/window/expanding.py +5 -3
  59. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  60. maxframe/io/objects/tests/test_object_io.py +39 -12
  61. maxframe/io/odpsio/arrow.py +30 -2
  62. maxframe/io/odpsio/schema.py +28 -8
  63. maxframe/io/odpsio/tableio.py +55 -133
  64. maxframe/io/odpsio/tests/test_schema.py +40 -4
  65. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  66. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  67. maxframe/io/odpsio/volumeio.py +36 -6
  68. maxframe/learn/contrib/__init__.py +3 -1
  69. maxframe/learn/contrib/graph/__init__.py +15 -0
  70. maxframe/learn/contrib/graph/connected_components.py +215 -0
  71. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  73. maxframe/learn/contrib/llm/__init__.py +16 -0
  74. maxframe/learn/contrib/llm/core.py +54 -0
  75. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  76. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  77. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  78. maxframe/learn/contrib/llm/text.py +42 -0
  79. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  80. maxframe/learn/contrib/xgboost/predict.py +8 -39
  81. maxframe/learn/contrib/xgboost/train.py +4 -3
  82. maxframe/lib/mmh3.cp38-win32.pyd +0 -0
  83. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  84. maxframe/opcodes.py +10 -1
  85. maxframe/protocol.py +6 -1
  86. maxframe/serialization/core.cp38-win32.pyd +0 -0
  87. maxframe/serialization/core.pyx +13 -1
  88. maxframe/serialization/pandas.py +50 -20
  89. maxframe/serialization/serializables/core.py +24 -5
  90. maxframe/serialization/serializables/field_type.py +4 -1
  91. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  92. maxframe/serialization/tests/test_serial.py +2 -1
  93. maxframe/session.py +9 -2
  94. maxframe/tensor/__init__.py +19 -7
  95. maxframe/tensor/indexing/getitem.py +2 -0
  96. maxframe/tensor/merge/concatenate.py +23 -20
  97. maxframe/tensor/merge/vstack.py +5 -1
  98. maxframe/tensor/misc/transpose.py +1 -1
  99. maxframe/tests/utils.py +16 -0
  100. maxframe/udf.py +27 -0
  101. maxframe/utils.py +64 -14
  102. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  103. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
  104. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
  105. maxframe_client/clients/framedriver.py +4 -1
  106. maxframe_client/fetcher.py +28 -10
  107. maxframe_client/session/consts.py +3 -0
  108. maxframe_client/session/odps.py +104 -20
  109. maxframe_client/session/task.py +42 -26
  110. maxframe_client/session/tests/test_task.py +0 -4
  111. maxframe_client/tests/test_session.py +44 -12
  112. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,7 @@ from .... import opcodes
20
20
  from ....core import OutputType
21
21
  from ....dataframe import DataFrame
22
22
  from ....tensor.core import TENSOR_TYPE
23
+ from ....udf import with_running_options
23
24
  from ... import eval as maxframe_eval
24
25
  from ... import get_dummies, to_numeric
25
26
  from ...arithmetic import DataFrameGreater, DataFrameLess
@@ -65,6 +66,17 @@ def test_transform():
65
66
  assert r.op._op_type_ == opcodes.TRANSFORM
66
67
  assert r.op.output_types[0] == OutputType.dataframe
67
68
 
69
+ def transform_df_with_param(row, param, k):
70
+ assert param == 5
71
+ assert k == "6"
72
+ return row
73
+
74
+ r = df.transform(transform_df_with_param, 1, 5, k="6")
75
+ assert all(v == np.dtype("int64") for v in r.dtypes) is True
76
+ assert r.shape == df.shape
77
+ assert r.op._op_type_ == opcodes.TRANSFORM
78
+ assert r.op.output_types[0] == OutputType.dataframe
79
+
68
80
  r = df.transform(lambda x: list(range(len(x))), axis=1)
69
81
  assert all(v == np.dtype("int64") for v in r.dtypes) is True
70
82
  assert r.shape == df.shape
@@ -349,7 +361,9 @@ def test_drop():
349
361
  def test_drop_duplicates():
350
362
  rs = np.random.RandomState(0)
351
363
  raw = pd.DataFrame(
352
- rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)]
364
+ rs.randint(1000, size=(20, 7)),
365
+ columns=["c" + str(i + 1) for i in range(7)],
366
+ index=pd.Index(range(20), name="idx"),
353
367
  )
354
368
  raw["c7"] = [f"s{j}" for j in range(20)]
355
369
 
@@ -361,6 +375,12 @@ def test_drop_duplicates():
361
375
  with pytest.raises(KeyError):
362
376
  df.drop_duplicates(subset="c8")
363
377
 
378
+ # check index
379
+ distinct_df = df.drop_duplicates()
380
+ assert distinct_df.index_value.name == df.index_value.name
381
+ assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
382
+ assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
383
+
364
384
  s = df["c7"]
365
385
  with pytest.raises(ValueError):
366
386
  s.drop_duplicates(method="unknown")
@@ -436,6 +456,7 @@ def test_apply():
436
456
 
437
457
  keys = [1, 2]
438
458
 
459
+ @with_running_options(engine="spe")
439
460
  def f(x, keys):
440
461
  if x["a"] in keys:
441
462
  return [1, 0]
@@ -451,6 +472,7 @@ def test_apply():
451
472
  keys=keys,
452
473
  )
453
474
  assert apply_df.shape == (3, 2)
475
+ assert apply_df.op.expect_engine == "SPE"
454
476
 
455
477
 
456
478
  def test_pivot_table():
@@ -474,7 +496,7 @@ def test_pivot_table():
474
496
  with pytest.raises(ValueError):
475
497
  df.pivot_table(values=["D", "E"], aggfunc="sum")
476
498
 
477
- t = df.pivot_table(index="A")
499
+ t = df.pivot_table(index=["A", "B", "C"])
478
500
  assert isinstance(t.op, DataFrameGroupByAgg)
479
501
  t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
480
502
  assert isinstance(t.op, DataFrameGroupByAgg)
@@ -27,6 +27,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
27
27
  from ..utils import (
28
28
  build_df,
29
29
  build_series,
30
+ copy_func_scheduling_hints,
30
31
  make_dtypes,
31
32
  pack_func_args,
32
33
  parse_index,
@@ -49,10 +50,12 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
49
50
 
50
51
  def __init__(self, output_types=None, memory_scale=None, **kw):
51
52
  super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
53
+ if hasattr(self, "func"):
54
+ copy_func_scheduling_hints(self.func, self)
52
55
 
53
56
  def _infer_df_func_returns(self, df, dtypes):
54
- packed_funcs = self.get_packed_funcs(df)
55
- test_df = self._build_stub_pandas_obj(df)
57
+ packed_funcs = self.func
58
+ test_df = _build_stub_pandas_obj(df, self.output_types[0])
56
59
  if self.output_types[0] == OutputType.dataframe:
57
60
  try:
58
61
  with np.errstate(all="ignore"), quiet_stdio():
@@ -147,16 +150,18 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
147
150
  index_value=new_index_value,
148
151
  )
149
152
 
150
- def get_packed_funcs(self, df=None) -> Any:
151
- stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
152
- return pack_func_args(stub_df, self.func, *self.args, **self.kwds)
153
153
 
154
- def _build_stub_pandas_obj(self, df) -> Union[DataFrame, Series]:
155
- # TODO: Simulate a dataframe with the corresponding indexes if self.func is
156
- # a dict and axis=1
157
- if self.output_types[0] == OutputType.dataframe:
158
- return build_df(df, fill_value=1, size=1)
159
- return build_series(df, size=1, name=df.name)
154
def get_packed_funcs(df, output_type, func, *args, **kwds) -> Any:
    """
    Pack ``func`` together with its extra arguments for a transform call.

    A stub pandas object is built from ``df`` according to ``output_type``
    and handed to ``pack_func_args`` along with ``func``, ``args`` and
    ``kwds``; whatever ``pack_func_args`` returns is returned unchanged.
    """
    return pack_func_args(
        _build_stub_pandas_obj(df, output_type), func, *args, **kwds
    )
157
+
158
+
159
def _build_stub_pandas_obj(df, output_type) -> Union[DataFrame, Series]:
    """
    Build a one-element pandas stand-in for ``df``.

    Returns the result of ``build_df`` when ``output_type`` is
    ``OutputType.dataframe`` and the result of ``build_series`` (keeping
    ``df.name``) for every other output type.
    """
    # TODO: Simulate a dataframe with the corresponding indexes if the
    #  function to pack is a dict and axis=1
    if output_type == OutputType.dataframe:
        stub = build_df(df, fill_value=1, size=1)
    else:
        stub = build_series(df, size=1, name=df.name)
    return stub
160
165
 
161
166
 
162
167
  def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):
@@ -229,13 +234,15 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
229
234
  1 2 3
230
235
  2 3 4
231
236
  """
237
+ call_agg = kwargs.pop("_call_agg", False)
238
+ func = get_packed_funcs(df, OutputType.dataframe, func, *args, **kwargs)
232
239
  op = TransformOperator(
233
240
  func=func,
234
241
  axis=axis,
235
242
  args=args,
236
243
  kwds=kwargs,
237
244
  output_types=[OutputType.dataframe],
238
- call_agg=kwargs.pop("_call_agg", False),
245
+ call_agg=call_agg,
239
246
  )
240
247
  return op(df, dtypes=dtypes, skip_infer=skip_infer)
241
248
 
@@ -319,6 +326,8 @@ def series_transform(
319
326
  1 2 3
320
327
  2 3 4
321
328
  """
329
+ call_agg = kwargs.pop("_call_agg", False)
330
+ func = get_packed_funcs(series, OutputType.series, func, *args, **kwargs)
322
331
  op = TransformOperator(
323
332
  func=func,
324
333
  axis=axis,
@@ -326,7 +335,7 @@ def series_transform(
326
335
  args=args,
327
336
  kwds=kwargs,
328
337
  output_types=[OutputType.series],
329
- call_agg=kwargs.pop("_call_agg", False),
338
+ call_agg=call_agg,
330
339
  )
331
340
  dtypes = (series.name, dtype) if dtype is not None else None
332
341
  return op(series, dtypes=dtypes, skip_infer=skip_infer)
@@ -25,6 +25,7 @@ from .custom_reduction import DataFrameCustomReduction
25
25
  from .kurtosis import DataFrameKurtosis
26
26
  from .max import DataFrameMax
27
27
  from .mean import DataFrameMean
28
+ from .median import DataFrameMedian
28
29
  from .min import DataFrameMin
29
30
  from .nunique import DataFrameNunique
30
31
  from .prod import DataFrameProd
@@ -50,6 +51,7 @@ def _install():
50
51
  from .kurtosis import kurt_dataframe, kurt_series
51
52
  from .max import max_dataframe, max_index, max_series
52
53
  from .mean import mean_dataframe, mean_series
54
+ from .median import median_dataframe, median_series
53
55
  from .min import min_dataframe, min_index, min_series
54
56
  from .nunique import nunique_dataframe, nunique_series
55
57
  from .prod import prod_dataframe, prod_series
@@ -68,6 +70,7 @@ def _install():
68
70
  ("min", min_series, min_dataframe),
69
71
  ("count", count_series, count_dataframe),
70
72
  ("mean", mean_series, mean_dataframe),
73
+ ("median", median_series, median_dataframe),
71
74
  ("var", var_series, var_dataframe),
72
75
  ("std", std_series, std_dataframe),
73
76
  ("all", all_series, all_dataframe),
@@ -71,6 +71,7 @@ _agg_functions = {
71
71
  "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
72
72
  "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
73
73
  "nunique": lambda x: x.nunique(),
74
+ "median": lambda x, skipna=True: x.median(skipna=skipna),
74
75
  }
75
76
 
76
77
 
@@ -0,0 +1,56 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ... import opcodes
16
+ from ...core import OutputType
17
+ from .core import DataFrameReductionMixin, DataFrameReductionOperator
18
+
19
+
20
class DataFrameMedian(DataFrameReductionOperator, DataFrameReductionMixin):
    """Reduction operator that computes the median of a DataFrame or Series."""

    # Opcode identifying this operator.
    _op_type_ = opcodes.MEDIAN
    # Name of the reduction this operator stands for.
    _func_name = "median"

    @property
    def is_atomic(self):
        """
        Always ``True`` for the median operator.

        NOTE(review): presumably signals to the reduction framework that
        median cannot be assembled from partial per-chunk results --
        confirm against DataFrameReductionOperator.
        """
        return True
27
+
28
+
29
def median_series(df, axis=None, skipna=True, level=None, method=None):
    """
    Build a median reduction over a series.

    Parameters
    ----------
    df
        Series to reduce.
    axis
        Forwarded to ``DataFrameMedian``.
    skipna
        Forwarded to ``DataFrameMedian``.
    level
        Forwarded to ``DataFrameMedian``. When given, the result is declared
        as a series instead of a scalar.
    method
        Forwarded to ``DataFrameMedian``.

    Returns
    -------
    The result of calling the ``DataFrameMedian`` operator on ``df``.
    """
    # A level-wise median yields one value per group, so the output is a
    # series; only the plain reduction collapses to a scalar. Both branches
    # used to be OutputType.scalar, which mis-declared the level-wise case.
    output_type = OutputType.series if level is not None else OutputType.scalar
    op = DataFrameMedian(
        axis=axis,
        skipna=skipna,
        level=level,
        output_types=[output_type],
        method=method,
    )
    return op(df)
38
+
39
+
40
def median_dataframe(
    df,
    axis=0,
    skipna=True,
    level=None,
    numeric_only=None,
    method=None,
):
    """
    Build a median reduction over a dataframe.

    All keyword arguments are forwarded to ``DataFrameMedian``. The result
    is declared as a dataframe when ``level`` is given and as a series
    otherwise.

    Returns
    -------
    The result of calling the ``DataFrameMedian`` operator on ``df``.
    """
    if level is not None:
        result_type = OutputType.dataframe
    else:
        result_type = OutputType.series

    reduction_kwargs = dict(
        axis=axis,
        skipna=skipna,
        level=level,
        numeric_only=numeric_only,
        output_types=[result_type],
        method=method,
    )
    return DataFrameMedian(**reduction_kwargs)(df)
@@ -23,6 +23,7 @@ import pytest
23
23
 
24
24
  from .... import dataframe as md
25
25
  from ....tensor import Tensor
26
+ from ....tests.utils import assert_mf_index_dtype
26
27
  from ...core import DataFrame, IndexValue, OutputType, Series
27
28
  from ...datasource.dataframe import from_pandas as from_pandas_df
28
29
  from ...datasource.series import from_pandas as from_pandas_series
@@ -38,6 +39,7 @@ from .. import (
38
39
  DataFrameKurtosis,
39
40
  DataFrameMax,
40
41
  DataFrameMean,
42
+ DataFrameMedian,
41
43
  DataFrameMin,
42
44
  DataFrameNunique,
43
45
  DataFrameProd,
@@ -71,6 +73,7 @@ reduction_functions = [
71
73
  ("sem", DataFrameSem, FunctionOptions()),
72
74
  ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
73
75
  ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
76
+ ("median", DataFrameMedian, FunctionOptions()),
74
77
  ]
75
78
 
76
79
 
@@ -111,10 +114,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
111
114
  reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()
112
115
 
113
116
  assert isinstance(reduction_df, Series)
114
- assert isinstance(
115
- reduction_df.index_value._index_value,
116
- (IndexValue.RangeIndex, IndexValue.Int64Index),
117
- )
117
+ assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
118
118
  assert reduction_df.shape == (10,)
119
119
 
120
120
  data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
@@ -210,6 +210,7 @@ def test_dataframe_aggregate():
210
210
  "skew",
211
211
  "kurt",
212
212
  "sem",
213
+ "median",
213
214
  ]
214
215
 
215
216
  df = from_pandas_df(data)
@@ -253,7 +254,7 @@ def test_dataframe_aggregate():
253
254
  assert result.op.output_types[0] == OutputType.dataframe
254
255
  assert result.op.func == agg_funcs
255
256
 
256
- dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]}
257
+ dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std", "median"]}
257
258
  all_cols = set(
258
259
  reduce(
259
260
  operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()]
@@ -268,9 +269,9 @@ def test_dataframe_aggregate():
268
269
  assert result.op.func[2] == dict_fun[2]
269
270
 
270
271
  with pytest.raises(TypeError):
271
- df.agg(sum_0="sum", mean_0="mean")
272
+ df.agg(sum_0="sum", mean_0="mean", median_0="median")
272
273
  with pytest.raises(NotImplementedError):
273
- df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1)
274
+ df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std", "median"]}, axis=1)
274
275
 
275
276
 
276
277
  def test_series_aggregate():
@@ -287,6 +288,7 @@ def test_series_aggregate():
287
288
  "skew",
288
289
  "kurt",
289
290
  "sem",
291
+ "median",
290
292
  ]
291
293
 
292
294
  series = from_pandas_series(data)
@@ -303,6 +305,14 @@ def test_series_aggregate():
303
305
  assert result.shape == ()
304
306
  assert result.op.output_types[0] == OutputType.scalar
305
307
 
308
+ result = series.agg("median")
309
+ assert result.shape == ()
310
+ assert result.op.output_types[0] == OutputType.scalar
311
+
312
+ result = series.median(level=0)
313
+ assert result.shape == (np.nan,)
314
+ assert result.op.output_types[0] == OutputType.series
315
+
306
316
  result = series.agg(agg_funcs)
307
317
  assert result.shape == (len(agg_funcs),)
308
318
  assert list(result.index_value.to_pandas()) == agg_funcs
@@ -81,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
81
81
  store_index_value = False
82
82
  else:
83
83
  q_val = np.asanyarray(self.q)
84
- pd_index = pd.Index(q_val)
84
+ if q_val.ndim == 0:
85
+ pd_index = pd.Index(q_val.reshape(1))
86
+ else:
87
+ pd_index = pd.Index(q_val)
85
88
  name = self.q if q_val.size == 1 else None
86
89
  store_index_value = True
87
90
  tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
@@ -164,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
164
167
  store_index_value = False
165
168
  else:
166
169
  q_val = np.asanyarray(self.q)
167
- index_val = pd.Index(q_val)
170
+ if q_val.ndim == 0:
171
+ index_val = pd.Index(q_val.reshape(1))
172
+ else:
173
+ index_val = pd.Index(q_val)
168
174
  store_index_value = True
169
175
 
170
176
  # get dtype by tensor
@@ -49,7 +49,7 @@ def test_dataframe_quantile():
49
49
 
50
50
  # q = 0.3, axis = 0
51
51
  r = s.quantile(0.3)
52
- e = raw.quantile(0.3)
52
+ e = raw.quantile(0.3, numeric_only=True)
53
53
  assert isinstance(r, Series)
54
54
  assert r.shape == (2,)
55
55
  assert r.dtype == e.dtype
@@ -57,7 +57,7 @@ def test_dataframe_quantile():
57
57
 
58
58
  # q = 0.3, axis = 1
59
59
  r = s.quantile(0.3, axis=1)
60
- e = raw.quantile(0.3, axis=1)
60
+ e = raw.quantile(0.3, numeric_only=True, axis=1)
61
61
  assert isinstance(r, Series)
62
62
  assert r.shape == e.shape
63
63
  assert r.dtype == e.dtype
@@ -65,7 +65,7 @@ def test_dataframe_quantile():
65
65
 
66
66
  # q = [0.3, 0.7], axis = 0
67
67
  r = s.quantile([0.3, 0.7])
68
- e = raw.quantile([0.3, 0.7])
68
+ e = raw.quantile([0.3, 0.7], numeric_only=True)
69
69
  assert isinstance(r, DataFrame)
70
70
  assert r.shape == e.shape
71
71
  pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -74,7 +74,7 @@ def test_dataframe_quantile():
74
74
 
75
75
  # q = [0.3, 0.7], axis = 1
76
76
  r = s.quantile([0.3, 0.7], axis=1)
77
- e = raw.quantile([0.3, 0.7], axis=1)
77
+ e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
78
78
  assert isinstance(r, DataFrame)
79
79
  assert r.shape == e.shape
80
80
  pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -13,12 +13,13 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import pandas as pd
16
+ import pytest
16
17
 
17
18
  from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
18
- from ..initializer import read_pandas
19
+ from ..initializer import DataFrame, Series, read_pandas
19
20
 
20
21
 
21
- def test_from_pandas():
22
+ def test_read_pandas():
22
23
  df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
23
24
  assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
24
25
 
@@ -27,3 +28,33 @@ def test_from_pandas():
27
28
 
28
29
  idx_data = pd.Index(["a", "b"])
29
30
  assert isinstance(read_pandas(idx_data), INDEX_TYPE)
31
+
32
+
33
def test_init_dataframe_from_maxframe_series():
    """Check building a DataFrame from a Series with explicit columns."""
    s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])

    df = DataFrame(s, index=s.index, columns=["col1"])

    assert isinstance(df, DATAFRAME_TYPE)
    # Compare as a list: `Index == list` is an element-wise comparison that
    # yields an array (and raises on a length mismatch) instead of a bool.
    assert list(df.dtypes.index) == ["col1"]

    # An empty column list and bare strings are expected to be rejected.
    with pytest.raises(ValueError):
        DataFrame(s, index=s.index, columns=[])

    with pytest.raises(ValueError):
        DataFrame(s, index=s.index, columns="col1")

    with pytest.raises(ValueError):
        DataFrame(s, index=s.index, columns="col2")
49
+
50
+
51
def test_init_dataframe_from_maxframe_dataframe():
    """Check building a DataFrame from another DataFrame with new columns."""
    source = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])

    renamed = DataFrame(source, index=source.index, columns=["col1", "col2"])

    assert isinstance(renamed, DATAFRAME_TYPE)
    assert list(renamed.dtypes.index) == ["col1", "col2"]

    # Three column names for a two-column source are expected to be rejected.
    with pytest.raises(ValueError):
        DataFrame(source, index=source.index, columns=["col1", "col2", "col3"])
@@ -0,0 +1,60 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pytest
17
+
18
+ from ...udf import MarkedFunction, with_python_requirements, with_resources
19
+ from ..utils import pack_func_args
20
+
21
+
22
@pytest.fixture
def df1():
    """Provide a small 2x2 integer frame with columns ``a`` and ``b``."""
    rows = [[1, 2], [3, 4]]
    return pd.DataFrame(rows, columns=["a", "b"])
25
+
26
+
27
def test_pack_function(df1):
    """Check that pack_func_args binds arguments and keeps function marks."""

    # A marked function without extra arguments stays a MarkedFunction.
    @with_resources("a.zip")
    def keep(df):
        return df

    packed = pack_func_args(df1, keep)
    assert packed(df1).equals(df1)
    assert isinstance(packed, MarkedFunction)
    assert packed.resources == ["a.zip"]

    # A positional argument is bound after the data argument.
    @with_python_requirements("numpy")
    def add(a, b):
        return a + b

    packed = pack_func_args(df1, add, 1)
    assert packed(df1).equals(df1 + 1)
    assert isinstance(packed, MarkedFunction)
    assert packed.pythonpacks[0].requirements == ("numpy",)

    # A plain callable can be packed as well.
    packed = pack_func_args(df1, np.sum)
    assert packed(df1).equals(np.sum(df1))

    # Stacked marks survive packing with several bound arguments.
    @with_resources("a.txt")
    @with_python_requirements("pandas")
    def times_add(df, param, times):
        return df * times + param

    packed = pack_func_args(df1, times_add, 5, 6)
    assert packed(df1).equals(df1 * 6 + 5)
    assert isinstance(packed, MarkedFunction)
    assert packed.resources == ["a.txt"]
    assert packed.pythonpacks[0].requirements == ("pandas",)
@@ -20,7 +20,7 @@ import operator
20
20
  import sys
21
21
  from contextlib import contextmanager
22
22
  from numbers import Integral
23
- from typing import Any, Callable, List
23
+ from typing import TYPE_CHECKING, Any, Callable, List
24
24
 
25
25
  import numpy as np
26
26
  import pandas as pd
@@ -30,6 +30,7 @@ from pandas.core.dtypes.inference import is_dict_like, is_list_like
30
30
 
31
31
  from ..core import Entity, ExecutableTuple
32
32
  from ..lib.mmh3 import hash as mmh_hash
33
+ from ..udf import MarkedFunction
33
34
  from ..utils import (
34
35
  ModulePlaceholder,
35
36
  is_full_slice,
@@ -44,6 +45,9 @@ try:
44
45
  except ImportError: # pragma: no cover
45
46
  pa = ModulePlaceholder("pyarrow")
46
47
 
48
+ if TYPE_CHECKING:
49
+ from .operators import DataFrameOperator
50
+
47
51
  cudf = lazy_import("cudf", rename="cudf")
48
52
  vineyard = lazy_import("vineyard")
49
53
  try:
@@ -263,12 +267,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
263
267
  return IndexValue(_index_value=_serialize_index(index_value))
264
268
 
265
269
 
266
- def gen_unknown_index_value(index_value, *args):
270
+ def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
271
+ """
272
+ Generate a new index value of the same kind as the given index_value and args, but without any data.
273
+
274
+ Parameters
275
+ ----------
276
+ index_value
277
+ Given index value.
278
+ args
279
+ Arguments for parse_index.
280
+ normalize_range_index
281
+ If normalize range index to normal index.
282
+
283
+ Returns
284
+ -------
285
+ Newly created index value without data (a range index only when the input is one and normalize_range_index is False).
286
+ """
267
287
  pd_index = index_value.to_pandas()
268
- if isinstance(pd_index, pd.RangeIndex):
269
- return parse_index(pd.RangeIndex(-1), *args)
288
+ if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
289
+ return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
270
290
  elif not isinstance(pd_index, pd.MultiIndex):
271
- return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
291
+ return parse_index(
292
+ pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
293
+ )
272
294
  else:
273
295
  i = pd.MultiIndex.from_arrays(
274
296
  [c[:0] for c in pd_index.levels], names=pd_index.names
@@ -1160,7 +1182,65 @@ def patch_sa_engine_execute():
1160
1182
  Engine.execute = execute
1161
1183
 
1162
1184
 
1163
- def pack_func_args(df, funcs, *args, **kwargs) -> Any:
1185
def bind_func_args_from_pos(func, args_bind_position, *bound_args, **bound_kwargs):
    """
    Create a new function with arguments bound from specified position.

    Parameters
    ----------
    func : callable
        Target function to be wrapped.
    args_bind_position : int
        Position at which the bound positional arguments are inserted
        (0-based), e.g. 0 inserts them before all call-time arguments and
        1 inserts them after the first call-time argument.
    *bound_args : tuple
        Positional arguments inserted at ``args_bind_position``.
    **bound_kwargs : dict
        Keyword arguments to be bound. Keyword arguments given at call time
        take precedence over bound ones with the same name.

    Returns
    -------
    callable
        Wrapped function with bound arguments.

    Examples
    --------
    >>> def func(x, y, z=0):
    ...     return x * y + z
    >>> f = bind_func_args_from_pos(func, 1, 10)  # bind from second position
    >>> f(5)  # equals func(5, 10)
    50

    Raises
    ------
    Exception
        Nothing is validated up front. Any exception raised while calling
        ``func`` is re-raised as the same type with a message naming the
        wrapped function, chained to the original. If that type cannot be
        constructed from a single message, the original exception is
        propagated unchanged.
    """

    @functools.wraps(func)
    def wrapper(*runtime_args, **runtime_kwargs):
        try:
            # Splice the bound positionals into the call-time positionals.
            all_args = (
                runtime_args[:args_bind_position]
                + bound_args
                + runtime_args[args_bind_position:]
            )
            all_kwargs = {**bound_kwargs, **runtime_kwargs}

            return func(*all_args, **all_kwargs)
        except Exception as e:
            # Callable objects may lack __name__; fall back to their repr so
            # the error path cannot fail with an AttributeError.
            func_name = getattr(func, "__name__", repr(func))
            try:
                # Enhance error message with context
                enhanced = type(e)(
                    f"Error calling {func_name} with bound arguments: {str(e)}"
                )
            except Exception:
                # The exception type does not accept a lone message; keep the
                # original error instead of masking it.
                enhanced = None
            if enhanced is None:
                raise
            raise enhanced from e

    return wrapper
1241
+
1242
+
1243
+ def pack_func_args(df, funcs, *args, args_bind_position=1, **kwargs) -> Any:
1164
1244
  """
1165
1245
  Pack the funcs with args and kwargs to avoid the ambiguity between other
1166
1246
  positional and keyword arguments. It will process the funcs by the following rule:
@@ -1189,6 +1269,9 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
1189
1269
  The DataFrame or Series object to test the function.
1190
1270
  funcs : function, str, list-like or dict-like
1191
1271
  Function to pack. It should have the same type with Dataframe.transform().
1272
+ args_bind_position: int
1273
+ Position to start binding arguments (0-based).
1274
+ e.g., n=0 binds from first arg, n=1 binds from second arg.
1192
1275
  *args :
1193
1276
  The positional arguments to func. If funcs contains many functions, each one
1194
1277
  should be able to accept *args.
@@ -1219,8 +1302,19 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
1219
1302
 
1220
1303
  f = get_callable_by_name(df, funcs) if isinstance(funcs, str) else funcs
1221
1304
 
1305
+ from ..udf import MarkedFunction
1306
+
1307
+ if isinstance(f, MarkedFunction):
1308
+ # for marked function, pack the inner function, and reset as mark function
1309
+ packed_func = f.copy()
1310
+ packed_func.func = bind_func_args_from_pos(
1311
+ f.func, args_bind_position, *args, **kwargs
1312
+ )
1313
+ else:
1314
+ packed_func = bind_func_args_from_pos(f, args_bind_position, *args, **kwargs)
1315
+
1222
1316
  # Callable
1223
- return functools.partial(f, *args, **kwargs)
1317
+ return packed_func
1224
1318
 
1225
1319
 
1226
1320
  def get_callable_by_name(df: Any, func_name: str) -> Callable:
@@ -1262,3 +1356,12 @@ def get_callable_by_name(df: Any, func_name: str) -> Callable:
1262
1356
  raise AttributeError(
1263
1357
  f"'{func_name}' is not a valid function for '{type(df).__name__}' object"
1264
1358
  )
1359
+
1360
+
1361
def copy_func_scheduling_hints(func, op: "DataFrameOperator") -> None:
    """
    Copy scheduling hints from a marked function onto an operator.

    Does nothing unless ``func`` is a ``MarkedFunction``. For such a
    function, ``expect_engine`` and ``expect_resources`` are each copied to
    the attribute of the same name on ``op`` when the value is truthy.
    """
    if isinstance(func, MarkedFunction):
        for hint_name in ("expect_engine", "expect_resources"):
            hint = getattr(func, hint_name)
            # Falsy hints are skipped so the operator keeps its own value.
            if hint:
                setattr(op, hint_name, hint)