maxframe 1.0.0rc3-cp310-cp310-win32.whl → 1.1.0-cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (112)
  1. maxframe/_utils.cp310-win32.pyd +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +16 -1
  4. maxframe/conftest.py +52 -14
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cp310-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/docstring.py +26 -2
  9. maxframe/dataframe/arithmetic/equal.py +4 -2
  10. maxframe/dataframe/arithmetic/greater.py +4 -2
  11. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  12. maxframe/dataframe/arithmetic/less.py +2 -2
  13. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  14. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  16. maxframe/dataframe/core.py +26 -2
  17. maxframe/dataframe/datasource/read_odps_query.py +116 -28
  18. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  19. maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
  20. maxframe/dataframe/datastore/to_odps.py +7 -0
  21. maxframe/dataframe/extensions/__init__.py +8 -0
  22. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  23. maxframe/dataframe/extensions/flatjson.py +131 -0
  24. maxframe/dataframe/extensions/flatmap.py +314 -0
  25. maxframe/dataframe/extensions/reshuffle.py +1 -1
  26. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  27. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  28. maxframe/dataframe/groupby/__init__.py +1 -0
  29. maxframe/dataframe/groupby/aggregation.py +1 -0
  30. maxframe/dataframe/groupby/apply.py +9 -1
  31. maxframe/dataframe/groupby/core.py +1 -1
  32. maxframe/dataframe/groupby/fill.py +4 -1
  33. maxframe/dataframe/groupby/getitem.py +6 -0
  34. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  35. maxframe/dataframe/groupby/transform.py +8 -2
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/loc.py +6 -4
  38. maxframe/dataframe/indexing/rename.py +11 -0
  39. maxframe/dataframe/initializer.py +11 -1
  40. maxframe/dataframe/merge/__init__.py +9 -1
  41. maxframe/dataframe/merge/concat.py +41 -31
  42. maxframe/dataframe/merge/merge.py +1 -1
  43. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  44. maxframe/dataframe/misc/apply.py +3 -0
  45. maxframe/dataframe/misc/drop_duplicates.py +23 -2
  46. maxframe/dataframe/misc/map.py +3 -1
  47. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  48. maxframe/dataframe/misc/transform.py +22 -13
  49. maxframe/dataframe/reduction/__init__.py +3 -0
  50. maxframe/dataframe/reduction/aggregation.py +1 -0
  51. maxframe/dataframe/reduction/median.py +56 -0
  52. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  53. maxframe/dataframe/statistics/quantile.py +8 -2
  54. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  55. maxframe/dataframe/tests/test_initializer.py +33 -2
  56. maxframe/dataframe/tests/test_utils.py +60 -0
  57. maxframe/dataframe/utils.py +110 -7
  58. maxframe/dataframe/window/expanding.py +5 -3
  59. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  60. maxframe/io/objects/tests/test_object_io.py +39 -12
  61. maxframe/io/odpsio/arrow.py +30 -2
  62. maxframe/io/odpsio/schema.py +28 -8
  63. maxframe/io/odpsio/tableio.py +55 -133
  64. maxframe/io/odpsio/tests/test_schema.py +40 -4
  65. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  66. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  67. maxframe/io/odpsio/volumeio.py +36 -6
  68. maxframe/learn/contrib/__init__.py +3 -1
  69. maxframe/learn/contrib/graph/__init__.py +15 -0
  70. maxframe/learn/contrib/graph/connected_components.py +215 -0
  71. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  73. maxframe/learn/contrib/llm/__init__.py +16 -0
  74. maxframe/learn/contrib/llm/core.py +54 -0
  75. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  76. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  77. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  78. maxframe/learn/contrib/llm/text.py +42 -0
  79. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  80. maxframe/learn/contrib/xgboost/predict.py +8 -39
  81. maxframe/learn/contrib/xgboost/train.py +4 -3
  82. maxframe/lib/mmh3.cp310-win32.pyd +0 -0
  83. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  84. maxframe/opcodes.py +10 -1
  85. maxframe/protocol.py +6 -1
  86. maxframe/serialization/core.cp310-win32.pyd +0 -0
  87. maxframe/serialization/core.pyx +13 -1
  88. maxframe/serialization/pandas.py +50 -20
  89. maxframe/serialization/serializables/core.py +24 -5
  90. maxframe/serialization/serializables/field_type.py +4 -1
  91. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  92. maxframe/serialization/tests/test_serial.py +2 -1
  93. maxframe/session.py +9 -2
  94. maxframe/tensor/__init__.py +19 -7
  95. maxframe/tensor/indexing/getitem.py +2 -0
  96. maxframe/tensor/merge/concatenate.py +23 -20
  97. maxframe/tensor/merge/vstack.py +5 -1
  98. maxframe/tensor/misc/transpose.py +1 -1
  99. maxframe/tests/utils.py +16 -0
  100. maxframe/udf.py +27 -0
  101. maxframe/utils.py +64 -14
  102. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  103. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
  104. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
  105. maxframe_client/clients/framedriver.py +4 -1
  106. maxframe_client/fetcher.py +28 -10
  107. maxframe_client/session/consts.py +3 -0
  108. maxframe_client/session/odps.py +104 -20
  109. maxframe_client/session/task.py +42 -26
  110. maxframe_client/session/tests/test_task.py +0 -4
  111. maxframe_client/tests/test_session.py +44 -12
  112. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/dataframe/extensions/tests/test_extensions.py
@@ -11,12 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import numpy as np
 import pandas as pd
 import pytest
 
 from .... import dataframe as md
-from ...core import IndexValue
+from ....tests.utils import assert_mf_index_dtype
+from ... import DataFrame
+from ...core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue
 from ..reshuffle import DataFrameReshuffle
 
 
@@ -31,8 +33,111 @@ def test_reshuffle():
 
     r = mdf.mf.reshuffle()
     assert isinstance(r.op, DataFrameReshuffle)
-    assert isinstance(r.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(r.index_value.value, np.int64)
 
     r = mdf.mf.reshuffle(ignore_index=True)
     assert isinstance(r.op, DataFrameReshuffle)
     assert isinstance(r.index_value.value, IndexValue.RangeIndex)
+
+
+@pytest.fixture
+def df1():
+    return DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+
+
+@pytest.fixture
+def df2():
+    return DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"])
+
+
+@pytest.fixture
+def df3():
+    return DataFrame(
+        [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        columns=["a", "b", "c"],
+        index=pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"]),
+    )
+
+
+def test_flatmap(df1, df2, df3):
+    def f(x, keys):
+        if x["a"] in keys:
+            yield [1, 0]
+            yield [0, 1]
+
+    apply_df = df1[["a"]].mf.flatmap(
+        f,
+        dtypes={"a": "int64", "b": "int64"},
+    )
+    assert apply_df.shape == (np.nan, 2)
+    assert df1.index_value.key != apply_df.index_value.key
+    assert isinstance(df1.index_value.to_pandas(), pd.RangeIndex)
+    assert not isinstance(apply_df.index_value.to_pandas(), pd.RangeIndex)
+    apply_df = df2[["a"]].mf.flatmap(
+        f,
+        dtypes=pd.Series(["int64", "int64"]),
+    )
+    assert apply_df.shape == (np.nan, 2)
+    assert df2.index_value.key != apply_df.index_value.key
+    with pytest.raises(TypeError):
+        apply_s = df3["a"].mf.flatmap(
+            f,
+        )
+    apply_s = df3["a"].mf.flatmap(
+        f,
+        dtype="int64",
+    )
+    assert apply_s.shape == (np.nan,)
+    assert df3.index_value.key != apply_s.index_value.key
+    assert df3.key != apply_s.index_value.key
+    apply_s = df3["a"].mf.flatmap(
+        f,
+        output_type="dataframe",
+        dtypes=["int64", "int64"],
+    )
+    assert apply_s.shape == (np.nan, 2)
+    assert df3.index_value.key != apply_s.index_value.key
+    assert df3.key != apply_s.index_value.key
+
+
+def test_flatjson():
+    s1 = md.Series(["{{'a': 1, 'b': false}}"], index=[1])
+    df1 = s1.mf.flatjson(
+        ["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
+    )
+    assert df1.shape == (1, 2)
+    assert df1.index_value.key == s1.index_value.key
+    assert isinstance(df1, DATAFRAME_TYPE)
+    assert list(df1.dtypes) == [np.dtype("int32"), np.dtype("bool")]
+    assert list(df1.dtypes.index) == ["a", "b"]
+
+    df2 = s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32"], index=["a"]))
+    assert df2.shape == (1, 1)
+    assert df2.index_value.key == s1.index_value.key
+    assert isinstance(df2, DATAFRAME_TYPE)
+    assert list(df2.dtypes) == [np.dtype("int32")]
+    assert list(df2.dtypes.index) == ["a"]
+
+    s2 = s1.mf.flatjson("$.a", dtype="int32", name="a")
+    assert s2.shape == (1,)
+    assert s2.index_value.key == s1.index_value.key
+    assert isinstance(s2, SERIES_TYPE)
+    assert s2.dtype == np.dtype("int32")
+    assert s2.name == "a"
+
+    with pytest.raises(ValueError):
+        s1.mf.flatjson([], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a", "$.b"], dtypes=pd.Series(["bool"], index=["b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(
+            ["$.a"],
+            dtype="int32",
+            dtypes=pd.Series(["int32"], index=["a"]),
+        )
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a"])
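The tests above outline the two new extension methods: mf.flatmap maps each row through a generator that may yield any number of rows (so the output row count stays unknown, np.nan, until execution), and mf.flatjson extracts JSON path expressions from a string series into typed columns. A minimal usage sketch derived from these tests; the data and column names are illustrative, and execution assumes a configured MaxFrame session:

    import pandas as pd
    import maxframe.dataframe as md

    df = md.DataFrame({"a": [1, 2, 3]})

    def explode(row):
        # yield zero or more output rows per input row
        yield [row["a"], 0]
        yield [0, row["a"]]

    expanded = df.mf.flatmap(explode, dtypes={"x": "int64", "y": "int64"})

    s = md.Series(['{"a": 1, "b": false}'])
    flat = s.mf.flatjson(
        ["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
    )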
maxframe/dataframe/groupby/__init__.py
@@ -55,6 +55,7 @@ def _install():
         setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw))
         setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw))
         setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw))
+        setattr(cls, "median", lambda groupby, **kw: agg(groupby, "median", **kw))
 
         setattr(cls, "apply", groupby_apply)
         setattr(cls, "transform", groupby_transform)
maxframe/dataframe/groupby/aggregation.py
@@ -79,6 +79,7 @@ _agg_functions = {
     "kurt": lambda x, bias=False: x.kurt(bias=bias),
     "kurtosis": lambda x, bias=False: x.kurtosis(bias=bias),
     "nunique": lambda x: x.nunique(),
+    "median": lambda x: x.median(),
 }
 _series_col_name = "col_name"
 
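Together, these two hunks wire up groupby median end to end: the accessor entry dispatches to the shared agg() path, and the _agg_functions table tells the aggregation layer how to compute it. A minimal sketch of the resulting call, with illustrative data:

    import maxframe.dataframe as md

    df = md.DataFrame({"key": ["a", "a", "b"], "val": [1, 3, 5]})
    med = df.groupby("key").median()  # routes through agg(groupby, "median")
    med.execute()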
maxframe/dataframe/groupby/apply.py
@@ -28,7 +28,13 @@ from ...serialization.serializables import (
 )
 from ...utils import get_func_token, quiet_stdio, tokenize
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import make_dtype, make_dtypes, parse_index, validate_output_types
+from ..utils import (
+    copy_func_scheduling_hints,
+    make_dtype,
+    make_dtypes,
+    parse_index,
+    validate_output_types,
+)
 
 
 class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin):
@@ -56,6 +62,8 @@ class GroupByApply(
 
     def __init__(self, output_types=None, **kw):
         super().__init__(_output_types=output_types, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
 
     def _update_key(self):
         values = [v for v in self._values_ if v is not self.func] + [
maxframe/dataframe/groupby/core.py
@@ -28,7 +28,7 @@ from ..utils import build_df, build_series, parse_index
 
 cudf = lazy_import("cudf")
 
-_GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
+_GROUP_KEYS_NO_DEFAULT = pd_release_version[:2] == (1, 5)
 _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True
 
 
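The rewritten check narrows the flag from "pandas 1.5 or newer" to "pandas 1.5.x exactly": group_keys is treated as no_default only on the 1.5 line, where pandas deprecated the old default ahead of the 2.0 behavior change. Plain tuple comparison shows the difference:

    pd_release_version = (2, 1, 0)       # e.g. pandas 2.1.0
    pd_release_version >= (1, 5, 0)      # True - also matches 2.x
    pd_release_version[:2] == (1, 5)     # False - only 1.5.x matches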
maxframe/dataframe/groupby/fill.py
@@ -35,12 +35,15 @@ class GroupByFillOperator(DataFrameOperator, DataFrameOperatorMixin):
         func_name = getattr(self, "_func_name")
 
         if func_name == "fillna":
+            kw = {}
+            if self.axis is not None:
+                kw["axis"] = self.axis
             result_df = mock_groupby.fillna(
                 value=self.value,
                 method=self.method,
-                axis=self.axis,
                 limit=self.limit,
                 downcast=self.downcast,
+                **kw,
             )
         else:
             result_df = getattr(mock_groupby, func_name)(limit=self.limit)
maxframe/dataframe/groupby/getitem.py
@@ -88,5 +88,11 @@ def df_groupby_getitem(df_groupby, item):
     if df_groupby.selection:
         raise IndexError(f"Column(s) {df_groupby.selection!r} already selected")
 
+    if (
+        isinstance(item, tuple)
+        and item not in df_groupby.dtypes
+        and item not in df_groupby.index.names
+    ):
+        item = list(item)
     op = GroupByIndex(selection=item, output_types=output_types)
     return op(df_groupby)
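This mirrors pandas, where gb[("a", "b")] is read as a single (scalar or MultiIndex) label while gb[["a", "b"]] selects two columns, and recent pandas versions deprecate the tuple spelling. A tuple that is neither a column label nor an index level name is therefore reinterpreted as a column list; illustrative data:

    import maxframe.dataframe as md

    df = md.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    # now equivalent to df.groupby("c")[["a", "b"]]
    grouped = df.groupby("c")[("a", "b")]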
maxframe/dataframe/groupby/tests/test_groupby.py
@@ -230,7 +230,7 @@ def test_groupby_transform():
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe
 
-    r = mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
+    r = mdf[list("abde")].groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
     assert r.shape == (np.nan, 6)
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe
maxframe/dataframe/groupby/transform.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
+
 import numpy as np
 import pandas as pd
 
@@ -20,7 +22,9 @@ from ...core import OutputType
 from ...serialization.serializables import AnyField, BoolField, DictField, TupleField
 from ...utils import quiet_stdio
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import parse_index
+from ..utils import copy_func_scheduling_hints, parse_index
+
+logger = logging.getLogger(__name__)
 
 
 class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
@@ -35,6 +39,8 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
 
     def __init__(self, output_types=None, **kw):
         super().__init__(_output_types=output_types, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
 
     def _infer_df_func_returns(self, in_groupby, dtypes, index):
         index_value, output_types, new_dtypes = None, None, None
@@ -65,7 +71,7 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
                 output_types = [OutputType.series]
                 new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype)
         except:  # noqa: E722  # nosec
-            pass
+            logger.info("Exception raised while inferring df_func", exc_info=True)
 
         self.output_types = output_types if not self.output_types else self.output_types
         dtypes = new_dtypes if dtypes is None else dtypes
maxframe/dataframe/indexing/add_prefix_suffix.py
@@ -51,7 +51,7 @@ def _get_prefix_suffix_docs(is_prefix: bool):
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> s = md.Series([1, 2, 3, 4])
+    >>> s = md.Series([1, 2, 3, 4])
     >>> s.execute()
     0    1
     1    2
maxframe/dataframe/indexing/loc.py
@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
 from ...serialization.serializables import AnyField, KeyField, ListField
 from ...tensor.datasource import asarray
 from ...tensor.utils import calc_sliced_size, filter_inputs
-from ...utils import is_full_slice, lazy_import
+from ...utils import is_full_slice, lazy_import, pd_release_version
 from ..core import DATAFRAME_TYPE, IndexValue
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
 from .iloc import DataFrameIlocSetItem
 
 cudf = lazy_import("cudf")
+with_slice_locs_kind = pd_release_version < (1, 4, 0)
 
 
 def process_loc_indexes(inp, indexes, fetch_index: bool = True):
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
             if axis == 1:
                 param["dtypes"] = inp.dtypes
             elif input_index_value.has_value():
-                start, end = pd_index.slice_locs(
-                    index.start, index.stop, index.step, kind="loc"
-                )
+                kw = {}
+                if with_slice_locs_kind:
+                    kw["kind"] = "loc"
+                start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
                 slc = slice(start, end, index.step)
                 size = calc_sliced_size(inp.shape[axis], slc)
                 param["shape"] = size
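pandas deprecated the kind argument of Index.slice_locs in 1.4 (it was later removed), so the keyword is now attached only on older versions via the module-level flag defined above. The pattern in isolation, assuming pd_release_version is importable from maxframe.utils as in the import hunk:

    import pandas as pd
    from maxframe.utils import pd_release_version

    kw = {"kind": "loc"} if pd_release_version < (1, 4, 0) else {}
    start, end = pd.Index([1, 2, 3, 4]).slice_locs(2, 3, None, **kw)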
maxframe/dataframe/indexing/rename.py
@@ -248,6 +248,7 @@ def df_rename(
     )
 
 
+# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
 def series_rename(
     series,
     index=None,
@@ -382,6 +383,7 @@ def index_rename(index, name, inplace=False):
     return ret
 
 
+# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
 def index_set_names(index, names, level=None, inplace=False):
     """
     Set Index or MultiIndex name.
@@ -407,6 +409,15 @@ def index_set_names(index, names, level=None, inplace=False):
     See Also
     --------
     Index.rename : Able to set new names without level.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> idx = md.Index([1, 2, 3, 4])
+    >>> idx.execute()
+    Int64Index([1, 2, 3, 4], dtype='int64')
+    >>> idx.set_names('quarter').execute()
+    Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
     """
     op = DataFrameRename(
         index_mapper=names, level=level, output_types=get_output_types(index)
maxframe/dataframe/initializer.py
@@ -15,6 +15,7 @@
 from typing import Union
 
 import pandas as pd
+from pandas.api.types import is_list_like
 from pandas.core.dtypes.common import pandas_dtype
 
 from ..core import ENTITY_TYPE
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
         num_partitions=None,
     ):
         need_repart = False
+        if columns is not None and not is_list_like(columns):
+            raise ValueError("columns must be a list-like object")
         if isinstance(data, TENSOR_TYPE):
             if chunk_size is not None:
                 data = data.rechunk(chunk_size)
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
             )
             need_repart = num_partitions is not None
         elif isinstance(data, SERIES_TYPE):
-            df = data.to_frame()
+            if columns is not None and len(columns) != 1:
+                raise ValueError("columns' length must be 1 when data is Series")
+            col_name = columns[0] if columns else None
+            df = data.to_frame(name=col_name)
             need_repart = num_partitions is not None
         elif isinstance(data, DATAFRAME_TYPE):
             if not hasattr(data, "data"):
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
                 df = _Frame(data)
             else:
                 df = data
+            if columns is not None:
+                if len(df.columns) != len(columns):
+                    raise ValueError("columns' length must be equal to the data's")
+                df.columns = columns
             need_repart = num_partitions is not None
         elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
             # data is a dict and some value is tensor
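The initializer now validates columns up front instead of silently ignoring it: the argument must be list-like, must have length 1 when wrapping a Series, and must match the column count when wrapping an existing DataFrame. Illustrative behavior under these checks:

    import maxframe.dataframe as md

    s = md.Series([1, 2, 3])
    md.DataFrame(s, columns=["value"])   # names the single column "value"
    md.DataFrame(s, columns=["a", "b"])  # ValueError: length must be 1
    md.DataFrame(s, columns="value")     # ValueError: a str is not list-like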
maxframe/dataframe/merge/__init__.py
@@ -14,7 +14,15 @@
 
 from .append import DataFrameAppend, append
 from .concat import DataFrameConcat, concat
-from .merge import DataFrameMerge, DataFrameMergeAlign, join, merge
+from .merge import (
+    DataFrameMerge,
+    DataFrameMergeAlign,
+    DistributedMapJoinHint,
+    MapJoinHint,
+    SkewJoinHint,
+    join,
+    merge,
+)
 
 
 def _install():
maxframe/dataframe/merge/concat.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import List, Union
 
 import pandas as pd
 
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...utils import lazy_import
+from ..core import DataFrame, Series
 from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
 
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
         return self.names
 
     @classmethod
-    def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index):
-        if isinstance(prev_index, pd.RangeIndex) and isinstance(
-            cur_index, pd.RangeIndex
-        ):
-            # handle RangeIndex that append may generate huge amount of data
-            # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
-            # will generate a Int64Index full of data
-            # for details see GH#1647
-            prev_stop = prev_index.start + prev_index.size * prev_index.step
-            cur_start = cur_index.start
-            if prev_stop == cur_start and prev_index.step == cur_index.step:
-                # continuous RangeIndex, still return RangeIndex
-                return prev_index.append(cur_index)
-            else:
-                # otherwise, return an empty index
-                return pd.Index([], dtype=prev_index.dtype)
-        elif isinstance(prev_index, pd.RangeIndex):
-            return pd.Index([], prev_index.dtype).append(cur_index)
-        elif isinstance(cur_index, pd.RangeIndex):
-            return prev_index.append(pd.Index([], cur_index.dtype))
-        return prev_index.append(cur_index)
+    def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
+        concat_index = None
+        all_indexes_have_value = all(
+            input.index_value.has_value() for input in df_or_series_list
+        )
+
+        def _concat(prev_index: pd.Index, cur_index: pd.Index):
+            if prev_index is None:
+                return cur_index
+
+            if (
+                all_indexes_have_value
+                and isinstance(prev_index, pd.RangeIndex)
+                and isinstance(cur_index, pd.RangeIndex)
+            ):
+                # handle RangeIndex that append may generate huge amount of data
+                # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
+                # will generate a Int64Index full of data
+                # for details see GH#1647
+                prev_stop = prev_index.start + prev_index.size * prev_index.step
+                cur_start = cur_index.start
+                if prev_stop == cur_start and prev_index.step == cur_index.step:
+                    # continuous RangeIndex, still return RangeIndex
+                    return prev_index.append(cur_index)
+                else:
+                    # otherwise, return an empty index
+                    return pd.Index([], dtype=prev_index.dtype)
+            elif isinstance(prev_index, pd.RangeIndex):
+                return pd.Index([], prev_index.dtype).append(cur_index)
+            elif isinstance(cur_index, pd.RangeIndex):
+                return prev_index.append(pd.Index([], cur_index.dtype))
+            return prev_index.append(cur_index)
+
+        for input in df_or_series_list:
+            concat_index = _concat(concat_index, input.index_value.to_pandas())
+
+        return concat_index
 
     def _call_series(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             for series in objs:
-                if index is None:
-                    index = series.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, series.index_value.to_pandas())
                 row_length += series.shape[0]
             if self.ignore_index:  # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
             obj_names = {obj.name for obj in objs}
             return self.new_series(
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
     def _call_dataframes(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             empty_dfs = []
             for df in objs:
-                if index is None:
-                    index = df.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, df.index_value.to_pandas())
                 row_length += df.shape[0]
                 if df.ndim == 2:
                     empty_dfs.append(build_empty_df(df.dtypes))
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             if self.ignore_index:  # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
 
             new_objs = []
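_concat_index now receives the whole input list, so it can first verify that every index has a concrete value before applying the RangeIndex fast path (GH#1647), which avoids materializing huge integer indexes. The fast path in plain pandas terms:

    import pandas as pd

    a = pd.RangeIndex(0, 3)   # 0, 1, 2
    b = pd.RangeIndex(3, 6)   # 3, 4, 5: prev_stop == cur_start, same step
    a.append(b)               # stays RangeIndex(start=0, stop=6), no data copied

    c = pd.RangeIndex(10, 13) # discontinuous: appending a and c would
                              # materialize values, so _concat returns an
                              # empty index of the same dtype instead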
maxframe/dataframe/merge/merge.py
@@ -353,7 +353,7 @@ def merge(
     df: Union[DataFrame, Series],
     right: Union[DataFrame, Series],
     how: str = "inner",
-    on: str = None,
+    on: Union[str, List[str]] = None,
     left_on: str = None,
     right_on: str = None,
     left_index: bool = False,
maxframe/dataframe/merge/tests/test_merge.py
@@ -16,6 +16,7 @@ import numpy as np
 import pandas as pd
 import pytest
 
+from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
@@ -161,7 +162,7 @@ def test_append():
     adf = mdf1.append(mdf2)
 
     assert adf.shape == (20, 4)
-    assert isinstance(adf.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(adf.index_value.value, np.int64)
 
     mdf1 = from_pandas(df1, chunk_size=3)
     mdf2 = from_pandas(df2, chunk_size=3)
@@ -181,6 +182,7 @@ def test_concat():
     r = concat([mdf1, mdf2], axis="index")
 
     assert r.shape == (20, 4)
+    assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
     pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
 
     df3 = pd.DataFrame(
maxframe/dataframe/misc/apply.py
@@ -35,6 +35,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtype,
     make_dtypes,
     pack_func_args,
@@ -79,6 +80,8 @@ class ApplyOperator(
         if output_type:
             kw["_output_types"] = [output_type]
         super().__init__(**kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
 
     def _update_key(self):
         values = [v for v in self._values_ if v is not self.func] + [
maxframe/dataframe/misc/drop_duplicates.py
@@ -43,7 +43,11 @@ class DataFrameDropDuplicates(DuplicateOperand):
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
-                input_params["index_value"], op.keep, op.subset, type(op).__name__
+                input_params["index_value"],
+                op.keep,
+                op.subset,
+                type(op).__name__,
+                normalize_range_index=True,
             )
         params["shape"] = self._get_shape(input_params["shape"], op)
         return params
@@ -104,7 +108,6 @@ def df_drop_duplicates(
 def series_drop_duplicates(
     series, keep="first", inplace=False, ignore_index=False, method="auto"
 ):
-    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
     """
     Return Series with duplicate values removed.
 
@@ -148,6 +151,24 @@
     5     hippo
     Name: animal, dtype: object
 
+    With the 'keep' parameter, the selection behaviour of duplicated values
+    can be changed. The value 'first' keeps the first occurrence for each
+    set of duplicated entries. The default value of keep is 'first'.
+    >>> s.drop_duplicates().execute()
+    0      lame
+    1       cow
+    3    beetle
+    5     hippo
+    Name: animal, dtype: object
+    The value 'last' for parameter 'keep' keeps the last occurrence for
+    each set of duplicated entries.
+    >>> s.drop_duplicates(keep='last').execute()
+    1       cow
+    3    beetle
+    4      lame
+    5     hippo
+    Name: animal, dtype: object
+
     The value ``False`` for parameter 'keep' discards all sets of
     duplicated entries. Setting the value of 'inplace' to ``True`` performs
     the operation inplace and returns ``None``.
maxframe/dataframe/misc/map.py
@@ -24,7 +24,7 @@ from ...serialization.serializables import AnyField, KeyField, StringField
 from ...utils import quiet_stdio
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_series
+from ..utils import build_series, copy_func_scheduling_hints
 
 
 class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
@@ -38,6 +38,8 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
         if not self.output_types:
             self.output_types = [OutputType.series]
+        if hasattr(self, "arg"):
+            copy_func_scheduling_hints(self.arg, self)
 
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
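The same two-line pattern now appears in the apply, transform, and map operators: if the operator carries a user function, copy_func_scheduling_hints transfers any scheduling hints attached to that function onto the operator itself. Only the (func, operator) call shape is visible in this diff; the sketch below is hypothetical, and the attribute names in it are placeholders for whatever maxframe.udf actually attaches to decorated functions:

    # hypothetical sketch, not the actual maxframe implementation
    def copy_func_scheduling_hints(func, op):
        for attr in ("expect_engine", "expect_resources"):  # assumed hint names
            if hasattr(func, attr):
                setattr(op, attr, getattr(func, attr))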