maxframe 0.1.0b4__cp39-cp39-win32.whl → 1.0.0__cp39-cp39-win32.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. See the release details on the registry page for more information.

Files changed (214):
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp39-win32.pyd +0 -0
  3. maxframe/codegen.py +56 -5
  4. maxframe/config/config.py +78 -10
  5. maxframe/config/validators.py +42 -11
  6. maxframe/conftest.py +58 -14
  7. maxframe/core/__init__.py +2 -16
  8. maxframe/core/entity/__init__.py +1 -12
  9. maxframe/core/entity/executable.py +1 -1
  10. maxframe/core/entity/objects.py +46 -45
  11. maxframe/core/entity/output_types.py +0 -3
  12. maxframe/core/entity/tests/test_objects.py +43 -0
  13. maxframe/core/entity/tileables.py +5 -78
  14. maxframe/core/graph/__init__.py +2 -2
  15. maxframe/core/graph/builder/__init__.py +0 -1
  16. maxframe/core/graph/builder/base.py +5 -4
  17. maxframe/core/graph/builder/tileable.py +4 -4
  18. maxframe/core/graph/builder/utils.py +4 -8
  19. maxframe/core/graph/core.cp39-win32.pyd +0 -0
  20. maxframe/core/graph/core.pyx +4 -4
  21. maxframe/core/graph/entity.py +9 -33
  22. maxframe/core/operator/__init__.py +2 -9
  23. maxframe/core/operator/base.py +3 -5
  24. maxframe/core/operator/objects.py +0 -9
  25. maxframe/core/operator/utils.py +55 -0
  26. maxframe/dataframe/__init__.py +2 -1
  27. maxframe/dataframe/arithmetic/around.py +5 -17
  28. maxframe/dataframe/arithmetic/core.py +15 -7
  29. maxframe/dataframe/arithmetic/docstring.py +7 -33
  30. maxframe/dataframe/arithmetic/equal.py +4 -2
  31. maxframe/dataframe/arithmetic/greater.py +4 -2
  32. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  33. maxframe/dataframe/arithmetic/less.py +2 -2
  34. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  36. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  37. maxframe/dataframe/core.py +58 -12
  38. maxframe/dataframe/datasource/date_range.py +2 -2
  39. maxframe/dataframe/datasource/read_odps_query.py +120 -24
  40. maxframe/dataframe/datasource/read_odps_table.py +9 -4
  41. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  42. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  43. maxframe/dataframe/datastore/to_odps.py +28 -0
  44. maxframe/dataframe/extensions/__init__.py +5 -0
  45. maxframe/dataframe/extensions/flatjson.py +131 -0
  46. maxframe/dataframe/extensions/flatmap.py +317 -0
  47. maxframe/dataframe/extensions/reshuffle.py +1 -1
  48. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  49. maxframe/dataframe/groupby/core.py +1 -1
  50. maxframe/dataframe/groupby/cum.py +0 -1
  51. maxframe/dataframe/groupby/fill.py +4 -1
  52. maxframe/dataframe/groupby/getitem.py +6 -0
  53. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  54. maxframe/dataframe/groupby/transform.py +5 -1
  55. maxframe/dataframe/indexing/align.py +1 -1
  56. maxframe/dataframe/indexing/loc.py +6 -4
  57. maxframe/dataframe/indexing/rename.py +5 -28
  58. maxframe/dataframe/indexing/sample.py +0 -1
  59. maxframe/dataframe/indexing/set_index.py +68 -1
  60. maxframe/dataframe/initializer.py +11 -1
  61. maxframe/dataframe/merge/__init__.py +9 -1
  62. maxframe/dataframe/merge/concat.py +41 -31
  63. maxframe/dataframe/merge/merge.py +237 -3
  64. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  65. maxframe/dataframe/misc/__init__.py +4 -0
  66. maxframe/dataframe/misc/apply.py +6 -11
  67. maxframe/dataframe/misc/case_when.py +141 -0
  68. maxframe/dataframe/misc/describe.py +2 -2
  69. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  70. maxframe/dataframe/misc/eval.py +4 -0
  71. maxframe/dataframe/misc/memory_usage.py +2 -2
  72. maxframe/dataframe/misc/pct_change.py +1 -83
  73. maxframe/dataframe/misc/pivot_table.py +262 -0
  74. maxframe/dataframe/misc/tests/test_misc.py +93 -1
  75. maxframe/dataframe/misc/transform.py +1 -30
  76. maxframe/dataframe/misc/value_counts.py +4 -17
  77. maxframe/dataframe/missing/dropna.py +1 -1
  78. maxframe/dataframe/missing/fillna.py +5 -5
  79. maxframe/dataframe/operators.py +1 -17
  80. maxframe/dataframe/plotting/core.py +2 -2
  81. maxframe/dataframe/reduction/core.py +4 -3
  82. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  83. maxframe/dataframe/sort/sort_values.py +1 -11
  84. maxframe/dataframe/statistics/corr.py +3 -3
  85. maxframe/dataframe/statistics/quantile.py +13 -19
  86. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  87. maxframe/dataframe/tests/test_initializer.py +33 -2
  88. maxframe/dataframe/utils.py +33 -11
  89. maxframe/dataframe/window/expanding.py +5 -3
  90. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  91. maxframe/errors.py +13 -0
  92. maxframe/extension.py +12 -0
  93. maxframe/io/__init__.py +13 -0
  94. maxframe/io/objects/__init__.py +24 -0
  95. maxframe/io/objects/core.py +140 -0
  96. maxframe/io/objects/tensor.py +76 -0
  97. maxframe/io/objects/tests/__init__.py +13 -0
  98. maxframe/io/objects/tests/test_object_io.py +97 -0
  99. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  100. maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
  101. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  102. maxframe/io/odpsio/tableio.py +719 -0
  103. maxframe/io/odpsio/tests/__init__.py +13 -0
  104. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
  105. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  106. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  107. maxframe/io/odpsio/volumeio.py +63 -0
  108. maxframe/learn/contrib/__init__.py +3 -1
  109. maxframe/learn/contrib/graph/__init__.py +15 -0
  110. maxframe/learn/contrib/graph/connected_components.py +215 -0
  111. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  112. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  113. maxframe/learn/contrib/llm/__init__.py +16 -0
  114. maxframe/learn/contrib/llm/core.py +54 -0
  115. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  116. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  117. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  118. maxframe/learn/contrib/llm/text.py +42 -0
  119. maxframe/learn/contrib/utils.py +52 -0
  120. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  121. maxframe/learn/contrib/xgboost/classifier.py +110 -0
  122. maxframe/learn/contrib/xgboost/core.py +241 -0
  123. maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
  124. maxframe/learn/contrib/xgboost/predict.py +121 -0
  125. maxframe/learn/contrib/xgboost/regressor.py +71 -0
  126. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  127. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  128. maxframe/learn/contrib/xgboost/train.py +132 -0
  129. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  130. maxframe/learn/utils/__init__.py +15 -0
  131. maxframe/learn/utils/core.py +29 -0
  132. maxframe/lib/mmh3.cp39-win32.pyd +0 -0
  133. maxframe/lib/mmh3.pyi +43 -0
  134. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  135. maxframe/lib/wrapped_pickle.py +2 -1
  136. maxframe/opcodes.py +11 -0
  137. maxframe/protocol.py +154 -27
  138. maxframe/remote/core.py +4 -8
  139. maxframe/serialization/__init__.py +1 -0
  140. maxframe/serialization/core.cp39-win32.pyd +0 -0
  141. maxframe/serialization/core.pxd +3 -0
  142. maxframe/serialization/core.pyi +64 -0
  143. maxframe/serialization/core.pyx +67 -26
  144. maxframe/serialization/exception.py +1 -1
  145. maxframe/serialization/pandas.py +52 -17
  146. maxframe/serialization/serializables/core.py +180 -15
  147. maxframe/serialization/serializables/field_type.py +4 -1
  148. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  149. maxframe/serialization/tests/test_serial.py +2 -1
  150. maxframe/session.py +37 -2
  151. maxframe/tensor/__init__.py +81 -2
  152. maxframe/tensor/arithmetic/isclose.py +1 -0
  153. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  154. maxframe/tensor/core.py +5 -136
  155. maxframe/tensor/datasource/array.py +7 -2
  156. maxframe/tensor/datasource/full.py +1 -1
  157. maxframe/tensor/datasource/scalar.py +1 -1
  158. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  159. maxframe/tensor/indexing/flatnonzero.py +1 -1
  160. maxframe/tensor/indexing/getitem.py +2 -0
  161. maxframe/tensor/merge/__init__.py +2 -0
  162. maxframe/tensor/merge/concatenate.py +101 -0
  163. maxframe/tensor/merge/tests/test_merge.py +30 -1
  164. maxframe/tensor/merge/vstack.py +74 -0
  165. maxframe/tensor/{base → misc}/__init__.py +4 -0
  166. maxframe/tensor/misc/atleast_1d.py +72 -0
  167. maxframe/tensor/misc/atleast_2d.py +70 -0
  168. maxframe/tensor/misc/atleast_3d.py +85 -0
  169. maxframe/tensor/misc/tests/__init__.py +13 -0
  170. maxframe/tensor/{base → misc}/transpose.py +22 -18
  171. maxframe/tensor/misc/unique.py +205 -0
  172. maxframe/tensor/operators.py +1 -7
  173. maxframe/tensor/random/core.py +1 -1
  174. maxframe/tensor/reduction/count_nonzero.py +2 -1
  175. maxframe/tensor/reduction/mean.py +1 -0
  176. maxframe/tensor/reduction/nanmean.py +1 -0
  177. maxframe/tensor/reduction/nanvar.py +2 -0
  178. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  179. maxframe/tensor/reduction/var.py +2 -0
  180. maxframe/tensor/statistics/quantile.py +2 -2
  181. maxframe/tensor/utils.py +2 -22
  182. maxframe/tests/test_protocol.py +34 -0
  183. maxframe/tests/test_utils.py +0 -12
  184. maxframe/tests/utils.py +17 -2
  185. maxframe/typing_.py +4 -1
  186. maxframe/udf.py +62 -3
  187. maxframe/utils.py +112 -86
  188. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  189. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
  190. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  191. maxframe_client/__init__.py +0 -1
  192. maxframe_client/clients/framedriver.py +4 -1
  193. maxframe_client/fetcher.py +123 -54
  194. maxframe_client/session/consts.py +3 -0
  195. maxframe_client/session/graph.py +8 -2
  196. maxframe_client/session/odps.py +223 -40
  197. maxframe_client/session/task.py +108 -80
  198. maxframe_client/tests/test_fetcher.py +21 -3
  199. maxframe_client/tests/test_session.py +136 -8
  200. maxframe/core/entity/chunks.py +0 -68
  201. maxframe/core/entity/fuse.py +0 -73
  202. maxframe/core/graph/builder/chunk.py +0 -430
  203. maxframe/odpsio/tableio.py +0 -300
  204. maxframe/odpsio/volumeio.py +0 -95
  205. maxframe_client/clients/spe.py +0 -104
  206. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  207. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  208. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  209. /maxframe/tensor/{base → misc}/astype.py +0 -0
  210. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  211. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  212. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  213. /maxframe/tensor/{base → misc}/where.py +0 -0
  214. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -138,7 +138,7 @@ class DataFrameAlign(DataFrameOperator, DataFrameOperatorMixin):
138
138
  series_index = rhs.index_value.to_pandas()
139
139
  dtypes = lhs.dtypes.reindex(
140
140
  lhs.dtypes.index.join(series_index, how=self.join)
141
- ).fillna(np.dtype(np.float_))
141
+ ).fillna(np.dtype(float))
142
142
  l_shape[1] = r_size = len(dtypes)
143
143
  col_val = r_idx_val = parse_index(dtypes.index, store_data=True)
144
144
 
@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
25
25
  from ...serialization.serializables import AnyField, KeyField, ListField
26
26
  from ...tensor.datasource import asarray
27
27
  from ...tensor.utils import calc_sliced_size, filter_inputs
28
- from ...utils import is_full_slice, lazy_import
28
+ from ...utils import is_full_slice, lazy_import, pd_release_version
29
29
  from ..core import DATAFRAME_TYPE, IndexValue
30
30
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
31
31
  from ..utils import parse_index
32
32
  from .iloc import DataFrameIlocSetItem
33
33
 
34
34
  cudf = lazy_import("cudf")
35
+ with_slice_locs_kind = pd_release_version < (1, 4, 0)
35
36
 
36
37
 
37
38
  def process_loc_indexes(inp, indexes, fetch_index: bool = True):
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
210
211
  if axis == 1:
211
212
  param["dtypes"] = inp.dtypes
212
213
  elif input_index_value.has_value():
213
- start, end = pd_index.slice_locs(
214
- index.start, index.stop, index.step, kind="loc"
215
- )
214
+ kw = {}
215
+ if with_slice_locs_kind:
216
+ kw["kind"] = "loc"
217
+ start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
216
218
  slc = slice(start, end, index.step)
217
219
  size = calc_sliced_size(inp.shape[axis], slc)
218
220
  param["shape"] = size
@@ -17,7 +17,7 @@ import warnings
17
17
  from ... import opcodes
18
18
  from ...core import get_output_types
19
19
  from ...serialization.serializables import AnyField, StringField
20
- from ..core import SERIES_TYPE
20
+ from ..core import INDEX_TYPE, SERIES_TYPE
21
21
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
22
22
  from ..utils import build_df, build_series, parse_index, validate_axis
23
23
 
@@ -73,6 +73,8 @@ class DataFrameRename(DataFrameOperator, DataFrameOperatorMixin):
73
73
  params["index_value"] = parse_index(new_index)
74
74
  if df.ndim == 1:
75
75
  params["name"] = new_df.name
76
+ if isinstance(df, INDEX_TYPE):
77
+ params["names"] = new_df.names
76
78
  return self.new_tileable([df], **params)
77
79
 
78
80
 
@@ -246,6 +248,7 @@ def df_rename(
246
248
  )
247
249
 
248
250
 
251
+ # fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
249
252
  def series_rename(
250
253
  series,
251
254
  index=None,
@@ -303,11 +306,6 @@ def series_rename(
303
306
  1 2
304
307
  2 3
305
308
  Name: my_name, dtype: int64
306
- >>> s.rename(lambda x: x ** 2).execute() # function, changes labels.execute()
307
- 0 1
308
- 1 2
309
- 4 3
310
- dtype: int64
311
309
  >>> s.rename({1: 3, 2: 5}).execute() # mapping, changes labels.execute()
312
310
  0 1
313
311
  3 2
@@ -385,6 +383,7 @@ def index_rename(index, name, inplace=False):
385
383
  return ret
386
384
 
387
385
 
386
+ # fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
388
387
  def index_set_names(index, names, level=None, inplace=False):
389
388
  """
390
389
  Set Index or MultiIndex name.
@@ -419,28 +418,6 @@ def index_set_names(index, names, level=None, inplace=False):
419
418
  Int64Index([1, 2, 3, 4], dtype='int64')
420
419
  >>> idx.set_names('quarter').execute()
421
420
  Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
422
-
423
- >>> idx = md.MultiIndex.from_product([['python', 'cobra'],
424
- ... [2018, 2019]])
425
- >>> idx.execute()
426
- MultiIndex([('python', 2018),
427
- ('python', 2019),
428
- ( 'cobra', 2018),
429
- ( 'cobra', 2019)],
430
- )
431
- >>> idx.set_names(['kind', 'year'], inplace=True)
432
- >>> idx.execute()
433
- MultiIndex([('python', 2018),
434
- ('python', 2019),
435
- ( 'cobra', 2018),
436
- ( 'cobra', 2019)],
437
- names=['kind', 'year'])
438
- >>> idx.set_names('species', level=0).execute()
439
- MultiIndex([('python', 2018),
440
- ('python', 2019),
441
- ( 'cobra', 2018),
442
- ( 'cobra', 2019)],
443
- names=['species', 'year'])
444
421
  """
445
422
  op = DataFrameRename(
446
423
  index_mapper=names, level=level, output_types=get_output_types(index)
@@ -195,7 +195,6 @@ def sample(
195
195
  num_legs num_wings num_specimen_seen
196
196
  falcon 2 2 10
197
197
  fish 0 0 8
198
-
199
198
  """
200
199
  axis = validate_axis(axis or 0, df_or_series)
201
200
  if axis == 1:
@@ -31,7 +31,7 @@ class DataFrameSetIndex(DataFrameOperator, DataFrameOperatorMixin):
31
31
  super().__init__(_output_types=output_types, **kw)
32
32
 
33
33
  def __call__(self, df):
34
- new_df = build_empty_df(df.dtypes).set_index(
34
+ new_df = build_empty_df(df.dtypes, index=df.index_value.to_pandas()).set_index(
35
35
  keys=self.keys,
36
36
  drop=self.drop,
37
37
  append=self.append,
@@ -47,6 +47,73 @@ class DataFrameSetIndex(DataFrameOperator, DataFrameOperatorMixin):
47
47
 
48
48
 
49
49
  def set_index(df, keys, drop=True, append=False, inplace=False, verify_integrity=False):
50
+ # TODO add support for set index by series, index, mt.ndarray, etc.
51
+ """
52
+ Set the DataFrame index using existing columns.
53
+
54
+ Set the DataFrame index (row labels) using one or more existing
55
+ columns. The index can replace the existing index or expand on it.
56
+
57
+ Parameters
58
+ ----------
59
+ keys : label or array-like or list of labels
60
+ This parameter can be either a single column key, or a list containing column keys.
61
+ drop : bool, default True
62
+ Delete columns to be used as the new index.
63
+ append : bool, default False
64
+ Whether to append columns to existing index.
65
+ inplace : bool, default False
66
+ If True, modifies the DataFrame in place (do not create a new object).
67
+ verify_integrity : bool, default False
68
+ Check the new index for duplicates. Otherwise defer the check until
69
+ necessary. Setting to False will improve the performance of this
70
+ method.
71
+
72
+ Returns
73
+ -------
74
+ DataFrame or None
75
+ Changed row labels or None if ``inplace=True``.
76
+
77
+ See Also
78
+ --------
79
+ DataFrame.reset_index : Opposite of set_index.
80
+ DataFrame.reindex : Change to new indices or expand indices.
81
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
82
+
83
+ Examples
84
+ --------
85
+ >>> import maxframe.dataframe as md
86
+
87
+ >>> df = md.DataFrame({'month': [1, 4, 7, 10],
88
+ ... 'year': [2012, 2014, 2013, 2014],
89
+ ... 'sale': [55, 40, 84, 31]})
90
+ >>> df
91
+ month year sale
92
+ 0 1 2012 55
93
+ 1 4 2014 40
94
+ 2 7 2013 84
95
+ 3 10 2014 31
96
+
97
+ Set the index to become the 'month' column:
98
+
99
+ >>> df.set_index('month')
100
+ year sale
101
+ month
102
+ 1 2012 55
103
+ 4 2014 40
104
+ 7 2013 84
105
+ 10 2014 31
106
+
107
+ Create a MultiIndex using columns 'year' and 'month':
108
+
109
+ >>> df.set_index(['year', 'month'])
110
+ sale
111
+ year month
112
+ 2012 1 55
113
+ 2014 4 40
114
+ 2013 7 84
115
+ 2014 10 31
116
+ """
50
117
  op = DataFrameSetIndex(
51
118
  keys=keys,
52
119
  drop=drop,
@@ -15,6 +15,7 @@
15
15
  from typing import Union
16
16
 
17
17
  import pandas as pd
18
+ from pandas.api.types import is_list_like
18
19
  from pandas.core.dtypes.common import pandas_dtype
19
20
 
20
21
  from ..core import ENTITY_TYPE
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
61
62
  num_partitions=None,
62
63
  ):
63
64
  need_repart = False
65
+ if columns is not None and not is_list_like(columns):
66
+ raise ValueError("columns must be a list-like object")
64
67
  if isinstance(data, TENSOR_TYPE):
65
68
  if chunk_size is not None:
66
69
  data = data.rechunk(chunk_size)
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
69
72
  )
70
73
  need_repart = num_partitions is not None
71
74
  elif isinstance(data, SERIES_TYPE):
72
- df = data.to_frame()
75
+ if columns is not None and len(columns) != 1:
76
+ raise ValueError("columns' length must be 1 when data is Series")
77
+ col_name = columns[0] if columns else None
78
+ df = data.to_frame(name=col_name)
73
79
  need_repart = num_partitions is not None
74
80
  elif isinstance(data, DATAFRAME_TYPE):
75
81
  if not hasattr(data, "data"):
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
77
83
  df = _Frame(data)
78
84
  else:
79
85
  df = data
86
+ if columns is not None:
87
+ if len(df.columns) != len(columns):
88
+ raise ValueError("columns' length must be equal to the data's")
89
+ df.columns = columns
80
90
  need_repart = num_partitions is not None
81
91
  elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
82
92
  # data is a dict and some value is tensor
@@ -14,7 +14,15 @@
14
14
 
15
15
  from .append import DataFrameAppend, append
16
16
  from .concat import DataFrameConcat, concat
17
- from .merge import DataFrameMerge, DataFrameMergeAlign, join, merge
17
+ from .merge import (
18
+ DataFrameMerge,
19
+ DataFrameMergeAlign,
20
+ DistributedMapJoinHint,
21
+ MapJoinHint,
22
+ SkewJoinHint,
23
+ join,
24
+ merge,
25
+ )
18
26
 
19
27
 
20
28
  def _install():
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+ from typing import List, Union
14
15
 
15
16
  import pandas as pd
16
17
 
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
24
25
  StringField,
25
26
  )
26
27
  from ...utils import lazy_import
28
+ from ..core import DataFrame, Series
27
29
  from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
28
30
  from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
29
31
 
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
55
57
  return self.names
56
58
 
57
59
  @classmethod
58
- def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index):
59
- if isinstance(prev_index, pd.RangeIndex) and isinstance(
60
- cur_index, pd.RangeIndex
61
- ):
62
- # handle RangeIndex that append may generate huge amount of data
63
- # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
64
- # will generate a Int64Index full of data
65
- # for details see GH#1647
66
- prev_stop = prev_index.start + prev_index.size * prev_index.step
67
- cur_start = cur_index.start
68
- if prev_stop == cur_start and prev_index.step == cur_index.step:
69
- # continuous RangeIndex, still return RangeIndex
70
- return prev_index.append(cur_index)
71
- else:
72
- # otherwise, return an empty index
73
- return pd.Index([], dtype=prev_index.dtype)
74
- elif isinstance(prev_index, pd.RangeIndex):
75
- return pd.Index([], prev_index.dtype).append(cur_index)
76
- elif isinstance(cur_index, pd.RangeIndex):
77
- return prev_index.append(pd.Index([], cur_index.dtype))
78
- return prev_index.append(cur_index)
60
+ def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
61
+ concat_index = None
62
+ all_indexes_have_value = all(
63
+ input.index_value.has_value() for input in df_or_series_list
64
+ )
65
+
66
+ def _concat(prev_index: pd.Index, cur_index: pd.Index):
67
+ if prev_index is None:
68
+ return cur_index
69
+
70
+ if (
71
+ all_indexes_have_value
72
+ and isinstance(prev_index, pd.RangeIndex)
73
+ and isinstance(cur_index, pd.RangeIndex)
74
+ ):
75
+ # handle RangeIndex that append may generate huge amount of data
76
+ # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
77
+ # will generate a Int64Index full of data
78
+ # for details see GH#1647
79
+ prev_stop = prev_index.start + prev_index.size * prev_index.step
80
+ cur_start = cur_index.start
81
+ if prev_stop == cur_start and prev_index.step == cur_index.step:
82
+ # continuous RangeIndex, still return RangeIndex
83
+ return prev_index.append(cur_index)
84
+ else:
85
+ # otherwise, return an empty index
86
+ return pd.Index([], dtype=prev_index.dtype)
87
+ elif isinstance(prev_index, pd.RangeIndex):
88
+ return pd.Index([], prev_index.dtype).append(cur_index)
89
+ elif isinstance(cur_index, pd.RangeIndex):
90
+ return prev_index.append(pd.Index([], cur_index.dtype))
91
+ return prev_index.append(cur_index)
92
+
93
+ for input in df_or_series_list:
94
+ concat_index = _concat(concat_index, input.index_value.to_pandas())
95
+
96
+ return concat_index
79
97
 
80
98
  def _call_series(self, objs):
81
99
  if self.axis == 0:
82
100
  row_length = 0
83
- index = None
84
101
  for series in objs:
85
- if index is None:
86
- index = series.index_value.to_pandas()
87
- else:
88
- index = self._concat_index(index, series.index_value.to_pandas())
89
102
  row_length += series.shape[0]
90
103
  if self.ignore_index: # pragma: no cover
91
104
  index_value = parse_index(pd.RangeIndex(row_length))
92
105
  else:
106
+ index = self._concat_index(objs)
93
107
  index_value = parse_index(index, objs)
94
108
  obj_names = {obj.name for obj in objs}
95
109
  return self.new_series(
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
130
144
  def _call_dataframes(self, objs):
131
145
  if self.axis == 0:
132
146
  row_length = 0
133
- index = None
134
147
  empty_dfs = []
135
148
  for df in objs:
136
- if index is None:
137
- index = df.index_value.to_pandas()
138
- else:
139
- index = self._concat_index(index, df.index_value.to_pandas())
140
149
  row_length += df.shape[0]
141
150
  if df.ndim == 2:
142
151
  empty_dfs.append(build_empty_df(df.dtypes))
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
153
162
  if self.ignore_index: # pragma: no cover
154
163
  index_value = parse_index(pd.RangeIndex(row_length))
155
164
  else:
165
+ index = self._concat_index(objs)
156
166
  index_value = parse_index(index, objs)
157
167
 
158
168
  new_objs = []