maxframe 0.1.0b5__cp39-cp39-macosx_10_9_universal2.whl → 1.0.0__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-39-darwin.so +0 -0
- maxframe/codegen.py +10 -4
- maxframe/config/config.py +68 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-39-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +31 -7
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +117 -23
- maxframe/dataframe/datasource/read_odps_table.py +6 -3
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +33 -2
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +26 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +29 -46
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +29 -18
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +8 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-39-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +3 -3
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +106 -86
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +81 -74
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +194 -40
- maxframe_client/session/task.py +94 -39
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +109 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -138,7 +138,7 @@ class DataFrameAlign(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
138
138
|
series_index = rhs.index_value.to_pandas()
|
|
139
139
|
dtypes = lhs.dtypes.reindex(
|
|
140
140
|
lhs.dtypes.index.join(series_index, how=self.join)
|
|
141
|
-
).fillna(np.dtype(
|
|
141
|
+
).fillna(np.dtype(float))
|
|
142
142
|
l_shape[1] = r_size = len(dtypes)
|
|
143
143
|
col_val = r_idx_val = parse_index(dtypes.index, store_data=True)
|
|
144
144
|
|
|
@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
|
|
|
25
25
|
from ...serialization.serializables import AnyField, KeyField, ListField
|
|
26
26
|
from ...tensor.datasource import asarray
|
|
27
27
|
from ...tensor.utils import calc_sliced_size, filter_inputs
|
|
28
|
-
from ...utils import is_full_slice, lazy_import
|
|
28
|
+
from ...utils import is_full_slice, lazy_import, pd_release_version
|
|
29
29
|
from ..core import DATAFRAME_TYPE, IndexValue
|
|
30
30
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
31
31
|
from ..utils import parse_index
|
|
32
32
|
from .iloc import DataFrameIlocSetItem
|
|
33
33
|
|
|
34
34
|
cudf = lazy_import("cudf")
|
|
35
|
+
with_slice_locs_kind = pd_release_version < (1, 4, 0)
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
def process_loc_indexes(inp, indexes, fetch_index: bool = True):
|
|
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
210
211
|
if axis == 1:
|
|
211
212
|
param["dtypes"] = inp.dtypes
|
|
212
213
|
elif input_index_value.has_value():
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
214
|
+
kw = {}
|
|
215
|
+
if with_slice_locs_kind:
|
|
216
|
+
kw["kind"] = "loc"
|
|
217
|
+
start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
|
|
216
218
|
slc = slice(start, end, index.step)
|
|
217
219
|
size = calc_sliced_size(inp.shape[axis], slc)
|
|
218
220
|
param["shape"] = size
|
|
@@ -17,7 +17,7 @@ import warnings
|
|
|
17
17
|
from ... import opcodes
|
|
18
18
|
from ...core import get_output_types
|
|
19
19
|
from ...serialization.serializables import AnyField, StringField
|
|
20
|
-
from ..core import SERIES_TYPE
|
|
20
|
+
from ..core import INDEX_TYPE, SERIES_TYPE
|
|
21
21
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
22
22
|
from ..utils import build_df, build_series, parse_index, validate_axis
|
|
23
23
|
|
|
@@ -73,6 +73,8 @@ class DataFrameRename(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
73
73
|
params["index_value"] = parse_index(new_index)
|
|
74
74
|
if df.ndim == 1:
|
|
75
75
|
params["name"] = new_df.name
|
|
76
|
+
if isinstance(df, INDEX_TYPE):
|
|
77
|
+
params["names"] = new_df.names
|
|
76
78
|
return self.new_tileable([df], **params)
|
|
77
79
|
|
|
78
80
|
|
|
@@ -246,6 +248,7 @@ def df_rename(
|
|
|
246
248
|
)
|
|
247
249
|
|
|
248
250
|
|
|
251
|
+
# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
|
|
249
252
|
def series_rename(
|
|
250
253
|
series,
|
|
251
254
|
index=None,
|
|
@@ -303,11 +306,6 @@ def series_rename(
|
|
|
303
306
|
1 2
|
|
304
307
|
2 3
|
|
305
308
|
Name: my_name, dtype: int64
|
|
306
|
-
>>> s.rename(lambda x: x ** 2).execute() # function, changes labels.execute()
|
|
307
|
-
0 1
|
|
308
|
-
1 2
|
|
309
|
-
4 3
|
|
310
|
-
dtype: int64
|
|
311
309
|
>>> s.rename({1: 3, 2: 5}).execute() # mapping, changes labels.execute()
|
|
312
310
|
0 1
|
|
313
311
|
3 2
|
|
@@ -385,6 +383,7 @@ def index_rename(index, name, inplace=False):
|
|
|
385
383
|
return ret
|
|
386
384
|
|
|
387
385
|
|
|
386
|
+
# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
|
|
388
387
|
def index_set_names(index, names, level=None, inplace=False):
|
|
389
388
|
"""
|
|
390
389
|
Set Index or MultiIndex name.
|
|
@@ -419,28 +418,6 @@ def index_set_names(index, names, level=None, inplace=False):
|
|
|
419
418
|
Int64Index([1, 2, 3, 4], dtype='int64')
|
|
420
419
|
>>> idx.set_names('quarter').execute()
|
|
421
420
|
Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
|
|
422
|
-
|
|
423
|
-
>>> idx = md.MultiIndex.from_product([['python', 'cobra'],
|
|
424
|
-
... [2018, 2019]])
|
|
425
|
-
>>> idx.execute()
|
|
426
|
-
MultiIndex([('python', 2018),
|
|
427
|
-
('python', 2019),
|
|
428
|
-
( 'cobra', 2018),
|
|
429
|
-
( 'cobra', 2019)],
|
|
430
|
-
)
|
|
431
|
-
>>> idx.set_names(['kind', 'year'], inplace=True)
|
|
432
|
-
>>> idx.execute()
|
|
433
|
-
MultiIndex([('python', 2018),
|
|
434
|
-
('python', 2019),
|
|
435
|
-
( 'cobra', 2018),
|
|
436
|
-
( 'cobra', 2019)],
|
|
437
|
-
names=['kind', 'year'])
|
|
438
|
-
>>> idx.set_names('species', level=0).execute()
|
|
439
|
-
MultiIndex([('python', 2018),
|
|
440
|
-
('python', 2019),
|
|
441
|
-
( 'cobra', 2018),
|
|
442
|
-
( 'cobra', 2019)],
|
|
443
|
-
names=['species', 'year'])
|
|
444
421
|
"""
|
|
445
422
|
op = DataFrameRename(
|
|
446
423
|
index_mapper=names, level=level, output_types=get_output_types(index)
|
|
@@ -31,7 +31,7 @@ class DataFrameSetIndex(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
31
31
|
super().__init__(_output_types=output_types, **kw)
|
|
32
32
|
|
|
33
33
|
def __call__(self, df):
|
|
34
|
-
new_df = build_empty_df(df.dtypes).set_index(
|
|
34
|
+
new_df = build_empty_df(df.dtypes, index=df.index_value.to_pandas()).set_index(
|
|
35
35
|
keys=self.keys,
|
|
36
36
|
drop=self.drop,
|
|
37
37
|
append=self.append,
|
|
@@ -47,6 +47,73 @@ class DataFrameSetIndex(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
47
47
|
|
|
48
48
|
|
|
49
49
|
def set_index(df, keys, drop=True, append=False, inplace=False, verify_integrity=False):
|
|
50
|
+
# TODO add support for set index by series, index, mt.ndarray, etc.
|
|
51
|
+
"""
|
|
52
|
+
Set the DataFrame index using existing columns.
|
|
53
|
+
|
|
54
|
+
Set the DataFrame index (row labels) using one or more existing
|
|
55
|
+
columns. The index can replace the existing index or expand on it.
|
|
56
|
+
|
|
57
|
+
Parameters
|
|
58
|
+
----------
|
|
59
|
+
keys : label or array-like or list of labels
|
|
60
|
+
This parameter can be either a single column key, or a list containing column keys.
|
|
61
|
+
drop : bool, default True
|
|
62
|
+
Delete columns to be used as the new index.
|
|
63
|
+
append : bool, default False
|
|
64
|
+
Whether to append columns to existing index.
|
|
65
|
+
inplace : bool, default False
|
|
66
|
+
If True, modifies the DataFrame in place (do not create a new object).
|
|
67
|
+
verify_integrity : bool, default False
|
|
68
|
+
Check the new index for duplicates. Otherwise defer the check until
|
|
69
|
+
necessary. Setting to False will improve the performance of this
|
|
70
|
+
method.
|
|
71
|
+
|
|
72
|
+
Returns
|
|
73
|
+
-------
|
|
74
|
+
DataFrame or None
|
|
75
|
+
Changed row labels or None if ``inplace=True``.
|
|
76
|
+
|
|
77
|
+
See Also
|
|
78
|
+
--------
|
|
79
|
+
DataFrame.reset_index : Opposite of set_index.
|
|
80
|
+
DataFrame.reindex : Change to new indices or expand indices.
|
|
81
|
+
DataFrame.reindex_like : Change to same indices as other DataFrame.
|
|
82
|
+
|
|
83
|
+
Examples
|
|
84
|
+
--------
|
|
85
|
+
>>> import maxframe.dataframe as md
|
|
86
|
+
|
|
87
|
+
>>> df = md.DataFrame({'month': [1, 4, 7, 10],
|
|
88
|
+
... 'year': [2012, 2014, 2013, 2014],
|
|
89
|
+
... 'sale': [55, 40, 84, 31]})
|
|
90
|
+
>>> df
|
|
91
|
+
month year sale
|
|
92
|
+
0 1 2012 55
|
|
93
|
+
1 4 2014 40
|
|
94
|
+
2 7 2013 84
|
|
95
|
+
3 10 2014 31
|
|
96
|
+
|
|
97
|
+
Set the index to become the 'month' column:
|
|
98
|
+
|
|
99
|
+
>>> df.set_index('month')
|
|
100
|
+
year sale
|
|
101
|
+
month
|
|
102
|
+
1 2012 55
|
|
103
|
+
4 2014 40
|
|
104
|
+
7 2013 84
|
|
105
|
+
10 2014 31
|
|
106
|
+
|
|
107
|
+
Create a MultiIndex using columns 'year' and 'month':
|
|
108
|
+
|
|
109
|
+
>>> df.set_index(['year', 'month'])
|
|
110
|
+
sale
|
|
111
|
+
year month
|
|
112
|
+
2012 1 55
|
|
113
|
+
2014 4 40
|
|
114
|
+
2013 7 84
|
|
115
|
+
2014 10 31
|
|
116
|
+
"""
|
|
50
117
|
op = DataFrameSetIndex(
|
|
51
118
|
keys=keys,
|
|
52
119
|
drop=drop,
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
from typing import Union
|
|
16
16
|
|
|
17
17
|
import pandas as pd
|
|
18
|
+
from pandas.api.types import is_list_like
|
|
18
19
|
from pandas.core.dtypes.common import pandas_dtype
|
|
19
20
|
|
|
20
21
|
from ..core import ENTITY_TYPE
|
|
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
61
62
|
num_partitions=None,
|
|
62
63
|
):
|
|
63
64
|
need_repart = False
|
|
65
|
+
if columns is not None and not is_list_like(columns):
|
|
66
|
+
raise ValueError("columns must be a list-like object")
|
|
64
67
|
if isinstance(data, TENSOR_TYPE):
|
|
65
68
|
if chunk_size is not None:
|
|
66
69
|
data = data.rechunk(chunk_size)
|
|
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
69
72
|
)
|
|
70
73
|
need_repart = num_partitions is not None
|
|
71
74
|
elif isinstance(data, SERIES_TYPE):
|
|
72
|
-
|
|
75
|
+
if columns is not None and len(columns) != 1:
|
|
76
|
+
raise ValueError("columns' length must be 1 when data is Series")
|
|
77
|
+
col_name = columns[0] if columns else None
|
|
78
|
+
df = data.to_frame(name=col_name)
|
|
73
79
|
need_repart = num_partitions is not None
|
|
74
80
|
elif isinstance(data, DATAFRAME_TYPE):
|
|
75
81
|
if not hasattr(data, "data"):
|
|
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
77
83
|
df = _Frame(data)
|
|
78
84
|
else:
|
|
79
85
|
df = data
|
|
86
|
+
if columns is not None:
|
|
87
|
+
if len(df.columns) != len(columns):
|
|
88
|
+
raise ValueError("columns' length must be equal to the data's")
|
|
89
|
+
df.columns = columns
|
|
80
90
|
need_repart = num_partitions is not None
|
|
81
91
|
elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
|
|
82
92
|
# data is a dict and some value is tensor
|
|
@@ -14,7 +14,15 @@
|
|
|
14
14
|
|
|
15
15
|
from .append import DataFrameAppend, append
|
|
16
16
|
from .concat import DataFrameConcat, concat
|
|
17
|
-
from .merge import
|
|
17
|
+
from .merge import (
|
|
18
|
+
DataFrameMerge,
|
|
19
|
+
DataFrameMergeAlign,
|
|
20
|
+
DistributedMapJoinHint,
|
|
21
|
+
MapJoinHint,
|
|
22
|
+
SkewJoinHint,
|
|
23
|
+
join,
|
|
24
|
+
merge,
|
|
25
|
+
)
|
|
18
26
|
|
|
19
27
|
|
|
20
28
|
def _install():
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
from typing import List, Union
|
|
14
15
|
|
|
15
16
|
import pandas as pd
|
|
16
17
|
|
|
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
|
|
|
24
25
|
StringField,
|
|
25
26
|
)
|
|
26
27
|
from ...utils import lazy_import
|
|
28
|
+
from ..core import DataFrame, Series
|
|
27
29
|
from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
|
|
28
30
|
from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
|
|
29
31
|
|
|
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
55
57
|
return self.names
|
|
56
58
|
|
|
57
59
|
@classmethod
|
|
58
|
-
def _concat_index(cls,
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
if
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
60
|
+
def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
|
|
61
|
+
concat_index = None
|
|
62
|
+
all_indexes_have_value = all(
|
|
63
|
+
input.index_value.has_value() for input in df_or_series_list
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
def _concat(prev_index: pd.Index, cur_index: pd.Index):
|
|
67
|
+
if prev_index is None:
|
|
68
|
+
return cur_index
|
|
69
|
+
|
|
70
|
+
if (
|
|
71
|
+
all_indexes_have_value
|
|
72
|
+
and isinstance(prev_index, pd.RangeIndex)
|
|
73
|
+
and isinstance(cur_index, pd.RangeIndex)
|
|
74
|
+
):
|
|
75
|
+
# handle RangeIndex that append may generate huge amount of data
|
|
76
|
+
# e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
|
|
77
|
+
# will generate a Int64Index full of data
|
|
78
|
+
# for details see GH#1647
|
|
79
|
+
prev_stop = prev_index.start + prev_index.size * prev_index.step
|
|
80
|
+
cur_start = cur_index.start
|
|
81
|
+
if prev_stop == cur_start and prev_index.step == cur_index.step:
|
|
82
|
+
# continuous RangeIndex, still return RangeIndex
|
|
83
|
+
return prev_index.append(cur_index)
|
|
84
|
+
else:
|
|
85
|
+
# otherwise, return an empty index
|
|
86
|
+
return pd.Index([], dtype=prev_index.dtype)
|
|
87
|
+
elif isinstance(prev_index, pd.RangeIndex):
|
|
88
|
+
return pd.Index([], prev_index.dtype).append(cur_index)
|
|
89
|
+
elif isinstance(cur_index, pd.RangeIndex):
|
|
90
|
+
return prev_index.append(pd.Index([], cur_index.dtype))
|
|
91
|
+
return prev_index.append(cur_index)
|
|
92
|
+
|
|
93
|
+
for input in df_or_series_list:
|
|
94
|
+
concat_index = _concat(concat_index, input.index_value.to_pandas())
|
|
95
|
+
|
|
96
|
+
return concat_index
|
|
79
97
|
|
|
80
98
|
def _call_series(self, objs):
|
|
81
99
|
if self.axis == 0:
|
|
82
100
|
row_length = 0
|
|
83
|
-
index = None
|
|
84
101
|
for series in objs:
|
|
85
|
-
if index is None:
|
|
86
|
-
index = series.index_value.to_pandas()
|
|
87
|
-
else:
|
|
88
|
-
index = self._concat_index(index, series.index_value.to_pandas())
|
|
89
102
|
row_length += series.shape[0]
|
|
90
103
|
if self.ignore_index: # pragma: no cover
|
|
91
104
|
index_value = parse_index(pd.RangeIndex(row_length))
|
|
92
105
|
else:
|
|
106
|
+
index = self._concat_index(objs)
|
|
93
107
|
index_value = parse_index(index, objs)
|
|
94
108
|
obj_names = {obj.name for obj in objs}
|
|
95
109
|
return self.new_series(
|
|
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
130
144
|
def _call_dataframes(self, objs):
|
|
131
145
|
if self.axis == 0:
|
|
132
146
|
row_length = 0
|
|
133
|
-
index = None
|
|
134
147
|
empty_dfs = []
|
|
135
148
|
for df in objs:
|
|
136
|
-
if index is None:
|
|
137
|
-
index = df.index_value.to_pandas()
|
|
138
|
-
else:
|
|
139
|
-
index = self._concat_index(index, df.index_value.to_pandas())
|
|
140
149
|
row_length += df.shape[0]
|
|
141
150
|
if df.ndim == 2:
|
|
142
151
|
empty_dfs.append(build_empty_df(df.dtypes))
|
|
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
153
162
|
if self.ignore_index: # pragma: no cover
|
|
154
163
|
index_value = parse_index(pd.RangeIndex(row_length))
|
|
155
164
|
else:
|
|
165
|
+
index = self._concat_index(objs)
|
|
156
166
|
index_value = parse_index(index, objs)
|
|
157
167
|
|
|
158
168
|
new_objs = []
|