maxframe 1.0.0rc2__cp311-cp311-win_amd64.whl → 1.0.0rc4__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp311-win_amd64.pyd +0 -0
- maxframe/codegen.py +4 -2
- maxframe/config/config.py +28 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +56 -14
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/core.py +2 -0
- maxframe/dataframe/datasource/read_odps_query.py +67 -8
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
- maxframe/dataframe/datastore/to_odps.py +8 -1
- maxframe/dataframe/extensions/__init__.py +3 -0
- maxframe/dataframe/extensions/flatmap.py +326 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/misc/drop_duplicates.py +18 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +10 -8
- maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +2 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +27 -44
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -16
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +7 -16
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +49 -73
- maxframe-1.0.0rc4.dist-info/METADATA +104 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +33 -50
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +134 -27
- maxframe_client/session/task.py +58 -20
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +27 -3
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- maxframe-1.0.0rc2.dist-info/METADATA +0 -177
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
|
@@ -19,6 +19,7 @@ import numpy as np
|
|
|
19
19
|
import pandas as pd
|
|
20
20
|
import pytest
|
|
21
21
|
from odps import ODPS
|
|
22
|
+
from odps import types as odps_types
|
|
22
23
|
|
|
23
24
|
from .... import tensor as mt
|
|
24
25
|
from ....core import OutputType
|
|
@@ -35,7 +36,7 @@ from ..from_tensor import (
|
|
|
35
36
|
)
|
|
36
37
|
from ..index import from_pandas as from_pandas_index
|
|
37
38
|
from ..index import from_tileable
|
|
38
|
-
from ..read_odps_query import ColumnSchema, _resolve_task_sector
|
|
39
|
+
from ..read_odps_query import ColumnSchema, _parse_simple_explain, _resolve_task_sector
|
|
39
40
|
from ..series import from_pandas as from_pandas_series
|
|
40
41
|
|
|
41
42
|
ray = lazy_import("ray")
|
|
@@ -329,10 +330,6 @@ def test_from_odps_query():
|
|
|
329
330
|
read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
|
|
330
331
|
assert "instant query" in err_info.value.args[0]
|
|
331
332
|
|
|
332
|
-
with pytest.raises(ValueError) as err_info:
|
|
333
|
-
read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
|
|
334
|
-
assert "names" in err_info.value.args[0]
|
|
335
|
-
|
|
336
333
|
query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
|
|
337
334
|
df = read_odps_query(query1)
|
|
338
335
|
assert df.op.query == query1
|
|
@@ -401,7 +398,9 @@ def test_date_range():
|
|
|
401
398
|
|
|
402
399
|
|
|
403
400
|
def test_resolve_task_sector():
|
|
404
|
-
input_path = os.path.join(
|
|
401
|
+
input_path = os.path.join(
|
|
402
|
+
os.path.dirname(__file__), "test-data", "task-input-full.txt"
|
|
403
|
+
)
|
|
405
404
|
with open(input_path, "r") as f:
|
|
406
405
|
sector = f.read()
|
|
407
406
|
actual_sector = _resolve_task_sector("job0", sector)
|
|
@@ -413,3 +412,33 @@ def test_resolve_task_sector():
|
|
|
413
412
|
assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
|
|
414
413
|
assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
|
|
415
414
|
assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def test_resolve_task_odps2():
|
|
418
|
+
input_path = os.path.join(
|
|
419
|
+
os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
|
|
420
|
+
)
|
|
421
|
+
with open(input_path, "r") as f:
|
|
422
|
+
sector = f.read()
|
|
423
|
+
actual_sector = _resolve_task_sector("job0", sector)
|
|
424
|
+
|
|
425
|
+
assert actual_sector.job_name == "job0"
|
|
426
|
+
assert actual_sector.task_name == "M1"
|
|
427
|
+
assert actual_sector.output_target == "Screen"
|
|
428
|
+
assert len(actual_sector.schema) == 2
|
|
429
|
+
assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
|
|
430
|
+
assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def test_resolve_simple_explain():
|
|
434
|
+
input_path = os.path.join(
|
|
435
|
+
os.path.dirname(__file__), "test-data", "task-input-simple.txt"
|
|
436
|
+
)
|
|
437
|
+
with open(input_path, "r") as f:
|
|
438
|
+
sector = f.read()
|
|
439
|
+
|
|
440
|
+
schema = _parse_simple_explain(sector)
|
|
441
|
+
assert schema.columns[0].name == "memberid"
|
|
442
|
+
assert schema.columns[0].type == odps_types.string
|
|
443
|
+
assert schema.columns[1].name == "createdate"
|
|
444
|
+
assert schema.columns[1].type == odps_types.bigint
|
|
@@ -17,13 +17,14 @@
|
|
|
17
17
|
import logging
|
|
18
18
|
from typing import List, Optional, Union
|
|
19
19
|
|
|
20
|
+
from odps import ODPS
|
|
20
21
|
from odps.models import Table as ODPSTable
|
|
21
22
|
from odps.types import PartitionSpec
|
|
22
23
|
|
|
23
24
|
from ... import opcodes
|
|
24
25
|
from ...config import options
|
|
25
26
|
from ...core import OutputType
|
|
26
|
-
from ...odpsio import build_dataframe_table_meta
|
|
27
|
+
from ...io.odpsio import build_dataframe_table_meta
|
|
27
28
|
from ...serialization.serializables import (
|
|
28
29
|
BoolField,
|
|
29
30
|
FieldTypes,
|
|
@@ -136,8 +137,14 @@ def to_odps_table(
|
|
|
136
137
|
--------
|
|
137
138
|
|
|
138
139
|
"""
|
|
140
|
+
odps_entry = ODPS.from_global() or ODPS.from_environments()
|
|
139
141
|
if isinstance(table, ODPSTable):
|
|
140
142
|
table = table.full_table_name
|
|
143
|
+
elif options.session.enable_schema and "." not in table:
|
|
144
|
+
default_schema = (
|
|
145
|
+
options.session.default_schema or odps_entry.schema or "default"
|
|
146
|
+
)
|
|
147
|
+
table = default_schema + "." + table
|
|
141
148
|
|
|
142
149
|
if isinstance(index_label, str):
|
|
143
150
|
index_label = [index_label]
|
|
@@ -18,6 +18,7 @@ from .accessor import (
|
|
|
18
18
|
IndexMaxFrameAccessor,
|
|
19
19
|
SeriesMaxFrameAccessor,
|
|
20
20
|
)
|
|
21
|
+
from .flatmap import df_flatmap, series_flatmap
|
|
21
22
|
from .reshuffle import DataFrameReshuffle, df_reshuffle
|
|
22
23
|
|
|
23
24
|
|
|
@@ -25,6 +26,8 @@ def _install():
|
|
|
25
26
|
from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
|
|
26
27
|
|
|
27
28
|
DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
|
|
29
|
+
DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
|
|
30
|
+
SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
|
|
28
31
|
|
|
29
32
|
if DataFrameMaxFrameAccessor._api_count:
|
|
30
33
|
for t in DATAFRAME_TYPE:
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Callable
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from maxframe import opcodes
|
|
21
|
+
from maxframe.core import OutputType
|
|
22
|
+
from maxframe.dataframe.core import DataFrame, IndexValue
|
|
23
|
+
from maxframe.dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
|
|
24
|
+
from maxframe.dataframe.utils import make_dtypes, parse_index
|
|
25
|
+
from maxframe.serialization.serializables import (
|
|
26
|
+
BoolField,
|
|
27
|
+
DictField,
|
|
28
|
+
FunctionField,
|
|
29
|
+
TupleField,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
34
|
+
_op_type_ = opcodes.FLATMAP
|
|
35
|
+
|
|
36
|
+
func = FunctionField("func")
|
|
37
|
+
raw = BoolField("raw", default=False)
|
|
38
|
+
args = TupleField("args", default=())
|
|
39
|
+
kwargs = DictField("kwargs", default={})
|
|
40
|
+
|
|
41
|
+
def __init__(self, output_types=None, **kw):
|
|
42
|
+
super().__init__(_output_types=output_types, **kw)
|
|
43
|
+
|
|
44
|
+
@staticmethod
|
|
45
|
+
def _gen_flattening_index_value(index_value, *args) -> IndexValue:
|
|
46
|
+
pd_index = index_value.to_pandas()
|
|
47
|
+
if not isinstance(pd_index, pd.MultiIndex):
|
|
48
|
+
# for func return multi rows, will copy indexes
|
|
49
|
+
return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
|
|
50
|
+
# multi index will keep the same level and types
|
|
51
|
+
return parse_index(
|
|
52
|
+
pd.MultiIndex.from_arrays([c[:0] for c in pd_index.levels]), *args
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
|
|
56
|
+
dtypes = make_dtypes(dtypes)
|
|
57
|
+
index_value = self._gen_flattening_index_value(
|
|
58
|
+
df.index_value,
|
|
59
|
+
(df.key, df.index_value.key, self.func),
|
|
60
|
+
)
|
|
61
|
+
return self.new_dataframe(
|
|
62
|
+
[df],
|
|
63
|
+
shape=(np.nan, len(dtypes)),
|
|
64
|
+
index_value=index_value,
|
|
65
|
+
columns_value=parse_index(dtypes.index, store_data=True),
|
|
66
|
+
dtypes=dtypes,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
def _call_series_or_index(self, series, dtypes=None):
|
|
70
|
+
index_value = self._gen_flattening_index_value(
|
|
71
|
+
series.index_value,
|
|
72
|
+
(series.key, series.index_value.key, self.func),
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
if self.output_types[0] == OutputType.series:
|
|
76
|
+
name, dtype = dtypes
|
|
77
|
+
return self.new_series(
|
|
78
|
+
[series],
|
|
79
|
+
dtype=dtype,
|
|
80
|
+
shape=(np.nan,),
|
|
81
|
+
index_value=index_value,
|
|
82
|
+
name=name,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
dtypes = make_dtypes(dtypes)
|
|
86
|
+
columns_value = parse_index(dtypes.index, store_data=True)
|
|
87
|
+
return self.new_dataframe(
|
|
88
|
+
[series],
|
|
89
|
+
shape=(np.nan, len(dtypes)),
|
|
90
|
+
index_value=index_value,
|
|
91
|
+
columns_value=columns_value,
|
|
92
|
+
dtypes=dtypes,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
def __call__(
|
|
96
|
+
self,
|
|
97
|
+
df_or_series,
|
|
98
|
+
dtypes=None,
|
|
99
|
+
output_type=None,
|
|
100
|
+
):
|
|
101
|
+
if df_or_series.op.output_types[0] == OutputType.dataframe:
|
|
102
|
+
return self._call_dataframe(df_or_series, dtypes=dtypes)
|
|
103
|
+
else:
|
|
104
|
+
return self._call_series_or_index(df_or_series, dtypes=dtypes)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwargs):
|
|
108
|
+
"""
|
|
109
|
+
Apply the given function to each row and then flatten results. Use this method if your transformation returns
|
|
110
|
+
multiple rows for each input row.
|
|
111
|
+
|
|
112
|
+
This function applies a transformation to each row of the DataFrame, where the transformation can return zero
|
|
113
|
+
or multiple values, effectively flattening Python generators, list-like collections, and DataFrames.
|
|
114
|
+
|
|
115
|
+
Parameters
|
|
116
|
+
----------
|
|
117
|
+
dataframe : DataFrame
|
|
118
|
+
The DataFrame to which the function will be applied.
|
|
119
|
+
|
|
120
|
+
func : Callable
|
|
121
|
+
Function to apply to each row of the DataFrame. It should accept a Series (or an array if `raw=True`)
|
|
122
|
+
representing a row and return a list or iterable of values.
|
|
123
|
+
|
|
124
|
+
dtypes : Series, dict or list
|
|
125
|
+
Specify dtypes of returned DataFrame.
|
|
126
|
+
|
|
127
|
+
raw : bool, default False
|
|
128
|
+
Determines if the row is passed as a Series or as a numpy array:
|
|
129
|
+
|
|
130
|
+
* ``False`` : passes each row as a Series to the function.
|
|
131
|
+
* ``True`` : the passed function will receive numpy array objects instead.
|
|
132
|
+
|
|
133
|
+
args : tuple
|
|
134
|
+
Positional arguments to pass to `func`.
|
|
135
|
+
|
|
136
|
+
**kwargs
|
|
137
|
+
Additional keyword arguments to pass as keywords arguments to `func`.
|
|
138
|
+
|
|
139
|
+
Returns
|
|
140
|
+
-------
|
|
141
|
+
DataFrame
|
|
142
|
+
Return DataFrame with specified `dtypes`.
|
|
143
|
+
|
|
144
|
+
Notes
|
|
145
|
+
-----
|
|
146
|
+
The `func` must return an iterable of values for each input row. The index of the resulting DataFrame will be
|
|
147
|
+
repeated based on the number of output rows generated by `func`.
|
|
148
|
+
|
|
149
|
+
Examples
|
|
150
|
+
--------
|
|
151
|
+
>>> import numpy as np
|
|
152
|
+
>>> import maxframe.dataframe as md
|
|
153
|
+
>>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
|
154
|
+
>>> df.execute()
|
|
155
|
+
A B
|
|
156
|
+
0 1 4
|
|
157
|
+
1 2 5
|
|
158
|
+
2 3 6
|
|
159
|
+
|
|
160
|
+
Define a function that takes a number and returns a list of two numbers:
|
|
161
|
+
|
|
162
|
+
>>> def generate_values_array(row):
|
|
163
|
+
... return [row['A'] * 2, row['B'] * 3]
|
|
164
|
+
|
|
165
|
+
Define a function that takes a row and return two rows and two columns:
|
|
166
|
+
|
|
167
|
+
>>> def generate_values_in_generator(row):
|
|
168
|
+
... yield [row[0] * 2, row[1] * 4]
|
|
169
|
+
... yield [row[0] * 3, row[1] * 5]
|
|
170
|
+
|
|
171
|
+
Which equals to the following function return a dataframe:
|
|
172
|
+
|
|
173
|
+
>>> def generate_values_in_dataframe(row):
|
|
174
|
+
... return pd.DataFrame([[row[0] * 2, row[1] * 4], [row[0] * 3, row[1] * 5]])
|
|
175
|
+
|
|
176
|
+
Specify `dtypes` with a function which returns a DataFrame:
|
|
177
|
+
|
|
178
|
+
>>> df.mf.flatmap(generate_values_array, dtypes=pd.Series({'A': 'int'})).execute()
|
|
179
|
+
A
|
|
180
|
+
0 2
|
|
181
|
+
0 12
|
|
182
|
+
1 4
|
|
183
|
+
1 15
|
|
184
|
+
2 6
|
|
185
|
+
2 18
|
|
186
|
+
|
|
187
|
+
Specify raw=True to pass input row as array:
|
|
188
|
+
|
|
189
|
+
>>> df.mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}, raw=True).execute()
|
|
190
|
+
A B
|
|
191
|
+
0 2 16
|
|
192
|
+
0 3 20
|
|
193
|
+
1 4 20
|
|
194
|
+
1 6 25
|
|
195
|
+
2 6 24
|
|
196
|
+
2 9 30
|
|
197
|
+
"""
|
|
198
|
+
if dtypes is None or len(dtypes) == 0:
|
|
199
|
+
raise TypeError(
|
|
200
|
+
"Cannot determine {dtypes} by calculating with enumerate data, "
|
|
201
|
+
"please specify it as arguments"
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
if not isinstance(func, Callable):
|
|
205
|
+
raise TypeError("function must be a callable object")
|
|
206
|
+
|
|
207
|
+
output_types = [OutputType.dataframe]
|
|
208
|
+
op = DataFrameFlatMapOperator(
|
|
209
|
+
func=func, raw=raw, output_types=output_types, args=args, kwargs=kwargs
|
|
210
|
+
)
|
|
211
|
+
return op(
|
|
212
|
+
dataframe,
|
|
213
|
+
dtypes=dtypes,
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def series_flatmap(
|
|
218
|
+
series, func: Callable, dtypes=None, dtype=None, name=None, args=(), **kwargs
|
|
219
|
+
):
|
|
220
|
+
"""
|
|
221
|
+
Apply the given function to each row and then flatten results. Use this method if your transformation returns
|
|
222
|
+
multiple rows for each input row.
|
|
223
|
+
|
|
224
|
+
This function applies a transformation to each element of the Series, where the transformation can return zero
|
|
225
|
+
or multiple values, effectively flattening Python generator, list-liked collections and DataFrame.
|
|
226
|
+
|
|
227
|
+
Parameters
|
|
228
|
+
----------
|
|
229
|
+
series : Series
|
|
230
|
+
The series to which the function will be applied.
|
|
231
|
+
|
|
232
|
+
func : Callable
|
|
233
|
+
Function to apply to each element of the Series. It should accept a scalar value
|
|
234
|
+
(or an array if `raw=True`) and return a list or iterable of values.
|
|
235
|
+
|
|
236
|
+
dtypes : Series, default None
|
|
237
|
+
Specify dtypes of returned DataFrame. Can't work with dtype.
|
|
238
|
+
|
|
239
|
+
dtype : numpy.dtype, default None
|
|
240
|
+
Specify dtype of returned Series. Can't work with dtypes.
|
|
241
|
+
|
|
242
|
+
name : str, default None
|
|
243
|
+
Specify name of the returned Series.
|
|
244
|
+
|
|
245
|
+
args : tuple
|
|
246
|
+
Positional arguments to pass to `func`.
|
|
247
|
+
|
|
248
|
+
**kwargs
|
|
249
|
+
Additional keyword arguments to pass as keywords arguments to `func`.
|
|
250
|
+
|
|
251
|
+
Returns
|
|
252
|
+
-------
|
|
253
|
+
DataFrame or Series
|
|
254
|
+
Result of DataFrame when dtypes specified, else Series.
|
|
255
|
+
|
|
256
|
+
Notes
|
|
257
|
+
-----
|
|
258
|
+
The `func` must return an iterable of values for each input element. If `dtypes` is specified,
|
|
259
|
+
`flatmap` will return a DataFrame, if `dtype` and `name` is specified, a Series will be returned. The index of
|
|
260
|
+
the resulting DataFrame/Series will be repeated based on the number of output rows generated by `func`.
|
|
261
|
+
|
|
262
|
+
Examples
|
|
263
|
+
--------
|
|
264
|
+
>>> import numpy as np
|
|
265
|
+
>>> import maxframe.dataframe as md
|
|
266
|
+
>>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
|
267
|
+
>>> df.execute()
|
|
268
|
+
A B
|
|
269
|
+
0 1 4
|
|
270
|
+
1 2 5
|
|
271
|
+
2 3 6
|
|
272
|
+
|
|
273
|
+
Define a function that takes a number and returns a list of two numbers:
|
|
274
|
+
|
|
275
|
+
>>> def generate_values_array(x):
|
|
276
|
+
... return [x * 2, x * 3]
|
|
277
|
+
|
|
278
|
+
>>> def generate_values_in_generator(x):
|
|
279
|
+
... yield pd.Series([x * 2, x * 4])
|
|
280
|
+
... yield pd.Series([x * 3, x * 5])
|
|
281
|
+
|
|
282
|
+
Specify `dtype` with a function which returns list to return more than one elements as a Series:
|
|
283
|
+
|
|
284
|
+
>>> df['A'].mf.flatmap(generate_values_array, dtype="int", name="C").execute()
|
|
285
|
+
0 2
|
|
286
|
+
0 3
|
|
287
|
+
1 4
|
|
288
|
+
1 6
|
|
289
|
+
2 6
|
|
290
|
+
2 9
|
|
291
|
+
Name: C, dtype: int64
|
|
292
|
+
|
|
293
|
+
Specify `dtypes` to return multi columns as a DataFrame:
|
|
294
|
+
|
|
295
|
+
>>> df['A'].mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}).execute()
|
|
296
|
+
A B
|
|
297
|
+
0 2 4
|
|
298
|
+
0 3 5
|
|
299
|
+
1 4 8
|
|
300
|
+
1 6 10
|
|
301
|
+
2 6 12
|
|
302
|
+
2 9 15
|
|
303
|
+
"""
|
|
304
|
+
|
|
305
|
+
if dtypes and dtype:
|
|
306
|
+
raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
|
|
307
|
+
|
|
308
|
+
dtypes = (name, dtype) if dtype is not None else dtypes
|
|
309
|
+
if dtypes is None:
|
|
310
|
+
raise TypeError(
|
|
311
|
+
"Cannot determine {dtypes} or {dtype} by calculating with enumerate data, "
|
|
312
|
+
"please specify it as arguments"
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
if not isinstance(func, Callable):
|
|
316
|
+
raise TypeError("function must be a callable object")
|
|
317
|
+
|
|
318
|
+
output_type = OutputType.series if dtype is not None else OutputType.dataframe
|
|
319
|
+
|
|
320
|
+
op = DataFrameFlatMapOperator(
|
|
321
|
+
func=func, raw=False, output_types=[output_type], args=args, kwargs=kwargs
|
|
322
|
+
)
|
|
323
|
+
return op(
|
|
324
|
+
series,
|
|
325
|
+
dtypes=dtypes,
|
|
326
|
+
)
|
|
@@ -11,11 +11,12 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
14
|
+
import numpy as np
|
|
15
15
|
import pandas as pd
|
|
16
16
|
import pytest
|
|
17
17
|
|
|
18
18
|
from .... import dataframe as md
|
|
19
|
+
from ... import DataFrame
|
|
19
20
|
from ...core import IndexValue
|
|
20
21
|
from ..reshuffle import DataFrameReshuffle
|
|
21
22
|
|
|
@@ -36,3 +37,63 @@ def test_reshuffle():
|
|
|
36
37
|
r = mdf.mf.reshuffle(ignore_index=True)
|
|
37
38
|
assert isinstance(r.op, DataFrameReshuffle)
|
|
38
39
|
assert isinstance(r.index_value.value, IndexValue.RangeIndex)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@pytest.fixture
|
|
43
|
+
def df1():
|
|
44
|
+
return DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@pytest.fixture
|
|
48
|
+
def df2():
|
|
49
|
+
return DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"])
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@pytest.fixture
|
|
53
|
+
def df3():
|
|
54
|
+
return DataFrame(
|
|
55
|
+
[[1, 2, 3], [1, 2, 3], [1, 2, 3]],
|
|
56
|
+
columns=["a", "b", "c"],
|
|
57
|
+
index=pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"]),
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_flatmap(df1, df2, df3):
|
|
62
|
+
def f(x, keys):
|
|
63
|
+
if x["a"] in keys:
|
|
64
|
+
yield [1, 0]
|
|
65
|
+
yield [0, 1]
|
|
66
|
+
|
|
67
|
+
apply_df = df1[["a"]].mf.flatmap(
|
|
68
|
+
f,
|
|
69
|
+
dtypes={"a": "int64", "b": "int64"},
|
|
70
|
+
)
|
|
71
|
+
assert apply_df.shape == (np.nan, 2)
|
|
72
|
+
assert df1.index_value.key != apply_df.index_value.key
|
|
73
|
+
assert isinstance(df1.index_value.to_pandas(), pd.RangeIndex)
|
|
74
|
+
assert not isinstance(apply_df.index_value.to_pandas(), pd.RangeIndex)
|
|
75
|
+
apply_df = df2[["a"]].mf.flatmap(
|
|
76
|
+
f,
|
|
77
|
+
dtypes=pd.Series(["int64", "int64"]),
|
|
78
|
+
)
|
|
79
|
+
assert apply_df.shape == (np.nan, 2)
|
|
80
|
+
assert df2.index_value.key != apply_df.index_value.key
|
|
81
|
+
with pytest.raises(TypeError):
|
|
82
|
+
apply_s = df3["a"].mf.flatmap(
|
|
83
|
+
f,
|
|
84
|
+
)
|
|
85
|
+
apply_s = df3["a"].mf.flatmap(
|
|
86
|
+
f,
|
|
87
|
+
dtype="int64",
|
|
88
|
+
)
|
|
89
|
+
assert apply_s.shape == (np.nan,)
|
|
90
|
+
assert df3.index_value.key != apply_s.index_value.key
|
|
91
|
+
assert df3.key != apply_s.index_value.key
|
|
92
|
+
apply_s = df3["a"].mf.flatmap(
|
|
93
|
+
f,
|
|
94
|
+
output_type="dataframe",
|
|
95
|
+
dtypes=["int64", "int64"],
|
|
96
|
+
)
|
|
97
|
+
assert apply_s.shape == (np.nan, 2)
|
|
98
|
+
assert df3.index_value.key != apply_s.index_value.key
|
|
99
|
+
assert df3.key != apply_s.index_value.key
|
|
@@ -248,6 +248,7 @@ def df_rename(
|
|
|
248
248
|
)
|
|
249
249
|
|
|
250
250
|
|
|
251
|
+
# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
|
|
251
252
|
def series_rename(
|
|
252
253
|
series,
|
|
253
254
|
index=None,
|
|
@@ -382,6 +383,7 @@ def index_rename(index, name, inplace=False):
|
|
|
382
383
|
return ret
|
|
383
384
|
|
|
384
385
|
|
|
386
|
+
# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
|
|
385
387
|
def index_set_names(index, names, level=None, inplace=False):
|
|
386
388
|
"""
|
|
387
389
|
Set Index or MultiIndex name.
|
|
@@ -407,6 +409,15 @@ def index_set_names(index, names, level=None, inplace=False):
|
|
|
407
409
|
See Also
|
|
408
410
|
--------
|
|
409
411
|
Index.rename : Able to set new names without level.
|
|
412
|
+
|
|
413
|
+
Examples
|
|
414
|
+
--------
|
|
415
|
+
>>> import maxframe.dataframe as md
|
|
416
|
+
>>> idx = md.Index([1, 2, 3, 4])
|
|
417
|
+
>>> idx.execute()
|
|
418
|
+
Int64Index([1, 2, 3, 4], dtype='int64')
|
|
419
|
+
>>> idx.set_names('quarter').execute()
|
|
420
|
+
Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
|
|
410
421
|
"""
|
|
411
422
|
op = DataFrameRename(
|
|
412
423
|
index_mapper=names, level=level, output_types=get_output_types(index)
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
from typing import Union
|
|
16
16
|
|
|
17
17
|
import pandas as pd
|
|
18
|
+
from pandas.api.types import is_list_like
|
|
18
19
|
from pandas.core.dtypes.common import pandas_dtype
|
|
19
20
|
|
|
20
21
|
from ..core import ENTITY_TYPE
|
|
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
61
62
|
num_partitions=None,
|
|
62
63
|
):
|
|
63
64
|
need_repart = False
|
|
65
|
+
if columns is not None and not is_list_like(columns):
|
|
66
|
+
raise ValueError("columns must be a list-like object")
|
|
64
67
|
if isinstance(data, TENSOR_TYPE):
|
|
65
68
|
if chunk_size is not None:
|
|
66
69
|
data = data.rechunk(chunk_size)
|
|
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
69
72
|
)
|
|
70
73
|
need_repart = num_partitions is not None
|
|
71
74
|
elif isinstance(data, SERIES_TYPE):
|
|
72
|
-
|
|
75
|
+
if columns is not None and len(columns) != 1:
|
|
76
|
+
raise ValueError("columns' length must be 1 when data is Series")
|
|
77
|
+
col_name = columns[0] if columns else None
|
|
78
|
+
df = data.to_frame(name=col_name)
|
|
73
79
|
need_repart = num_partitions is not None
|
|
74
80
|
elif isinstance(data, DATAFRAME_TYPE):
|
|
75
81
|
if not hasattr(data, "data"):
|
|
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
77
83
|
df = _Frame(data)
|
|
78
84
|
else:
|
|
79
85
|
df = data
|
|
86
|
+
if columns is not None:
|
|
87
|
+
if len(df.columns) != len(columns):
|
|
88
|
+
raise ValueError("columns' length must be equal to the data's")
|
|
89
|
+
df.columns = columns
|
|
80
90
|
need_repart = num_partitions is not None
|
|
81
91
|
elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
|
|
82
92
|
# data is a dict and some value is tensor
|
|
@@ -104,7 +104,6 @@ def df_drop_duplicates(
|
|
|
104
104
|
def series_drop_duplicates(
|
|
105
105
|
series, keep="first", inplace=False, ignore_index=False, method="auto"
|
|
106
106
|
):
|
|
107
|
-
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
|
|
108
107
|
"""
|
|
109
108
|
Return Series with duplicate values removed.
|
|
110
109
|
|
|
@@ -148,6 +147,24 @@ def series_drop_duplicates(
|
|
|
148
147
|
5 hippo
|
|
149
148
|
Name: animal, dtype: object
|
|
150
149
|
|
|
150
|
+
With the 'keep' parameter, the selection behaviour of duplicated values
|
|
151
|
+
can be changed. The value 'first' keeps the first occurrence for each
|
|
152
|
+
set of duplicated entries. The default value of keep is 'first'.
|
|
153
|
+
>>> s.drop_duplicates().execute()
|
|
154
|
+
0 lame
|
|
155
|
+
1 cow
|
|
156
|
+
3 beetle
|
|
157
|
+
5 hippo
|
|
158
|
+
Name: animal, dtype: object
|
|
159
|
+
The value 'last' for parameter 'keep' keeps the last occurrence for
|
|
160
|
+
each set of duplicated entries.
|
|
161
|
+
>>> s.drop_duplicates(keep='last').execute()
|
|
162
|
+
1 cow
|
|
163
|
+
3 beetle
|
|
164
|
+
4 lame
|
|
165
|
+
5 hippo
|
|
166
|
+
Name: animal, dtype: object
|
|
167
|
+
|
|
151
168
|
The value ``False`` for parameter 'keep' discards all sets of
|
|
152
169
|
duplicated entries. Setting the value of 'inplace' to ``True`` performs
|
|
153
170
|
the operation inplace and returns ``None``.
|
maxframe/dataframe/operators.py
CHANGED
|
@@ -16,13 +16,7 @@ import numpy as np
|
|
|
16
16
|
import pandas as pd
|
|
17
17
|
|
|
18
18
|
from ..core import ENTITY_TYPE, OutputType
|
|
19
|
-
from ..core.operator import (
|
|
20
|
-
Fuse,
|
|
21
|
-
FuseChunkMixin,
|
|
22
|
-
Operator,
|
|
23
|
-
ShuffleProxy,
|
|
24
|
-
TileableOperatorMixin,
|
|
25
|
-
)
|
|
19
|
+
from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
|
|
26
20
|
from ..tensor.core import TENSOR_TYPE
|
|
27
21
|
from ..tensor.datasource import tensor as astensor
|
|
28
22
|
from .core import DATAFRAME_TYPE, SERIES_TYPE
|
|
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
|
|
|
261
255
|
class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
|
|
262
256
|
def __init__(self, sparse=None, output_types=None, **kwargs):
|
|
263
257
|
super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
|
|
267
|
-
__slots__ = ()
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
|
|
271
|
-
@property
|
|
272
|
-
def output_types(self):
|
|
273
|
-
return self.outputs[-1].chunk.op.output_types
|
|
@@ -552,7 +552,7 @@ class ReductionCompiler:
|
|
|
552
552
|
@enter_mode(build=True)
|
|
553
553
|
def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
|
|
554
554
|
from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
|
|
555
|
-
from ...tensor.base import TensorWhere
|
|
555
|
+
from ...tensor.misc import TensorWhere
|
|
556
556
|
from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
|
|
557
557
|
from ..datasource.dataframe import DataFrameDataSource
|
|
558
558
|
from ..datasource.series import SeriesDataSource
|
|
@@ -679,8 +679,8 @@ class ReductionCompiler:
|
|
|
679
679
|
]
|
|
680
680
|
"""
|
|
681
681
|
from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
|
|
682
|
-
from ...tensor.base import TensorWhere
|
|
683
682
|
from ...tensor.datasource import Scalar
|
|
683
|
+
from ...tensor.misc import TensorWhere
|
|
684
684
|
from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
|
|
685
685
|
from ..datasource.dataframe import DataFrameDataSource
|
|
686
686
|
from ..datasource.series import SeriesDataSource
|