maxframe 1.0.0rc3__cp39-cp39-win_amd64.whl → 1.1.0__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp39-win_amd64.pyd +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +16 -1
- maxframe/conftest.py +52 -14
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +26 -2
- maxframe/dataframe/datasource/read_odps_query.py +116 -28
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +8 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +314 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +23 -2
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +28 -8
- maxframe/io/odpsio/tableio.py +55 -133
- maxframe/io/odpsio/tests/test_schema.py +40 -4
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +36 -6
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +10 -1
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +64 -14
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +28 -10
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +104 -20
- maxframe_client/session/task.py +42 -26
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +44 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -11,12 +11,14 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
14
|
+
import numpy as np
|
|
15
15
|
import pandas as pd
|
|
16
16
|
import pytest
|
|
17
17
|
|
|
18
18
|
from .... import dataframe as md
|
|
19
|
-
from
|
|
19
|
+
from ....tests.utils import assert_mf_index_dtype
|
|
20
|
+
from ... import DataFrame
|
|
21
|
+
from ...core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue
|
|
20
22
|
from ..reshuffle import DataFrameReshuffle
|
|
21
23
|
|
|
22
24
|
|
|
@@ -31,8 +33,111 @@ def test_reshuffle():
|
|
|
31
33
|
|
|
32
34
|
r = mdf.mf.reshuffle()
|
|
33
35
|
assert isinstance(r.op, DataFrameReshuffle)
|
|
34
|
-
|
|
36
|
+
assert_mf_index_dtype(r.index_value.value, np.int64)
|
|
35
37
|
|
|
36
38
|
r = mdf.mf.reshuffle(ignore_index=True)
|
|
37
39
|
assert isinstance(r.op, DataFrameReshuffle)
|
|
38
40
|
assert isinstance(r.index_value.value, IndexValue.RangeIndex)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@pytest.fixture
|
|
44
|
+
def df1():
|
|
45
|
+
return DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@pytest.fixture
|
|
49
|
+
def df2():
|
|
50
|
+
return DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"])
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@pytest.fixture
|
|
54
|
+
def df3():
|
|
55
|
+
return DataFrame(
|
|
56
|
+
[[1, 2, 3], [1, 2, 3], [1, 2, 3]],
|
|
57
|
+
columns=["a", "b", "c"],
|
|
58
|
+
index=pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"]),
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_flatmap(df1, df2, df3):
|
|
63
|
+
def f(x, keys):
|
|
64
|
+
if x["a"] in keys:
|
|
65
|
+
yield [1, 0]
|
|
66
|
+
yield [0, 1]
|
|
67
|
+
|
|
68
|
+
apply_df = df1[["a"]].mf.flatmap(
|
|
69
|
+
f,
|
|
70
|
+
dtypes={"a": "int64", "b": "int64"},
|
|
71
|
+
)
|
|
72
|
+
assert apply_df.shape == (np.nan, 2)
|
|
73
|
+
assert df1.index_value.key != apply_df.index_value.key
|
|
74
|
+
assert isinstance(df1.index_value.to_pandas(), pd.RangeIndex)
|
|
75
|
+
assert not isinstance(apply_df.index_value.to_pandas(), pd.RangeIndex)
|
|
76
|
+
apply_df = df2[["a"]].mf.flatmap(
|
|
77
|
+
f,
|
|
78
|
+
dtypes=pd.Series(["int64", "int64"]),
|
|
79
|
+
)
|
|
80
|
+
assert apply_df.shape == (np.nan, 2)
|
|
81
|
+
assert df2.index_value.key != apply_df.index_value.key
|
|
82
|
+
with pytest.raises(TypeError):
|
|
83
|
+
apply_s = df3["a"].mf.flatmap(
|
|
84
|
+
f,
|
|
85
|
+
)
|
|
86
|
+
apply_s = df3["a"].mf.flatmap(
|
|
87
|
+
f,
|
|
88
|
+
dtype="int64",
|
|
89
|
+
)
|
|
90
|
+
assert apply_s.shape == (np.nan,)
|
|
91
|
+
assert df3.index_value.key != apply_s.index_value.key
|
|
92
|
+
assert df3.key != apply_s.index_value.key
|
|
93
|
+
apply_s = df3["a"].mf.flatmap(
|
|
94
|
+
f,
|
|
95
|
+
output_type="dataframe",
|
|
96
|
+
dtypes=["int64", "int64"],
|
|
97
|
+
)
|
|
98
|
+
assert apply_s.shape == (np.nan, 2)
|
|
99
|
+
assert df3.index_value.key != apply_s.index_value.key
|
|
100
|
+
assert df3.key != apply_s.index_value.key
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_flatjson():
|
|
104
|
+
s1 = md.Series(["{{'a': 1, 'b': false}}"], index=[1])
|
|
105
|
+
df1 = s1.mf.flatjson(
|
|
106
|
+
["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
|
|
107
|
+
)
|
|
108
|
+
assert df1.shape == (1, 2)
|
|
109
|
+
assert df1.index_value.key == s1.index_value.key
|
|
110
|
+
assert isinstance(df1, DATAFRAME_TYPE)
|
|
111
|
+
assert list(df1.dtypes) == [np.dtype("int32"), np.dtype("bool")]
|
|
112
|
+
assert list(df1.dtypes.index) == ["a", "b"]
|
|
113
|
+
|
|
114
|
+
df2 = s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32"], index=["a"]))
|
|
115
|
+
assert df2.shape == (1, 1)
|
|
116
|
+
assert df2.index_value.key == s1.index_value.key
|
|
117
|
+
assert isinstance(df2, DATAFRAME_TYPE)
|
|
118
|
+
assert list(df2.dtypes) == [np.dtype("int32")]
|
|
119
|
+
assert list(df2.dtypes.index) == ["a"]
|
|
120
|
+
|
|
121
|
+
s2 = s1.mf.flatjson("$.a", dtype="int32", name="a")
|
|
122
|
+
assert s2.shape == (1,)
|
|
123
|
+
assert s2.index_value.key == s1.index_value.key
|
|
124
|
+
assert isinstance(s2, SERIES_TYPE)
|
|
125
|
+
assert s2.dtype == np.dtype("int32")
|
|
126
|
+
assert s2.name == "a"
|
|
127
|
+
|
|
128
|
+
with pytest.raises(ValueError):
|
|
129
|
+
s1.mf.flatjson([], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
|
|
130
|
+
with pytest.raises(ValueError):
|
|
131
|
+
s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
|
|
132
|
+
with pytest.raises(ValueError):
|
|
133
|
+
s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
|
|
134
|
+
with pytest.raises(ValueError):
|
|
135
|
+
s1.mf.flatjson(["$.a", "$.b"], dtypes=pd.Series(["bool"], index=["b"]))
|
|
136
|
+
with pytest.raises(ValueError):
|
|
137
|
+
s1.mf.flatjson(
|
|
138
|
+
["$.a"],
|
|
139
|
+
dtype="int32",
|
|
140
|
+
dtypes=pd.Series(["int32"], index=["a"]),
|
|
141
|
+
)
|
|
142
|
+
with pytest.raises(ValueError):
|
|
143
|
+
s1.mf.flatjson(["$.a"])
|
|
@@ -55,6 +55,7 @@ def _install():
|
|
|
55
55
|
setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw))
|
|
56
56
|
setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw))
|
|
57
57
|
setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw))
|
|
58
|
+
setattr(cls, "median", lambda groupby, **kw: agg(groupby, "median", **kw))
|
|
58
59
|
|
|
59
60
|
setattr(cls, "apply", groupby_apply)
|
|
60
61
|
setattr(cls, "transform", groupby_transform)
|
|
@@ -28,7 +28,13 @@ from ...serialization.serializables import (
|
|
|
28
28
|
)
|
|
29
29
|
from ...utils import get_func_token, quiet_stdio, tokenize
|
|
30
30
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
31
|
-
from ..utils import
|
|
31
|
+
from ..utils import (
|
|
32
|
+
copy_func_scheduling_hints,
|
|
33
|
+
make_dtype,
|
|
34
|
+
make_dtypes,
|
|
35
|
+
parse_index,
|
|
36
|
+
validate_output_types,
|
|
37
|
+
)
|
|
32
38
|
|
|
33
39
|
|
|
34
40
|
class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin):
|
|
@@ -56,6 +62,8 @@ class GroupByApply(
|
|
|
56
62
|
|
|
57
63
|
def __init__(self, output_types=None, **kw):
|
|
58
64
|
super().__init__(_output_types=output_types, **kw)
|
|
65
|
+
if hasattr(self, "func"):
|
|
66
|
+
copy_func_scheduling_hints(self.func, self)
|
|
59
67
|
|
|
60
68
|
def _update_key(self):
|
|
61
69
|
values = [v for v in self._values_ if v is not self.func] + [
|
|
@@ -28,7 +28,7 @@ from ..utils import build_df, build_series, parse_index
|
|
|
28
28
|
|
|
29
29
|
cudf = lazy_import("cudf")
|
|
30
30
|
|
|
31
|
-
_GROUP_KEYS_NO_DEFAULT = pd_release_version
|
|
31
|
+
_GROUP_KEYS_NO_DEFAULT = pd_release_version[:2] == (1, 5)
|
|
32
32
|
_default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True
|
|
33
33
|
|
|
34
34
|
|
|
@@ -35,12 +35,15 @@ class GroupByFillOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
35
35
|
func_name = getattr(self, "_func_name")
|
|
36
36
|
|
|
37
37
|
if func_name == "fillna":
|
|
38
|
+
kw = {}
|
|
39
|
+
if self.axis is not None:
|
|
40
|
+
kw["axis"] = self.axis
|
|
38
41
|
result_df = mock_groupby.fillna(
|
|
39
42
|
value=self.value,
|
|
40
43
|
method=self.method,
|
|
41
|
-
axis=self.axis,
|
|
42
44
|
limit=self.limit,
|
|
43
45
|
downcast=self.downcast,
|
|
46
|
+
**kw,
|
|
44
47
|
)
|
|
45
48
|
else:
|
|
46
49
|
result_df = getattr(mock_groupby, func_name)(limit=self.limit)
|
|
@@ -88,5 +88,11 @@ def df_groupby_getitem(df_groupby, item):
|
|
|
88
88
|
if df_groupby.selection:
|
|
89
89
|
raise IndexError(f"Column(s) {df_groupby.selection!r} already selected")
|
|
90
90
|
|
|
91
|
+
if (
|
|
92
|
+
isinstance(item, tuple)
|
|
93
|
+
and item not in df_groupby.dtypes
|
|
94
|
+
and item not in df_groupby.index.names
|
|
95
|
+
):
|
|
96
|
+
item = list(item)
|
|
91
97
|
op = GroupByIndex(selection=item, output_types=output_types)
|
|
92
98
|
return op(df_groupby)
|
|
@@ -230,7 +230,7 @@ def test_groupby_transform():
|
|
|
230
230
|
assert r.op._op_type_ == opcodes.TRANSFORM
|
|
231
231
|
assert r.op.output_types[0] == OutputType.dataframe
|
|
232
232
|
|
|
233
|
-
r = mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
|
|
233
|
+
r = mdf[list("abde")].groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
|
|
234
234
|
assert r.shape == (np.nan, 6)
|
|
235
235
|
assert r.op._op_type_ == opcodes.TRANSFORM
|
|
236
236
|
assert r.op.output_types[0] == OutputType.dataframe
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import logging
|
|
16
|
+
|
|
15
17
|
import numpy as np
|
|
16
18
|
import pandas as pd
|
|
17
19
|
|
|
@@ -20,7 +22,9 @@ from ...core import OutputType
|
|
|
20
22
|
from ...serialization.serializables import AnyField, BoolField, DictField, TupleField
|
|
21
23
|
from ...utils import quiet_stdio
|
|
22
24
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
23
|
-
from ..utils import parse_index
|
|
25
|
+
from ..utils import copy_func_scheduling_hints, parse_index
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -35,6 +39,8 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
35
39
|
|
|
36
40
|
def __init__(self, output_types=None, **kw):
|
|
37
41
|
super().__init__(_output_types=output_types, **kw)
|
|
42
|
+
if hasattr(self, "func"):
|
|
43
|
+
copy_func_scheduling_hints(self.func, self)
|
|
38
44
|
|
|
39
45
|
def _infer_df_func_returns(self, in_groupby, dtypes, index):
|
|
40
46
|
index_value, output_types, new_dtypes = None, None, None
|
|
@@ -65,7 +71,7 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
65
71
|
output_types = [OutputType.series]
|
|
66
72
|
new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype)
|
|
67
73
|
except: # noqa: E722 # nosec
|
|
68
|
-
|
|
74
|
+
logger.info("Exception raised while inferring df_func", exc_info=True)
|
|
69
75
|
|
|
70
76
|
self.output_types = output_types if not self.output_types else self.output_types
|
|
71
77
|
dtypes = new_dtypes if dtypes is None else dtypes
|
|
@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
|
|
|
25
25
|
from ...serialization.serializables import AnyField, KeyField, ListField
|
|
26
26
|
from ...tensor.datasource import asarray
|
|
27
27
|
from ...tensor.utils import calc_sliced_size, filter_inputs
|
|
28
|
-
from ...utils import is_full_slice, lazy_import
|
|
28
|
+
from ...utils import is_full_slice, lazy_import, pd_release_version
|
|
29
29
|
from ..core import DATAFRAME_TYPE, IndexValue
|
|
30
30
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
31
31
|
from ..utils import parse_index
|
|
32
32
|
from .iloc import DataFrameIlocSetItem
|
|
33
33
|
|
|
34
34
|
cudf = lazy_import("cudf")
|
|
35
|
+
with_slice_locs_kind = pd_release_version < (1, 4, 0)
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
def process_loc_indexes(inp, indexes, fetch_index: bool = True):
|
|
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
210
211
|
if axis == 1:
|
|
211
212
|
param["dtypes"] = inp.dtypes
|
|
212
213
|
elif input_index_value.has_value():
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
214
|
+
kw = {}
|
|
215
|
+
if with_slice_locs_kind:
|
|
216
|
+
kw["kind"] = "loc"
|
|
217
|
+
start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
|
|
216
218
|
slc = slice(start, end, index.step)
|
|
217
219
|
size = calc_sliced_size(inp.shape[axis], slc)
|
|
218
220
|
param["shape"] = size
|
|
@@ -248,6 +248,7 @@ def df_rename(
|
|
|
248
248
|
)
|
|
249
249
|
|
|
250
250
|
|
|
251
|
+
# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
|
|
251
252
|
def series_rename(
|
|
252
253
|
series,
|
|
253
254
|
index=None,
|
|
@@ -382,6 +383,7 @@ def index_rename(index, name, inplace=False):
|
|
|
382
383
|
return ret
|
|
383
384
|
|
|
384
385
|
|
|
386
|
+
# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
|
|
385
387
|
def index_set_names(index, names, level=None, inplace=False):
|
|
386
388
|
"""
|
|
387
389
|
Set Index or MultiIndex name.
|
|
@@ -407,6 +409,15 @@ def index_set_names(index, names, level=None, inplace=False):
|
|
|
407
409
|
See Also
|
|
408
410
|
--------
|
|
409
411
|
Index.rename : Able to set new names without level.
|
|
412
|
+
|
|
413
|
+
Examples
|
|
414
|
+
--------
|
|
415
|
+
>>> import maxframe.dataframe as md
|
|
416
|
+
>>> idx = md.Index([1, 2, 3, 4])
|
|
417
|
+
>>> idx.execute()
|
|
418
|
+
Int64Index([1, 2, 3, 4], dtype='int64')
|
|
419
|
+
>>> idx.set_names('quarter').execute()
|
|
420
|
+
Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
|
|
410
421
|
"""
|
|
411
422
|
op = DataFrameRename(
|
|
412
423
|
index_mapper=names, level=level, output_types=get_output_types(index)
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
from typing import Union
|
|
16
16
|
|
|
17
17
|
import pandas as pd
|
|
18
|
+
from pandas.api.types import is_list_like
|
|
18
19
|
from pandas.core.dtypes.common import pandas_dtype
|
|
19
20
|
|
|
20
21
|
from ..core import ENTITY_TYPE
|
|
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
61
62
|
num_partitions=None,
|
|
62
63
|
):
|
|
63
64
|
need_repart = False
|
|
65
|
+
if columns is not None and not is_list_like(columns):
|
|
66
|
+
raise ValueError("columns must be a list-like object")
|
|
64
67
|
if isinstance(data, TENSOR_TYPE):
|
|
65
68
|
if chunk_size is not None:
|
|
66
69
|
data = data.rechunk(chunk_size)
|
|
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
69
72
|
)
|
|
70
73
|
need_repart = num_partitions is not None
|
|
71
74
|
elif isinstance(data, SERIES_TYPE):
|
|
72
|
-
|
|
75
|
+
if columns is not None and len(columns) != 1:
|
|
76
|
+
raise ValueError("columns' length must be 1 when data is Series")
|
|
77
|
+
col_name = columns[0] if columns else None
|
|
78
|
+
df = data.to_frame(name=col_name)
|
|
73
79
|
need_repart = num_partitions is not None
|
|
74
80
|
elif isinstance(data, DATAFRAME_TYPE):
|
|
75
81
|
if not hasattr(data, "data"):
|
|
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
|
|
|
77
83
|
df = _Frame(data)
|
|
78
84
|
else:
|
|
79
85
|
df = data
|
|
86
|
+
if columns is not None:
|
|
87
|
+
if len(df.columns) != len(columns):
|
|
88
|
+
raise ValueError("columns' length must be equal to the data's")
|
|
89
|
+
df.columns = columns
|
|
80
90
|
need_repart = num_partitions is not None
|
|
81
91
|
elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
|
|
82
92
|
# data is a dict and some value is tensor
|
|
@@ -14,7 +14,15 @@
|
|
|
14
14
|
|
|
15
15
|
from .append import DataFrameAppend, append
|
|
16
16
|
from .concat import DataFrameConcat, concat
|
|
17
|
-
from .merge import
|
|
17
|
+
from .merge import (
|
|
18
|
+
DataFrameMerge,
|
|
19
|
+
DataFrameMergeAlign,
|
|
20
|
+
DistributedMapJoinHint,
|
|
21
|
+
MapJoinHint,
|
|
22
|
+
SkewJoinHint,
|
|
23
|
+
join,
|
|
24
|
+
merge,
|
|
25
|
+
)
|
|
18
26
|
|
|
19
27
|
|
|
20
28
|
def _install():
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
from typing import List, Union
|
|
14
15
|
|
|
15
16
|
import pandas as pd
|
|
16
17
|
|
|
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
|
|
|
24
25
|
StringField,
|
|
25
26
|
)
|
|
26
27
|
from ...utils import lazy_import
|
|
28
|
+
from ..core import DataFrame, Series
|
|
27
29
|
from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
|
|
28
30
|
from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
|
|
29
31
|
|
|
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
55
57
|
return self.names
|
|
56
58
|
|
|
57
59
|
@classmethod
|
|
58
|
-
def _concat_index(cls,
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
if
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
60
|
+
def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
|
|
61
|
+
concat_index = None
|
|
62
|
+
all_indexes_have_value = all(
|
|
63
|
+
input.index_value.has_value() for input in df_or_series_list
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
def _concat(prev_index: pd.Index, cur_index: pd.Index):
|
|
67
|
+
if prev_index is None:
|
|
68
|
+
return cur_index
|
|
69
|
+
|
|
70
|
+
if (
|
|
71
|
+
all_indexes_have_value
|
|
72
|
+
and isinstance(prev_index, pd.RangeIndex)
|
|
73
|
+
and isinstance(cur_index, pd.RangeIndex)
|
|
74
|
+
):
|
|
75
|
+
# handle RangeIndex that append may generate huge amount of data
|
|
76
|
+
# e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
|
|
77
|
+
# will generate a Int64Index full of data
|
|
78
|
+
# for details see GH#1647
|
|
79
|
+
prev_stop = prev_index.start + prev_index.size * prev_index.step
|
|
80
|
+
cur_start = cur_index.start
|
|
81
|
+
if prev_stop == cur_start and prev_index.step == cur_index.step:
|
|
82
|
+
# continuous RangeIndex, still return RangeIndex
|
|
83
|
+
return prev_index.append(cur_index)
|
|
84
|
+
else:
|
|
85
|
+
# otherwise, return an empty index
|
|
86
|
+
return pd.Index([], dtype=prev_index.dtype)
|
|
87
|
+
elif isinstance(prev_index, pd.RangeIndex):
|
|
88
|
+
return pd.Index([], prev_index.dtype).append(cur_index)
|
|
89
|
+
elif isinstance(cur_index, pd.RangeIndex):
|
|
90
|
+
return prev_index.append(pd.Index([], cur_index.dtype))
|
|
91
|
+
return prev_index.append(cur_index)
|
|
92
|
+
|
|
93
|
+
for input in df_or_series_list:
|
|
94
|
+
concat_index = _concat(concat_index, input.index_value.to_pandas())
|
|
95
|
+
|
|
96
|
+
return concat_index
|
|
79
97
|
|
|
80
98
|
def _call_series(self, objs):
|
|
81
99
|
if self.axis == 0:
|
|
82
100
|
row_length = 0
|
|
83
|
-
index = None
|
|
84
101
|
for series in objs:
|
|
85
|
-
if index is None:
|
|
86
|
-
index = series.index_value.to_pandas()
|
|
87
|
-
else:
|
|
88
|
-
index = self._concat_index(index, series.index_value.to_pandas())
|
|
89
102
|
row_length += series.shape[0]
|
|
90
103
|
if self.ignore_index: # pragma: no cover
|
|
91
104
|
index_value = parse_index(pd.RangeIndex(row_length))
|
|
92
105
|
else:
|
|
106
|
+
index = self._concat_index(objs)
|
|
93
107
|
index_value = parse_index(index, objs)
|
|
94
108
|
obj_names = {obj.name for obj in objs}
|
|
95
109
|
return self.new_series(
|
|
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
130
144
|
def _call_dataframes(self, objs):
|
|
131
145
|
if self.axis == 0:
|
|
132
146
|
row_length = 0
|
|
133
|
-
index = None
|
|
134
147
|
empty_dfs = []
|
|
135
148
|
for df in objs:
|
|
136
|
-
if index is None:
|
|
137
|
-
index = df.index_value.to_pandas()
|
|
138
|
-
else:
|
|
139
|
-
index = self._concat_index(index, df.index_value.to_pandas())
|
|
140
149
|
row_length += df.shape[0]
|
|
141
150
|
if df.ndim == 2:
|
|
142
151
|
empty_dfs.append(build_empty_df(df.dtypes))
|
|
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
153
162
|
if self.ignore_index: # pragma: no cover
|
|
154
163
|
index_value = parse_index(pd.RangeIndex(row_length))
|
|
155
164
|
else:
|
|
165
|
+
index = self._concat_index(objs)
|
|
156
166
|
index_value = parse_index(index, objs)
|
|
157
167
|
|
|
158
168
|
new_objs = []
|
|
@@ -16,6 +16,7 @@ import numpy as np
|
|
|
16
16
|
import pandas as pd
|
|
17
17
|
import pytest
|
|
18
18
|
|
|
19
|
+
from ....tests.utils import assert_mf_index_dtype
|
|
19
20
|
from ...core import IndexValue
|
|
20
21
|
from ...datasource.dataframe import from_pandas
|
|
21
22
|
from .. import DataFrameMerge, concat
|
|
@@ -161,7 +162,7 @@ def test_append():
|
|
|
161
162
|
adf = mdf1.append(mdf2)
|
|
162
163
|
|
|
163
164
|
assert adf.shape == (20, 4)
|
|
164
|
-
|
|
165
|
+
assert_mf_index_dtype(adf.index_value.value, np.int64)
|
|
165
166
|
|
|
166
167
|
mdf1 = from_pandas(df1, chunk_size=3)
|
|
167
168
|
mdf2 = from_pandas(df2, chunk_size=3)
|
|
@@ -181,6 +182,7 @@ def test_concat():
|
|
|
181
182
|
r = concat([mdf1, mdf2], axis="index")
|
|
182
183
|
|
|
183
184
|
assert r.shape == (20, 4)
|
|
185
|
+
assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
|
|
184
186
|
pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
|
|
185
187
|
|
|
186
188
|
df3 = pd.DataFrame(
|
maxframe/dataframe/misc/apply.py
CHANGED
|
@@ -35,6 +35,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
|
35
35
|
from ..utils import (
|
|
36
36
|
build_df,
|
|
37
37
|
build_series,
|
|
38
|
+
copy_func_scheduling_hints,
|
|
38
39
|
make_dtype,
|
|
39
40
|
make_dtypes,
|
|
40
41
|
pack_func_args,
|
|
@@ -79,6 +80,8 @@ class ApplyOperator(
|
|
|
79
80
|
if output_type:
|
|
80
81
|
kw["_output_types"] = [output_type]
|
|
81
82
|
super().__init__(**kw)
|
|
83
|
+
if hasattr(self, "func"):
|
|
84
|
+
copy_func_scheduling_hints(self.func, self)
|
|
82
85
|
|
|
83
86
|
def _update_key(self):
|
|
84
87
|
values = [v for v in self._values_ if v is not self.func] + [
|
|
@@ -43,7 +43,11 @@ class DataFrameDropDuplicates(DuplicateOperand):
|
|
|
43
43
|
params["index_value"] = parse_index(pd.RangeIndex(-1))
|
|
44
44
|
else:
|
|
45
45
|
params["index_value"] = gen_unknown_index_value(
|
|
46
|
-
input_params["index_value"],
|
|
46
|
+
input_params["index_value"],
|
|
47
|
+
op.keep,
|
|
48
|
+
op.subset,
|
|
49
|
+
type(op).__name__,
|
|
50
|
+
normalize_range_index=True,
|
|
47
51
|
)
|
|
48
52
|
params["shape"] = self._get_shape(input_params["shape"], op)
|
|
49
53
|
return params
|
|
@@ -104,7 +108,6 @@ def df_drop_duplicates(
|
|
|
104
108
|
def series_drop_duplicates(
|
|
105
109
|
series, keep="first", inplace=False, ignore_index=False, method="auto"
|
|
106
110
|
):
|
|
107
|
-
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
|
|
108
111
|
"""
|
|
109
112
|
Return Series with duplicate values removed.
|
|
110
113
|
|
|
@@ -148,6 +151,24 @@ def series_drop_duplicates(
|
|
|
148
151
|
5 hippo
|
|
149
152
|
Name: animal, dtype: object
|
|
150
153
|
|
|
154
|
+
With the 'keep' parameter, the selection behaviour of duplicated values
|
|
155
|
+
can be changed. The value 'first' keeps the first occurrence for each
|
|
156
|
+
set of duplicated entries. The default value of keep is 'first'.
|
|
157
|
+
>>> s.drop_duplicates().execute()
|
|
158
|
+
0 lame
|
|
159
|
+
1 cow
|
|
160
|
+
3 beetle
|
|
161
|
+
5 hippo
|
|
162
|
+
Name: animal, dtype: object
|
|
163
|
+
The value 'last' for parameter 'keep' keeps the last occurrence for
|
|
164
|
+
each set of duplicated entries.
|
|
165
|
+
>>> s.drop_duplicates(keep='last').execute()
|
|
166
|
+
1 cow
|
|
167
|
+
3 beetle
|
|
168
|
+
4 lame
|
|
169
|
+
5 hippo
|
|
170
|
+
Name: animal, dtype: object
|
|
171
|
+
|
|
151
172
|
The value ``False`` for parameter 'keep' discards all sets of
|
|
152
173
|
duplicated entries. Setting the value of 'inplace' to ``True`` performs
|
|
153
174
|
the operation inplace and returns ``None``.
|
maxframe/dataframe/misc/map.py
CHANGED
|
@@ -24,7 +24,7 @@ from ...serialization.serializables import AnyField, KeyField, StringField
|
|
|
24
24
|
from ...utils import quiet_stdio
|
|
25
25
|
from ..core import SERIES_TYPE
|
|
26
26
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
27
|
-
from ..utils import build_series
|
|
27
|
+
from ..utils import build_series, copy_func_scheduling_hints
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -38,6 +38,8 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
38
38
|
super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
|
|
39
39
|
if not self.output_types:
|
|
40
40
|
self.output_types = [OutputType.series]
|
|
41
|
+
if hasattr(self, "arg"):
|
|
42
|
+
copy_func_scheduling_hints(self.arg, self)
|
|
41
43
|
|
|
42
44
|
def _set_inputs(self, inputs):
|
|
43
45
|
super()._set_inputs(inputs)
|