maxframe 1.2.1__cp37-cp37m-win32.whl → 1.3.1__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/codegen.py +70 -21
- maxframe/config/config.py +6 -0
- maxframe/core/accessor.py +1 -0
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/dict_/accessor.py +1 -0
- maxframe/dataframe/accessors/dict_/length.py +1 -0
- maxframe/dataframe/accessors/dict_/setitem.py +1 -0
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
- maxframe/dataframe/accessors/list_/__init__.py +37 -0
- maxframe/dataframe/accessors/list_/accessor.py +39 -0
- maxframe/dataframe/accessors/list_/getitem.py +135 -0
- maxframe/dataframe/accessors/list_/length.py +73 -0
- maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
- maxframe/dataframe/accessors/plotting/__init__.py +2 -0
- maxframe/dataframe/accessors/string_/__init__.py +1 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/accessor.py +1 -0
- maxframe/dataframe/extensions/apply_chunk.py +34 -21
- maxframe/dataframe/extensions/flatmap.py +8 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +53 -1
- maxframe/dataframe/merge/concat.py +7 -4
- maxframe/dataframe/merge/merge.py +1 -0
- maxframe/dataframe/merge/tests/test_merge.py +97 -47
- maxframe/dataframe/missing/tests/test_missing.py +1 -0
- maxframe/dataframe/reduction/aggregation.py +63 -0
- maxframe/dataframe/reduction/core.py +17 -5
- maxframe/dataframe/tests/test_utils.py +7 -0
- maxframe/dataframe/ufunc/ufunc.py +1 -0
- maxframe/dataframe/utils.py +3 -0
- maxframe/io/odpsio/schema.py +1 -0
- maxframe/learn/contrib/__init__.py +2 -4
- maxframe/learn/contrib/llm/__init__.py +1 -0
- maxframe/learn/contrib/llm/core.py +31 -10
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +38 -3
- maxframe/learn/contrib/llm/models/managed.py +54 -0
- maxframe/learn/contrib/llm/multi_modal.py +93 -0
- maxframe/learn/contrib/llm/text.py +268 -8
- maxframe/learn/contrib/models.py +77 -0
- maxframe/learn/contrib/utils.py +1 -0
- maxframe/learn/contrib/xgboost/__init__.py +8 -1
- maxframe/learn/contrib/xgboost/classifier.py +15 -4
- maxframe/learn/contrib/xgboost/core.py +108 -1
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
- maxframe/learn/contrib/xgboost/predict.py +6 -3
- maxframe/learn/contrib/xgboost/regressor.py +15 -1
- maxframe/learn/contrib/xgboost/train.py +5 -4
- maxframe/lib/dtypes_extension/__init__.py +2 -1
- maxframe/lib/dtypes_extension/dtypes.py +21 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/opcodes.py +19 -0
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pyx +12 -1
- maxframe/serialization/numpy.py +12 -4
- maxframe/serialization/serializables/tests/test_serializable.py +13 -2
- maxframe/serialization/tests/test_serial.py +2 -0
- maxframe/tensor/merge/concatenate.py +1 -0
- maxframe/tensor/misc/unique.py +11 -10
- maxframe/tensor/reshape/reshape.py +4 -1
- maxframe/utils.py +4 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA +2 -1
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/RECORD +73 -65
- maxframe_client/session/odps.py +3 -0
- maxframe_client/session/tests/test_task.py +1 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/WHEEL +0 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import pyarrow as pa
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
from ..... import dataframe as md
|
|
21
|
+
from .....lib.dtypes_extension import list_
|
|
22
|
+
from .....utils import ARROW_DTYPE_NOT_SUPPORTED
|
|
23
|
+
from ..getitem import SeriesListGetItemOperator
|
|
24
|
+
from ..length import SeriesListLengthOperator
|
|
25
|
+
|
|
26
|
+
pytestmark = pytest.mark.skipif(
|
|
27
|
+
ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.fixture
|
|
32
|
+
def df():
|
|
33
|
+
return md.DataFrame(
|
|
34
|
+
{
|
|
35
|
+
"A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
|
|
36
|
+
"B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
|
|
37
|
+
"C": pd.Series([1], dtype=np.dtype("int64")),
|
|
38
|
+
},
|
|
39
|
+
index=[1],
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_invalid_dtype(df):
|
|
44
|
+
with pytest.raises(AttributeError):
|
|
45
|
+
df["C"].list.len()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_getitem(df):
|
|
49
|
+
s1 = df["A"].list[1]
|
|
50
|
+
assert isinstance(s1, md.Series)
|
|
51
|
+
assert s1.dtype == pd.ArrowDtype(pa.int32())
|
|
52
|
+
assert s1.shape == (1,)
|
|
53
|
+
assert s1.index_value == df.index_value
|
|
54
|
+
op = s1.op
|
|
55
|
+
assert isinstance(op, SeriesListGetItemOperator)
|
|
56
|
+
assert op.query_index == 1
|
|
57
|
+
assert op.ignore_index_error is False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_getitem_ignore_index_err(df):
|
|
61
|
+
s1 = df["B"].list.get(1)
|
|
62
|
+
assert isinstance(s1, md.Series)
|
|
63
|
+
assert s1.dtype == pd.ArrowDtype(pa.string())
|
|
64
|
+
assert s1.shape == (1,)
|
|
65
|
+
assert s1.index_value == df.index_value
|
|
66
|
+
op = s1.op
|
|
67
|
+
assert isinstance(op, SeriesListGetItemOperator)
|
|
68
|
+
assert op.query_index == 1
|
|
69
|
+
assert op.ignore_index_error is True
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_length(df):
|
|
73
|
+
s1 = df["A"].list.len()
|
|
74
|
+
assert isinstance(s1, md.Series)
|
|
75
|
+
assert s1.dtype == pd.ArrowDtype(pa.int64())
|
|
76
|
+
assert s1.shape == (1,)
|
|
77
|
+
assert s1.index_value == df.index_value
|
|
78
|
+
op = s1.op
|
|
79
|
+
assert isinstance(op, SeriesListLengthOperator)
|
|
@@ -27,6 +27,7 @@ from ...core import OutputType
|
|
|
27
27
|
from ...io.odpsio import build_dataframe_table_meta
|
|
28
28
|
from ...serialization.serializables import (
|
|
29
29
|
BoolField,
|
|
30
|
+
DictField,
|
|
30
31
|
FieldTypes,
|
|
31
32
|
Int64Field,
|
|
32
33
|
ListField,
|
|
@@ -55,6 +56,7 @@ class DataFrameToODPSTable(DataFrameDataStore):
|
|
|
55
56
|
index = BoolField("index", default=True)
|
|
56
57
|
index_label = ListField("index_label", FieldTypes.string, default=None)
|
|
57
58
|
lifecycle = Int64Field("lifecycle", default=None)
|
|
59
|
+
table_properties = DictField("table_properties", default=None)
|
|
58
60
|
|
|
59
61
|
def __init__(self, **kw):
|
|
60
62
|
super().__init__(_output_types=[OutputType.dataframe], **kw)
|
|
@@ -84,6 +86,7 @@ def to_odps_table(
|
|
|
84
86
|
index: bool = True,
|
|
85
87
|
index_label: Union[None, str, List[str]] = None,
|
|
86
88
|
lifecycle: Optional[int] = None,
|
|
89
|
+
table_properties: Optional[dict] = None,
|
|
87
90
|
):
|
|
88
91
|
"""
|
|
89
92
|
Write DataFrame object into a MaxCompute (ODPS) table.
|
|
@@ -122,6 +125,8 @@ def to_odps_table(
|
|
|
122
125
|
names will be used.
|
|
123
126
|
lifecycle: Optional[int]
|
|
124
127
|
Specify lifecycle of the output table.
|
|
128
|
+
table_properties: Optional[dict]
|
|
129
|
+
Specify properties of the output table.
|
|
125
130
|
|
|
126
131
|
Returns
|
|
127
132
|
-------
|
|
@@ -186,5 +191,6 @@ def to_odps_table(
|
|
|
186
191
|
index=index,
|
|
187
192
|
index_label=index_label,
|
|
188
193
|
lifecycle=lifecycle or options.session.table_lifecycle,
|
|
194
|
+
table_properties=table_properties,
|
|
189
195
|
)
|
|
190
196
|
return op(df)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
from typing import TYPE_CHECKING
|
|
15
16
|
|
|
16
17
|
from ...core import BaseMaxFrameAccessor
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import functools
|
|
15
16
|
from typing import Any, Callable, Dict, List, Tuple, Union
|
|
16
17
|
|
|
@@ -19,7 +20,12 @@ import pandas as pd
|
|
|
19
20
|
|
|
20
21
|
from ... import opcodes
|
|
21
22
|
from ...core import OutputType
|
|
22
|
-
from ...serialization.serializables import
|
|
23
|
+
from ...serialization.serializables import (
|
|
24
|
+
DictField,
|
|
25
|
+
FunctionField,
|
|
26
|
+
Int32Field,
|
|
27
|
+
TupleField,
|
|
28
|
+
)
|
|
23
29
|
from ...utils import quiet_stdio
|
|
24
30
|
from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
|
|
25
31
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
@@ -38,7 +44,9 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
38
44
|
_op_type_ = opcodes.APPLY_CHUNK
|
|
39
45
|
|
|
40
46
|
func = FunctionField("func")
|
|
41
|
-
batch_rows = Int32Field("batch_rows")
|
|
47
|
+
batch_rows = Int32Field("batch_rows", default=None)
|
|
48
|
+
args = TupleField("args", default=None)
|
|
49
|
+
kwargs = DictField("kwargs", default=None)
|
|
42
50
|
|
|
43
51
|
def __init__(self, output_type=None, **kw):
|
|
44
52
|
if output_type:
|
|
@@ -104,12 +112,11 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
104
112
|
dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
|
|
105
113
|
output_type=None,
|
|
106
114
|
index=None,
|
|
107
|
-
args=(),
|
|
108
|
-
**kwargs,
|
|
109
115
|
):
|
|
116
|
+
args = self.args or ()
|
|
117
|
+
kwargs = self.kwargs or {}
|
|
110
118
|
# if not dtypes and not skip_infer:
|
|
111
|
-
|
|
112
|
-
self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
|
|
119
|
+
packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
|
|
113
120
|
|
|
114
121
|
# if skip_infer, directly build a frame
|
|
115
122
|
if self.output_types and self.output_types[0] == OutputType.df_or_series:
|
|
@@ -118,8 +125,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
118
125
|
# infer return index and dtypes
|
|
119
126
|
dtypes, index_value, elementwise = self._infer_batch_func_returns(
|
|
120
127
|
df_or_series,
|
|
121
|
-
origin_func=
|
|
122
|
-
packed_func=
|
|
128
|
+
origin_func=self.func,
|
|
129
|
+
packed_func=packed_func,
|
|
123
130
|
given_output_type=output_type,
|
|
124
131
|
given_dtypes=dtypes,
|
|
125
132
|
given_index=index,
|
|
@@ -166,6 +173,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
166
173
|
given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
|
|
167
174
|
given_index: Union[pd.Index, IndexValue],
|
|
168
175
|
given_elementwise: bool = False,
|
|
176
|
+
*args,
|
|
177
|
+
**kwargs,
|
|
169
178
|
):
|
|
170
179
|
inferred_output_type = inferred_dtypes = inferred_index_value = None
|
|
171
180
|
inferred_is_elementwise = False
|
|
@@ -190,7 +199,7 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
190
199
|
try:
|
|
191
200
|
# execute
|
|
192
201
|
with np.errstate(all="ignore"), quiet_stdio():
|
|
193
|
-
infer_result = packed_func(empty_data)
|
|
202
|
+
infer_result = packed_func(empty_data, *args, **kwargs)
|
|
194
203
|
|
|
195
204
|
# if executed successfully, get index and dtypes from returned object
|
|
196
205
|
if inferred_index_value is None:
|
|
@@ -258,7 +267,7 @@ def get_packed_func(df, func, *args, **kwargs) -> Any:
|
|
|
258
267
|
def df_apply_chunk(
|
|
259
268
|
dataframe,
|
|
260
269
|
func: Union[str, Callable],
|
|
261
|
-
batch_rows,
|
|
270
|
+
batch_rows=None,
|
|
262
271
|
dtypes=None,
|
|
263
272
|
dtype=None,
|
|
264
273
|
name=None,
|
|
@@ -462,11 +471,11 @@ def df_apply_chunk(
|
|
|
462
471
|
if not isinstance(func, Callable):
|
|
463
472
|
raise TypeError("function must be a callable object")
|
|
464
473
|
|
|
465
|
-
if not
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
474
|
+
if batch_rows is not None:
|
|
475
|
+
if not isinstance(batch_rows, int):
|
|
476
|
+
raise TypeError("batch_rows must be an integer")
|
|
477
|
+
elif batch_rows <= 0:
|
|
478
|
+
raise ValueError("batch_rows must be greater than 0")
|
|
470
479
|
|
|
471
480
|
dtypes = (name, dtype) if dtype is not None else dtypes
|
|
472
481
|
|
|
@@ -481,15 +490,17 @@ def df_apply_chunk(
|
|
|
481
490
|
|
|
482
491
|
# bind args and kwargs
|
|
483
492
|
op = DataFrameApplyChunkOperator(
|
|
484
|
-
func=func,
|
|
493
|
+
func=func,
|
|
494
|
+
batch_rows=batch_rows,
|
|
495
|
+
output_type=output_type,
|
|
496
|
+
args=args,
|
|
497
|
+
kwargs=kwargs,
|
|
485
498
|
)
|
|
486
499
|
|
|
487
500
|
return op(
|
|
488
501
|
dataframe,
|
|
489
502
|
dtypes=dtypes,
|
|
490
503
|
index=index,
|
|
491
|
-
args=args,
|
|
492
|
-
**kwargs,
|
|
493
504
|
)
|
|
494
505
|
|
|
495
506
|
|
|
@@ -720,7 +731,11 @@ def series_apply_chunk(
|
|
|
720
731
|
output_type = OutputType.df_or_series
|
|
721
732
|
|
|
722
733
|
op = DataFrameApplyChunkOperator(
|
|
723
|
-
func=func,
|
|
734
|
+
func=func,
|
|
735
|
+
batch_rows=batch_rows,
|
|
736
|
+
output_type=output_type,
|
|
737
|
+
args=args,
|
|
738
|
+
kwargs=kwargs,
|
|
724
739
|
)
|
|
725
740
|
|
|
726
741
|
dtypes = (name, dtype) if dtype is not None else dtypes
|
|
@@ -729,6 +744,4 @@ def series_apply_chunk(
|
|
|
729
744
|
dtypes=dtypes,
|
|
730
745
|
output_type=output_type,
|
|
731
746
|
index=index,
|
|
732
|
-
args=args,
|
|
733
|
-
**kwargs,
|
|
734
747
|
)
|
|
@@ -27,7 +27,12 @@ from ...serialization.serializables import (
|
|
|
27
27
|
)
|
|
28
28
|
from ..core import DataFrame
|
|
29
29
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
30
|
-
from ..utils import
|
|
30
|
+
from ..utils import (
|
|
31
|
+
copy_func_scheduling_hints,
|
|
32
|
+
gen_unknown_index_value,
|
|
33
|
+
make_dtypes,
|
|
34
|
+
parse_index,
|
|
35
|
+
)
|
|
31
36
|
|
|
32
37
|
|
|
33
38
|
class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -40,6 +45,8 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
40
45
|
|
|
41
46
|
def __init__(self, output_types=None, **kw):
|
|
42
47
|
super().__init__(_output_types=output_types, **kw)
|
|
48
|
+
if hasattr(self, "func"):
|
|
49
|
+
copy_func_scheduling_hints(self.func, self)
|
|
43
50
|
|
|
44
51
|
def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
|
|
45
52
|
dtypes = make_dtypes(dtypes)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import numpy as np
|
|
15
16
|
import pandas as pd
|
|
16
17
|
import pytest
|
|
@@ -102,7 +103,7 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
|
|
|
102
103
|
assert result.index_value is df1.index_value
|
|
103
104
|
assert result.dtypes.equals(df1.dtypes)
|
|
104
105
|
assert isinstance(result.op.func, MarkedFunction)
|
|
105
|
-
assert result.op.func is
|
|
106
|
+
assert result.op.func is process
|
|
106
107
|
assert result.op.func.resources is process.resources
|
|
107
108
|
assert result.op.func.pythonpacks is process.pythonpacks
|
|
108
109
|
|
|
@@ -303,11 +303,63 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
|
|
|
303
303
|
if aggregated result is very large, 'auto' will use 'shuffle' method
|
|
304
304
|
in distributed mode and use 'tree' in local mode.
|
|
305
305
|
|
|
306
|
-
|
|
307
306
|
Returns
|
|
308
307
|
-------
|
|
309
308
|
Series or DataFrame
|
|
310
309
|
Aggregated result.
|
|
310
|
+
|
|
311
|
+
Examples
|
|
312
|
+
--------
|
|
313
|
+
>>> import maxframe.dataframe as md
|
|
314
|
+
>>> df = md.DataFrame(
|
|
315
|
+
... {
|
|
316
|
+
... "A": [1, 1, 2, 2],
|
|
317
|
+
... "B": [1, 2, 3, 4],
|
|
318
|
+
... "C": [0.362838, 0.227877, 1.267767, -0.562860],
|
|
319
|
+
... }
|
|
320
|
+
... ).execute()
|
|
321
|
+
A B C
|
|
322
|
+
0 1 1 0.362838
|
|
323
|
+
1 1 2 0.227877
|
|
324
|
+
2 2 3 1.267767
|
|
325
|
+
3 2 4 -0.562860
|
|
326
|
+
|
|
327
|
+
The aggregation is for each column.
|
|
328
|
+
|
|
329
|
+
>>> df.groupby('A').agg('min').execute()
|
|
330
|
+
B C
|
|
331
|
+
A
|
|
332
|
+
1 1 0.227877
|
|
333
|
+
2 3 -0.562860
|
|
334
|
+
|
|
335
|
+
Multiple aggregations.
|
|
336
|
+
|
|
337
|
+
>>> df.groupby('A').agg(['min', 'max']).execute()
|
|
338
|
+
B C
|
|
339
|
+
min max min max
|
|
340
|
+
A
|
|
341
|
+
1 1 2 0.227877 0.362838
|
|
342
|
+
2 3 4 -0.562860 1.267767
|
|
343
|
+
|
|
344
|
+
Different aggregations per column.
|
|
345
|
+
|
|
346
|
+
>>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}).execute()
|
|
347
|
+
B C
|
|
348
|
+
min max sum
|
|
349
|
+
A
|
|
350
|
+
1 1 2 0.590715
|
|
351
|
+
2 3 4 0.704907
|
|
352
|
+
|
|
353
|
+
To control the output names with different aggregations per column, pandas supports “named aggregation”:
|
|
354
|
+
|
|
355
|
+
>>> from maxframe.dataframe.groupby import NamedAgg
|
|
356
|
+
>>> df.groupby("A").agg(
|
|
357
|
+
... b_min=NamedAgg(column="B", aggfunc="min"),
|
|
358
|
+
... c_sum=NamedAgg(column="C", aggfunc="sum")).execute()
|
|
359
|
+
b_min c_sum
|
|
360
|
+
A
|
|
361
|
+
1 1 0.590715
|
|
362
|
+
2 3 0.704907
|
|
311
363
|
"""
|
|
312
364
|
|
|
313
365
|
# When perform a computation on the grouped data, we won't shuffle
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
from typing import List, Union
|
|
15
16
|
|
|
16
17
|
import pandas as pd
|
|
@@ -100,8 +101,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
100
101
|
row_length = 0
|
|
101
102
|
for series in objs:
|
|
102
103
|
row_length += series.shape[0]
|
|
103
|
-
if self.ignore_index:
|
|
104
|
-
|
|
104
|
+
if self.ignore_index:
|
|
105
|
+
idx_length = 0 if pd.isna(row_length) else row_length
|
|
106
|
+
index_value = parse_index(pd.RangeIndex(idx_length))
|
|
105
107
|
else:
|
|
106
108
|
index = self._concat_index(objs)
|
|
107
109
|
index_value = parse_index(index, objs)
|
|
@@ -159,8 +161,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
159
161
|
if self.join == "inner":
|
|
160
162
|
objs = [o[list(emtpy_result.columns)] for o in objs]
|
|
161
163
|
|
|
162
|
-
if self.ignore_index:
|
|
163
|
-
|
|
164
|
+
if self.ignore_index:
|
|
165
|
+
idx_length = 0 if pd.isna(row_length) else row_length
|
|
166
|
+
index_value = parse_index(pd.RangeIndex(idx_length))
|
|
164
167
|
else:
|
|
165
168
|
index = self._concat_index(objs)
|
|
166
169
|
index_value = parse_index(index, objs)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import logging
|
|
15
16
|
from abc import abstractmethod
|
|
16
17
|
from collections import namedtuple
|