maxframe-1.0.0rc4-cp310-cp310-win32.whl → maxframe-1.1.1-cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/_utils.cp310-win32.pyd +0 -0
- maxframe/config/__init__.py +1 -1
- maxframe/config/config.py +26 -0
- maxframe/config/tests/test_config.py +20 -1
- maxframe/conftest.py +17 -4
- maxframe/core/graph/core.cp310-win32.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +65 -35
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/__init__.py +1 -1
- maxframe/io/odpsio/arrow.py +51 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +80 -124
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/mmh3.cp310-win32.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cp310-win32.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +70 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +12 -2
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/merge/vstack.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +42 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +23 -8
- maxframe_client/session/odps.py +40 -11
- maxframe_client/session/task.py +6 -25
- maxframe_client/session/tests/test_task.py +35 -6
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/indexing/loc.py
CHANGED

@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
 from ...serialization.serializables import AnyField, KeyField, ListField
 from ...tensor.datasource import asarray
 from ...tensor.utils import calc_sliced_size, filter_inputs
-from ...utils import is_full_slice, lazy_import
+from ...utils import is_full_slice, lazy_import, pd_release_version
 from ..core import DATAFRAME_TYPE, IndexValue
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
 from .iloc import DataFrameIlocSetItem

 cudf = lazy_import("cudf")
+with_slice_locs_kind = pd_release_version < (1, 4, 0)


 def process_loc_indexes(inp, indexes, fetch_index: bool = True):

@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
         if axis == 1:
             param["dtypes"] = inp.dtypes
         elif input_index_value.has_value():
-            start, end = pd_index.slice_locs(
-                index.start, index.stop, index.step, kind="loc"
-            )
+            kw = {}
+            if with_slice_locs_kind:
+                kw["kind"] = "loc"
+            start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
             slc = slice(start, end, index.step)
             size = calc_sliced_size(inp.shape[axis], slc)
             param["shape"] = size
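Background for the with_slice_locs_kind gate above: pandas deprecated the kind argument of Index.slice_locs in 1.4 and later removed it, so the keyword is only passed on older pandas. A minimal pandas-only sketch, not maxframe code:

    import pandas as pd

    idx = pd.Index([1, 3, 5, 7, 9])
    # on pandas >= 1.4 the call is made without the deprecated `kind` keyword
    start, end = idx.slice_locs(3, 7)
    print(start, end)      # 1 4
    print(idx[start:end])  # Index([3, 5, 7], dtype='int64')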
maxframe/dataframe/merge/__init__.py
CHANGED

@@ -14,7 +14,15 @@

 from .append import DataFrameAppend, append
 from .concat import DataFrameConcat, concat
-from .merge import
+from .merge import (
+    DataFrameMerge,
+    DataFrameMergeAlign,
+    DistributedMapJoinHint,
+    MapJoinHint,
+    SkewJoinHint,
+    join,
+    merge,
+)


 def _install():
maxframe/dataframe/merge/concat.py
CHANGED

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import List, Union

 import pandas as pd


@@ -24,6 +25,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...utils import lazy_import
+from ..core import DataFrame, Series
 from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis


@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
         return self.names

     @classmethod
-    def _concat_index(cls,
+    def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
+        concat_index = None
+        all_indexes_have_value = all(
+            input.index_value.has_value() for input in df_or_series_list
+        )
+
+        def _concat(prev_index: pd.Index, cur_index: pd.Index):
+            if prev_index is None:
+                return cur_index
+
+            if (
+                all_indexes_have_value
+                and isinstance(prev_index, pd.RangeIndex)
+                and isinstance(cur_index, pd.RangeIndex)
+            ):
+                # handle RangeIndex that append may generate huge amount of data
+                # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
+                # will generate a Int64Index full of data
+                # for details see GH#1647
+                prev_stop = prev_index.start + prev_index.size * prev_index.step
+                cur_start = cur_index.start
+                if prev_stop == cur_start and prev_index.step == cur_index.step:
+                    # continuous RangeIndex, still return RangeIndex
+                    return prev_index.append(cur_index)
+                else:
+                    # otherwise, return an empty index
+                    return pd.Index([], dtype=prev_index.dtype)
+            elif isinstance(prev_index, pd.RangeIndex):
+                return pd.Index([], prev_index.dtype).append(cur_index)
+            elif isinstance(cur_index, pd.RangeIndex):
+                return prev_index.append(pd.Index([], cur_index.dtype))
+            return prev_index.append(cur_index)
+
+        for input in df_or_series_list:
+            concat_index = _concat(concat_index, input.index_value.to_pandas())
+
+        return concat_index

     def _call_series(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             for series in objs:
-                if index is None:
-                    index = series.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, series.index_value.to_pandas())
                 row_length += series.shape[0]
             if self.ignore_index: # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
             obj_names = {obj.name for obj in objs}
             return self.new_series(

@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
     def _call_dataframes(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             empty_dfs = []
             for df in objs:
-                if index is None:
-                    index = df.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, df.index_value.to_pandas())
                 row_length += df.shape[0]
                 if df.ndim == 2:
                     empty_dfs.append(build_empty_df(df.dtypes))

@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             if self.ignore_index: # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)

             new_objs = []
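A pandas-only sketch of the RangeIndex handling that the new _concat_index helper applies (this mirrors the logic shown above; it is not the maxframe code itself): contiguous RangeIndex inputs stay a RangeIndex when appended, anything else collapses to a cheap placeholder index so the concatenated metadata does not materialize every label:

    import pandas as pd

    def concat_index(prev: pd.Index, cur: pd.Index) -> pd.Index:
        if isinstance(prev, pd.RangeIndex) and isinstance(cur, pd.RangeIndex):
            prev_stop = prev.start + prev.size * prev.step
            if prev_stop == cur.start and prev.step == cur.step:
                # contiguous ranges: recent pandas keeps the RangeIndex representation
                return prev.append(cur)
            # non-contiguous ranges: return an empty placeholder instead
            return pd.Index([], dtype=prev.dtype)
        return prev.append(cur)

    print(concat_index(pd.RangeIndex(5), pd.RangeIndex(5, 10)))  # RangeIndex(start=0, stop=10, step=1)
    print(concat_index(pd.RangeIndex(5), pd.RangeIndex(7, 10)))  # Index([], dtype='int64')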
maxframe/dataframe/merge/tests/test_merge.py
CHANGED

@@ -16,6 +16,7 @@ import numpy as np
 import pandas as pd
 import pytest

+from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat

@@ -161,7 +162,7 @@ def test_append():
     adf = mdf1.append(mdf2)

     assert adf.shape == (20, 4)
+    assert_mf_index_dtype(adf.index_value.value, np.int64)

     mdf1 = from_pandas(df1, chunk_size=3)
     mdf2 = from_pandas(df2, chunk_size=3)

@@ -181,6 +182,7 @@ def test_concat():
     r = concat([mdf1, mdf2], axis="index")

     assert r.shape == (20, 4)
+    assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
     pd.testing.assert_series_equal(r.dtypes, df1.dtypes)

     df3 = pd.DataFrame(
maxframe/dataframe/misc/apply.py
CHANGED

@@ -35,6 +35,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtype,
     make_dtypes,
     pack_func_args,

@@ -79,6 +80,8 @@ class ApplyOperator(
         if output_type:
             kw["_output_types"] = [output_type]
         super().__init__(**kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)

     def _update_key(self):
         values = [v for v in self._values_ if v is not self.func] + [
maxframe/dataframe/misc/drop_duplicates.py
CHANGED

@@ -43,7 +43,11 @@ class DataFrameDropDuplicates(DuplicateOperand):
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
-                input_params["index_value"],
+                input_params["index_value"],
+                op.keep,
+                op.subset,
+                type(op).__name__,
+                normalize_range_index=True,
             )
         params["shape"] = self._get_shape(input_params["shape"], op)
         return params
maxframe/dataframe/misc/map.py
CHANGED

@@ -24,7 +24,7 @@ from ...serialization.serializables import AnyField, KeyField, StringField
 from ...utils import quiet_stdio
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_series
+from ..utils import build_series, copy_func_scheduling_hints


 class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):

@@ -38,6 +38,8 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
         if not self.output_types:
             self.output_types = [OutputType.series]
+        if hasattr(self, "arg"):
+            copy_func_scheduling_hints(self.arg, self)

     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
maxframe/dataframe/misc/tests/test_misc.py
CHANGED

@@ -20,6 +20,7 @@ from .... import opcodes
 from ....core import OutputType
 from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
+from ....udf import with_running_options
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
 from ...arithmetic import DataFrameGreater, DataFrameLess

@@ -65,6 +66,17 @@ def test_transform():
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe

+    def transform_df_with_param(row, param, k):
+        assert param == 5
+        assert k == "6"
+        return row
+
+    r = df.transform(transform_df_with_param, 1, 5, k="6")
+    assert all(v == np.dtype("int64") for v in r.dtypes) is True
+    assert r.shape == df.shape
+    assert r.op._op_type_ == opcodes.TRANSFORM
+    assert r.op.output_types[0] == OutputType.dataframe
+
     r = df.transform(lambda x: list(range(len(x))), axis=1)
     assert all(v == np.dtype("int64") for v in r.dtypes) is True
     assert r.shape == df.shape

@@ -349,7 +361,9 @@ def test_drop():
 def test_drop_duplicates():
     rs = np.random.RandomState(0)
     raw = pd.DataFrame(
-        rs.randint(1000, size=(20, 7)),
+        rs.randint(1000, size=(20, 7)),
+        columns=["c" + str(i + 1) for i in range(7)],
+        index=pd.Index(range(20), name="idx"),
     )
     raw["c7"] = [f"s{j}" for j in range(20)]

@@ -361,6 +375,12 @@ def test_drop_duplicates():
     with pytest.raises(KeyError):
         df.drop_duplicates(subset="c8")

+    # check index
+    distinct_df = df.drop_duplicates()
+    assert distinct_df.index_value.name == df.index_value.name
+    assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
+    assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
+
     s = df["c7"]
     with pytest.raises(ValueError):
         s.drop_duplicates(method="unknown")

@@ -436,6 +456,7 @@ def test_apply():

     keys = [1, 2]

+    @with_running_options(engine="spe")
     def f(x, keys):
         if x["a"] in keys:
             return [1, 0]

@@ -451,6 +472,7 @@ def test_apply():
         keys=keys,
     )
     assert apply_df.shape == (3, 2)
+    assert apply_df.op.expect_engine == "SPE"


 def test_pivot_table():

@@ -474,7 +496,7 @@ def test_pivot_table():
     with pytest.raises(ValueError):
         df.pivot_table(values=["D", "E"], aggfunc="sum")

-    t = df.pivot_table(index="A")
+    t = df.pivot_table(index=["A", "B", "C"])
     assert isinstance(t.op, DataFrameGroupByAgg)
     t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
     assert isinstance(t.op, DataFrameGroupByAgg)
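Sketch of how the scheduling hint in test_apply above is intended to flow, using only names visible in this diff (with_running_options on the UDF, expect_engine on the operator, copied over by copy_func_scheduling_hints); treat the surrounding API as illustrative rather than authoritative:

    import maxframe.dataframe as md
    from maxframe.udf import with_running_options

    @with_running_options(engine="spe")
    def add_one(row):
        return row + 1

    df = md.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    applied = df.apply(add_one, axis=1)
    # per the test above, the hint attached to the UDF ends up on the operator
    print(applied.op.expect_engine)  # "SPE"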
maxframe/dataframe/misc/transform.py
CHANGED

@@ -27,6 +27,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtypes,
     pack_func_args,
     parse_index,

@@ -49,10 +50,12 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):

     def __init__(self, output_types=None, memory_scale=None, **kw):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)

     def _infer_df_func_returns(self, df, dtypes):
-        packed_funcs = self.
-        test_df =
+        packed_funcs = self.func
+        test_df = _build_stub_pandas_obj(df, self.output_types[0])
         if self.output_types[0] == OutputType.dataframe:
             try:
                 with np.errstate(all="ignore"), quiet_stdio():

@@ -147,16 +150,18 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
             index_value=new_index_value,
         )

-    def get_packed_funcs(self, df=None) -> Any:
-        stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
-        return pack_func_args(stub_df, self.func, *self.args, **self.kwds)

+def get_packed_funcs(df, output_type, func, *args, **kwds) -> Any:
+    stub_df = _build_stub_pandas_obj(df, output_type)
+    return pack_func_args(stub_df, func, *args, **kwds)
+
+
+def _build_stub_pandas_obj(df, output_type) -> Union[DataFrame, Series]:
+    # TODO: Simulate a dataframe with the corresponding indexes if self.func is
+    # a dict and axis=1
+    if output_type == OutputType.dataframe:
+        return build_df(df, fill_value=1, size=1)
+    return build_series(df, size=1, name=df.name)


 def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):

@@ -229,13 +234,15 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     1  2  3
     2  3  4
     """
+    call_agg = kwargs.pop("_call_agg", False)
+    func = get_packed_funcs(df, OutputType.dataframe, func, *args, **kwargs)
     op = TransformOperator(
         func=func,
         axis=axis,
         args=args,
         kwds=kwargs,
         output_types=[OutputType.dataframe],
-        call_agg=
+        call_agg=call_agg,
     )
     return op(df, dtypes=dtypes, skip_infer=skip_infer)

@@ -319,6 +326,8 @@ def series_transform(
     1  2  3
     2  3  4
     """
+    call_agg = kwargs.pop("_call_agg", False)
+    func = get_packed_funcs(series, OutputType.series, func, *args, **kwargs)
     op = TransformOperator(
         func=func,
         axis=axis,

@@ -326,7 +335,7 @@ def series_transform(
         args=args,
         kwds=kwargs,
         output_types=[OutputType.series],
-        call_agg=
+        call_agg=call_agg,
     )
     dtypes = (series.name, dtype) if dtype is not None else None
     return op(series, dtypes=dtypes, skip_infer=skip_infer)
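The df_transform/series_transform changes above pack extra positional and keyword arguments into the function before the operator is created; the user-facing behaviour matches plain pandas, shown here for reference:

    import pandas as pd

    def scale(col, factor, offset=0):
        return col * factor + offset

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    # pandas signature: DataFrame.transform(func, axis=0, *args, **kwargs)
    print(df.transform(scale, 0, 10, offset=1))
    #     a   b
    # 0  11  41
    # 1  21  51
    # 2  31  61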
maxframe/dataframe/reduction/__init__.py
CHANGED

@@ -25,6 +25,7 @@ from .custom_reduction import DataFrameCustomReduction
 from .kurtosis import DataFrameKurtosis
 from .max import DataFrameMax
 from .mean import DataFrameMean
+from .median import DataFrameMedian
 from .min import DataFrameMin
 from .nunique import DataFrameNunique
 from .prod import DataFrameProd

@@ -50,6 +51,7 @@ def _install():
     from .kurtosis import kurt_dataframe, kurt_series
     from .max import max_dataframe, max_index, max_series
     from .mean import mean_dataframe, mean_series
+    from .median import median_dataframe, median_series
     from .min import min_dataframe, min_index, min_series
     from .nunique import nunique_dataframe, nunique_series
     from .prod import prod_dataframe, prod_series

@@ -68,6 +70,7 @@ def _install():
         ("min", min_series, min_dataframe),
         ("count", count_series, count_dataframe),
         ("mean", mean_series, mean_dataframe),
+        ("median", median_series, median_dataframe),
         ("var", var_series, var_dataframe),
         ("std", std_series, std_dataframe),
         ("all", all_series, all_dataframe),
maxframe/dataframe/reduction/aggregation.py
CHANGED

@@ -71,6 +71,7 @@ _agg_functions = {
     "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
     "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
     "nunique": lambda x: x.nunique(),
+    "median": lambda x, skipna=True: x.median(skipna=skipna),
 }

maxframe/dataframe/reduction/median.py
ADDED

@@ -0,0 +1,56 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ... import opcodes
+from ...core import OutputType
+from .core import DataFrameReductionMixin, DataFrameReductionOperator
+
+
+class DataFrameMedian(DataFrameReductionOperator, DataFrameReductionMixin):
+    _op_type_ = opcodes.MEDIAN
+    _func_name = "median"
+
+    @property
+    def is_atomic(self):
+        return True
+
+
+def median_series(df, axis=None, skipna=True, level=None, method=None):
+    op = DataFrameMedian(
+        axis=axis,
+        skipna=skipna,
+        level=level,
+        output_types=[OutputType.scalar if level is not None else OutputType.scalar],
+        method=method,
+    )
+    return op(df)
+
+
+def median_dataframe(
+    df,
+    axis=0,
+    skipna=True,
+    level=None,
+    numeric_only=None,
+    method=None,
+):
+    op = DataFrameMedian(
+        axis=axis,
+        skipna=skipna,
+        level=level,
+        numeric_only=numeric_only,
+        output_types=[OutputType.dataframe if level is not None else OutputType.series],
+        method=method,
+    )
+    return op(df)
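The new median reduction follows pandas semantics (the _agg_functions entry above simply calls x.median(skipna=skipna)); a quick pandas reference for the expected results:

    import pandas as pd

    s = pd.Series([1, 3, 5, 7])
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10.0, 20.0, 30.0, 40.0]})
    print(s.median())                  # 4.0
    print(df.median())                 # a: 2.5, b: 25.0
    print(df.agg(["mean", "median"]))  # "median" is also accepted inside agg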
maxframe/dataframe/reduction/tests/test_reduction.py
CHANGED

@@ -23,6 +23,7 @@ import pytest

 from .... import dataframe as md
 from ....tensor import Tensor
+from ....tests.utils import assert_mf_index_dtype
 from ...core import DataFrame, IndexValue, OutputType, Series
 from ...datasource.dataframe import from_pandas as from_pandas_df
 from ...datasource.series import from_pandas as from_pandas_series

@@ -38,6 +39,7 @@ from .. import (
     DataFrameKurtosis,
     DataFrameMax,
     DataFrameMean,
+    DataFrameMedian,
     DataFrameMin,
     DataFrameNunique,
     DataFrameProd,

@@ -71,6 +73,7 @@ reduction_functions = [
     ("sem", DataFrameSem, FunctionOptions()),
     ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
     ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
+    ("median", DataFrameMedian, FunctionOptions()),
 ]


@@ -111,10 +114,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
     reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()

     assert isinstance(reduction_df, Series)
-    assert isinstance(
-        reduction_df.index_value._index_value,
-        (IndexValue.RangeIndex, IndexValue.Int64Index),
-    )
+    assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
     assert reduction_df.shape == (10,)

     data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])

@@ -210,6 +210,7 @@ def test_dataframe_aggregate():
         "skew",
         "kurt",
         "sem",
+        "median",
     ]

     df = from_pandas_df(data)

@@ -253,7 +254,7 @@ def test_dataframe_aggregate():
     assert result.op.output_types[0] == OutputType.dataframe
     assert result.op.func == agg_funcs

-    dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]}
+    dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std", "median"]}
     all_cols = set(
         reduce(
             operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()]

@@ -268,9 +269,9 @@ def test_dataframe_aggregate():
     assert result.op.func[2] == dict_fun[2]

     with pytest.raises(TypeError):
-        df.agg(sum_0="sum", mean_0="mean")
+        df.agg(sum_0="sum", mean_0="mean", median_0="median")
     with pytest.raises(NotImplementedError):
-        df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1)
+        df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std", "median"]}, axis=1)


 def test_series_aggregate():

@@ -287,6 +288,7 @@ def test_series_aggregate():
         "skew",
         "kurt",
         "sem",
+        "median",
     ]

     series = from_pandas_series(data)

@@ -303,6 +305,14 @@ def test_series_aggregate():
     assert result.shape == ()
     assert result.op.output_types[0] == OutputType.scalar

+    result = series.agg("median")
+    assert result.shape == ()
+    assert result.op.output_types[0] == OutputType.scalar
+
+    result = series.median(level=0)
+    assert result.shape == (np.nan,)
+    assert result.op.output_types[0] == OutputType.series
+
     result = series.agg(agg_funcs)
     assert result.shape == (len(agg_funcs),)
     assert list(result.index_value.to_pandas()) == agg_funcs
maxframe/dataframe/statistics/quantile.py
CHANGED

@@ -81,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            pd_index = pd.Index(q_val)
+            if q_val.ndim == 0:
+                pd_index = pd.Index(q_val.reshape(1))
+            else:
+                pd_index = pd.Index(q_val)
             name = self.q if q_val.size == 1 else None
             store_index_value = True
             tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)

@@ -164,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            index_val = pd.Index(q_val)
+            if q_val.ndim == 0:
+                index_val = pd.Index(q_val.reshape(1))
+            else:
+                index_val = pd.Index(q_val)
             store_index_value = True

             # get dtype by tensor
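On the ndim check added above: np.asanyarray(0.3) produces a 0-d array when q is a scalar, which is why the code reshapes it to length 1 before building a pandas Index. A numpy/pandas illustration:

    import numpy as np
    import pandas as pd

    q_val = np.asanyarray(0.3)          # scalar q becomes a 0-d array
    print(q_val.ndim)                   # 0
    print(pd.Index(q_val.reshape(1)))   # Index([0.3], dtype='float64')
    q_val = np.asanyarray([0.3, 0.7])   # list q is already 1-d
    print(pd.Index(q_val))              # Index([0.3, 0.7], dtype='float64')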
maxframe/dataframe/statistics/tests/test_statistics.py
CHANGED

@@ -49,7 +49,7 @@ def test_dataframe_quantile():

     # q = 0.3, axis = 0
     r = s.quantile(0.3)
-    e = raw.quantile(0.3)
+    e = raw.quantile(0.3, numeric_only=True)
     assert isinstance(r, Series)
     assert r.shape == (2,)
     assert r.dtype == e.dtype

@@ -57,7 +57,7 @@ def test_dataframe_quantile():

     # q = 0.3, axis = 1
     r = s.quantile(0.3, axis=1)
-    e = raw.quantile(0.3, axis=1)
+    e = raw.quantile(0.3, numeric_only=True, axis=1)
     assert isinstance(r, Series)
     assert r.shape == e.shape
     assert r.dtype == e.dtype

@@ -65,7 +65,7 @@ def test_dataframe_quantile():

     # q = [0.3, 0.7], axis = 0
     r = s.quantile([0.3, 0.7])
-    e = raw.quantile([0.3, 0.7])
+    e = raw.quantile([0.3, 0.7], numeric_only=True)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)

@@ -74,7 +74,7 @@ def test_dataframe_quantile():

     # q = [0.3, 0.7], axis = 1
     r = s.quantile([0.3, 0.7], axis=1)
-    e = raw.quantile([0.3, 0.7], axis=1)
+    e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)