maxframe-1.0.0rc1-cp39-cp39-win32.whl → maxframe-1.0.0rc3-cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/_utils.cp39-win32.pyd +0 -0
- maxframe/codegen.py +3 -6
- maxframe/config/config.py +49 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +15 -2
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/objects.py +46 -3
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp39-win32.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +7 -1
- maxframe/dataframe/datasource/read_odps_table.py +3 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/to_odps.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +3 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
- maxframe/{odpsio → io/odpsio}/schema.py +15 -12
- maxframe/io/odpsio/tableio.py +702 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +57 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +21 -7
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -17
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp39-win32.pyd +0 -0
- maxframe/protocol.py +41 -17
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp39-win32.pyd +0 -0
- maxframe/serialization/serializables/core.py +48 -9
- maxframe/tensor/__init__.py +69 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +98 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +70 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +2 -2
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/utils.py +11 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +32 -70
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +25 -25
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +60 -68
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +58 -22
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +27 -4
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
maxframe/dataframe/misc/apply.py
CHANGED
```diff
@@ -319,6 +319,7 @@ def df_apply(
     skip_infer=False,
     **kwds,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
     """
     Apply a function along an axis of the DataFrame.
 
@@ -444,20 +445,12 @@ def df_apply(
     B    27
     dtype: int64
 
-    >>> df.apply(np.sum, axis=1).execute()
+    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
     0    13
     1    13
     2    13
     dtype: int64
 
-    Returning a list-like will result in a Series
-
-    >>> df.apply(lambda x: [1, 2], axis=1).execute()
-    0    [1, 2]
-    1    [1, 2]
-    2    [1, 2]
-    dtype: object
-
     Passing ``result_type='expand'`` will expand list-like results
     to columns of a Dataframe
 
@@ -471,7 +464,7 @@ def df_apply(
     ``result_type='expand'``. The resulting column names
     will be the Series index.
 
-    >>> df.apply(lambda x:
+    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
        foo  bar
     0    1    2
     1    1    2
```
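The doctest change above replaces a bare `np.sum` with a lambda that returns a plain Python `int`. As a quick sanity check, the same call behaves identically in plain pandas (a minimal sketch; it assumes maxframe's `df.apply` mirrors the pandas API, with `.execute()` triggering the computation):

```python
# Minimal plain-pandas check of the updated doctest; maxframe's df.apply is
# assumed to mirror this behavior.
import numpy as np
import pandas as pd

df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])

# Wrapping the reduction so every row yields a Python int keeps the output
# dtype unambiguous.
out = df.apply(lambda row: int(np.sum(row)), axis=1)
print(out)
# 0    13
# 1    13
# 2    13
# dtype: int64
```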
maxframe/dataframe/misc/describe.py
CHANGED
```diff
@@ -15,7 +15,7 @@
 import numpy as np
 import pandas as pd
 
-from ... import opcodes
+from ... import opcodes
 from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
 
 
 class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ =
+    _op_type_ = opcodes.DESCRIBE
 
     input = KeyField("input", default=None)
     percentiles = ListField("percentiles", FieldTypes.float64, default=None)
```
maxframe/dataframe/misc/drop_duplicates.py
CHANGED
```diff
@@ -37,16 +37,15 @@ class DataFrameDropDuplicates(DuplicateOperand):
             shape += (3,)
         return shape
 
-
-    def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
+    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
         params = input_params.copy()
-        if op.ignore_index:
+        if op.ignore_index and self._output_types[0] != OutputType.index:
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
                 input_params["index_value"], op.keep, op.subset, type(op).__name__
             )
-        params["shape"] =
+        params["shape"] = self._get_shape(input_params["shape"], op)
         return params
 
     def __call__(self, inp, inplace=False):
@@ -105,6 +104,7 @@ def df_drop_duplicates(
 def series_drop_duplicates(
     series, keep="first", inplace=False, ignore_index=False, method="auto"
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
     """
     Return Series with duplicate values removed.
 
@@ -148,27 +148,6 @@ def series_drop_duplicates(
     5     hippo
     Name: animal, dtype: object
 
-    With the 'keep' parameter, the selection behaviour of duplicated values
-    can be changed. The value 'first' keeps the first occurrence for each
-    set of duplicated entries. The default value of keep is 'first'.
-
-    >>> s.drop_duplicates().execute()
-    0      lame
-    1       cow
-    3    beetle
-    5     hippo
-    Name: animal, dtype: object
-
-    The value 'last' for parameter 'keep' keeps the last occurrence for
-    each set of duplicated entries.
-
-    >>> s.drop_duplicates(keep='last').execute()
-    1       cow
-    3    beetle
-    4      lame
-    5     hippo
-    Name: animal, dtype: object
-
     The value ``False`` for parameter 'keep' discards all sets of
     duplicated entries. Setting the value of 'inplace' to ``True`` performs
     the operation inplace and returns ``None``.
```
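The `_gen_tileable_params` change only resets index metadata when `ignore_index=True` and the output is not itself an Index. In plain-pandas terms, a sketch of the user-visible behavior this models (not of maxframe internals):

```python
# Plain-pandas illustration of the ignore_index semantics: with
# ignore_index=True the result gets a fresh RangeIndex, otherwise the
# original labels are kept.
import pandas as pd

df = pd.DataFrame({"animal": ["cow", "cow", "beetle", "hippo"]}, index=[10, 11, 12, 13])
print(df.drop_duplicates().index)                   # Index([10, 12, 13], dtype='int64')
print(df.drop_duplicates(ignore_index=True).index)  # RangeIndex(start=0, stop=3, step=1)
```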
maxframe/dataframe/misc/eval.py
CHANGED
```diff
@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
         if obj_name in self.env:
             self.referenced_vars.add(obj_name)
             return self.env[obj_name]
+        try:
+            return self.target[obj_name]
+        except KeyError:
+            pass
         raise KeyError(f"name {obj_name} is not defined")
 
     def visit(self, node):
```
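The four added lines give `CollectionVisitor` a fallback: a name that is not a known local variable is looked up as a column of the target object before raising. A minimal standalone sketch of that resolution order (not maxframe's actual class):

```python
# Standalone sketch of the lookup order introduced by the patch: environment
# variables first, then columns of the target frame, then KeyError.
import pandas as pd

def resolve_name(obj_name, env, target):
    if obj_name in env:
        return env[obj_name]
    try:
        return target[obj_name]  # new fallback: treat the name as a column
    except KeyError:
        pass
    raise KeyError(f"name {obj_name} is not defined")

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
print(resolve_name("a", env={}, target=df).tolist())  # [1, 2]
```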
maxframe/dataframe/misc/pct_change.py
CHANGED
```diff
@@ -18,6 +18,7 @@ from ..utils import validate_axis
 def pct_change(
     df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
     """
     Percentage change between the current and a prior element.
 
@@ -50,89 +51,6 @@ def pct_change(
     DataFrame.diff : Compute the difference of two elements in a DataFrame.
     Series.shift : Shift the index by some number of periods.
     DataFrame.shift : Shift the index by some number of periods.
-
-    Examples
-    --------
-    **Series**
-
-    >>> import maxframe.dataframe as md
-
-    >>> s = md.Series([90, 91, 85])
-    >>> s.execute()
-    0    90
-    1    91
-    2    85
-    dtype: int64
-
-    >>> s.pct_change().execute()
-    0         NaN
-    1    0.011111
-    2   -0.065934
-    dtype: float64
-
-    >>> s.pct_change(periods=2).execute()
-    0         NaN
-    1         NaN
-    2   -0.055556
-    dtype: float64
-
-    See the percentage change in a Series where filling NAs with last
-    valid observation forward to next valid.
-
-    >>> s = md.Series([90, 91, None, 85])
-    >>> s.execute()
-    0    90.0
-    1    91.0
-    2     NaN
-    3    85.0
-    dtype: float64
-
-    >>> s.pct_change(fill_method='ffill').execute()
-    0         NaN
-    1    0.011111
-    2    0.000000
-    3   -0.065934
-    dtype: float64
-
-    **DataFrame**
-
-    Percentage change in French franc, Deutsche Mark, and Italian lira from
-    1980-01-01 to 1980-03-01.
-
-    >>> df = md.DataFrame({
-    ...     'FR': [4.0405, 4.0963, 4.3149],
-    ...     'GR': [1.7246, 1.7482, 1.8519],
-    ...     'IT': [804.74, 810.01, 860.13]},
-    ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
-    >>> df.execute()
-                    FR      GR      IT
-    1980-01-01  4.0405  1.7246  804.74
-    1980-02-01  4.0963  1.7482  810.01
-    1980-03-01  4.3149  1.8519  860.13
-
-    >>> df.pct_change().execute()
-                      FR        GR        IT
-    1980-01-01       NaN       NaN       NaN
-    1980-02-01  0.013810  0.013684  0.006549
-    1980-03-01  0.053365  0.059318  0.061876
-
-    Percentage of change in GOOG and APPL stock volume. Shows computing
-    the percentage change between columns.
-
-    >>> df = md.DataFrame({
-    ...     '2016': [1769950, 30586265],
-    ...     '2015': [1500923, 40912316],
-    ...     '2014': [1371819, 41403351]},
-    ...     index=['GOOG', 'APPL'])
-    >>> df.execute()
-              2016      2015      2014
-    GOOG   1769950   1500923   1371819
-    APPL  30586265  40912316  41403351
-
-    >>> df.pct_change(axis='columns').execute()
-          2016      2015      2014
-    GOOG   NaN -0.151997 -0.086016
-    APPL   NaN  0.337604  0.012002
     """
 
     axis = validate_axis(kwargs.pop("axis", 0))
```
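The removed examples still describe the intended semantics; they were dropped from the docstring (see the FIXME issue reference) rather than changed. The first of them reproduces directly in plain pandas:

```python
# Plain-pandas reproduction of the first removed example.
import pandas as pd

s = pd.Series([90, 91, 85])
print(s.pct_change())
# 0         NaN
# 1    0.011111
# 2   -0.065934
# dtype: float64
```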
maxframe/dataframe/misc/transform.py
CHANGED
```diff
@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     0    1    2
     1    2    3
     2    3    4
-
-    Even though the resulting DataFrame must have the same length as the
-    input DataFrame, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
@@ -265,6 +250,7 @@ def series_transform(
     dtype=None,
     **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
     """
     Call ``func`` on self producing a Series with transformed values.
 
@@ -332,21 +318,6 @@ def series_transform(
     0    1    2
     1    2    3
     2    3    4
-
-    Even though the resulting Series must have the same length as the
-    input Series, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
```
maxframe/dataframe/misc/value_counts.py
CHANGED
```diff
@@ -85,6 +85,7 @@ def value_counts(
     dropna=True,
     method="auto",
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
     """
     Return a Series containing counts of unique values.
 
@@ -125,9 +126,8 @@ def value_counts(
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> import
-
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> import numpy as np
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts().execute()
     3.0    2
     4.0    1
@@ -138,7 +138,7 @@ def value_counts(
     With `normalize` set to `True`, returns the relative frequency by
     dividing all values by the sum of values.
 
-    >>> s = md.Series([3, 1, 2, 3, 4,
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts(normalize=True).execute()
     3.0    0.4
     4.0    0.2
@@ -146,19 +146,6 @@ def value_counts(
     1.0    0.2
     dtype: float64
 
-    **bins**
-
-    Bins can be useful for going from a continuous variable to a
-    categorical variable; instead of counting unique
-    apparitions of values, divide the index in the specified
-    number of half-open bins.
-
-    >>> s.value_counts(bins=3).execute()
-    (2.0, 3.0]      2
-    (0.996, 2.0]    2
-    (3.0, 4.0]      1
-    dtype: int64
-
     **dropna**
 
     With `dropna` set to `False` we can also see NaN index values.
```
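The doctest now builds its sample data with `np.nan` instead of `mt.nan`; the expected outputs are unchanged and match plain pandas:

```python
# Plain-pandas check of the repaired doctest inputs.
import numpy as np
import pandas as pd

s = pd.Series([3, 1, 2, 3, 4, np.nan])
print(s.value_counts(normalize=True))
# 3.0 appears twice out of five non-NaN values -> 0.4; the rest are 0.2 each
```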
maxframe/dataframe/missing/dropna.py
CHANGED
```diff
@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
     Empty strings are not considered NA values. ``None`` is considered an
     NA value.
 
-    >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
+    >>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
     >>> ser.execute()
     0       NaN
     1         2
```
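The doctest fix quotes the `2` so the example Series is consistently string-valued; the dropna behavior it illustrates is the usual pandas one:

```python
# Plain-pandas version of the corrected doctest: empty strings survive dropna,
# while np.nan, pd.NaT and None are removed.
import numpy as np
import pandas as pd

ser = pd.Series([np.nan, "2", pd.NaT, "", None, "I stay"])
print(ser.dropna())
# 1         2
# 3
# 5    I stay
# dtype: object
```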
maxframe/dataframe/missing/fillna.py
CHANGED
```diff
@@ -132,11 +132,11 @@ def fillna(
     --------
     >>> import maxframe.tensor as mt
     >>> import maxframe.dataframe as md
-    >>> df = md.DataFrame([[
-
-
-
-
+    >>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
+                           [3, 4, np.nan, 1],
+                           [np.nan, np.nan, np.nan, 5],
+                           [np.nan, 3, np.nan, 4]],
+                          columns=list('ABCD'))
     >>> df.execute()
          A    B   C  D
     0  NaN  2.0 NaN  0
```
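The restored doctest input is a standard pandas frame; a quick plain-pandas check of the construction it relies on:

```python
# Plain-pandas construction matching the repaired doctest input.
import numpy as np
import pandas as pd

df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5],
                   [np.nan, 3, np.nan, 4]],
                  columns=list("ABCD"))
print(df.fillna(0))  # replaces every NaN with 0
```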
maxframe/dataframe/operators.py
CHANGED
```diff
@@ -16,13 +16,7 @@ import numpy as np
 import pandas as pd
 
 from ..core import ENTITY_TYPE, OutputType
-from ..core.operator import (
-    Fuse,
-    FuseChunkMixin,
-    Operator,
-    ShuffleProxy,
-    TileableOperatorMixin,
-)
+from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
 from ..tensor.core import TENSOR_TYPE
 from ..tensor.datasource import tensor as astensor
 from .core import DATAFRAME_TYPE, SERIES_TYPE
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
 class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
     def __init__(self, sparse=None, output_types=None, **kwargs):
         super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
-
-
-class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
-    __slots__ = ()
-
-
-class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
-    @property
-    def output_types(self):
-        return self.outputs[-1].chunk.op.output_types
```
maxframe/dataframe/reduction/core.py
CHANGED
```diff
@@ -552,7 +552,7 @@ class ReductionCompiler:
     @enter_mode(build=True)
     def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
         from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
-        from ...tensor.
+        from ...tensor.misc import TensorWhere
         from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
         from ..datasource.dataframe import DataFrameDataSource
         from ..datasource.series import SeriesDataSource
@@ -679,8 +679,8 @@ class ReductionCompiler:
         ]
         """
         from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
-        from ...tensor.base import TensorWhere
         from ...tensor.datasource import Scalar
+        from ...tensor.misc import TensorWhere
         from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
         from ..datasource.dataframe import DataFrameDataSource
         from ..datasource.series import SeriesDataSource
```
maxframe/dataframe/sort/sort_values.py
CHANGED
```diff
@@ -67,6 +67,7 @@ def dataframe_sort_values(
     parallel_kind="PSRS",
     psrs_kinds=None,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
     """
     Sort by the values along either axis.
 
@@ -152,17 +153,6 @@ def dataframe_sort_values(
     0    A     2     0
     1    A     1     1
     3  NaN     8     4
-
-    Putting NAs first
-
-    >>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
-      col1  col2  col3
-    3  NaN     8     4
-    4    D     7     2
-    5    C     4     3
-    2    B     9     9
-    0    A     2     0
-    1    A     1     1
     """
 
     if na_position not in ["last", "first"]:  # pragma: no cover
```
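The removed `na_position='first'` example still reflects standard pandas behavior, which the rest of the docstring follows:

```python
# Plain-pandas reproduction of the removed na_position example.
import numpy as np
import pandas as pd

df = pd.DataFrame({"col1": ["A", "A", "B", np.nan, "D", "C"],
                   "col2": [2, 1, 9, 8, 7, 4],
                   "col3": [0, 1, 9, 4, 2, 3]})
print(df.sort_values(by="col1", ascending=False, na_position="first"))
#   col1  col2  col3
# 3  NaN     8     4
# 4    D     7     2
# 5    C     4     3
# 2    B     9     9
# 0    A     2     0
# 1    A     1     1
```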
maxframe/dataframe/statistics/quantile.py
CHANGED
```diff
@@ -14,8 +14,9 @@
 
 import numpy as np
 import pandas as pd
+from pandas.core.dtypes.cast import find_common_type
 
-from ... import opcodes
+from ... import opcodes
 from ...core import ENTITY_TYPE
 from ...serialization.serializables import (
     AnyField,
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
 from ...tensor.statistics.quantile import quantile as tensor_quantile
 from ..core import DATAFRAME_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_empty_df,
+from ..utils import build_empty_df, parse_index, validate_axis
 
 
 class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ =
+    _op_type_ = opcodes.QUANTILE
 
     input = KeyField("input", default=None)
     q = AnyField("q", default=None)
@@ -259,6 +260,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):
 
 
 def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
+    # FIXME: Timedelta not support. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
     """
     Return values at the given quantile over requested axis.
 
@@ -309,20 +311,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
            a     b
     0.1  1.3   3.7
     0.5  2.5  55.0
-
-    Specifying `numeric_only=False` will also compute the quantile of
-    datetime and timedelta data.
-
-    >>> df = md.DataFrame({'A': [1, 2],
-    ...                    'B': [md.Timestamp('2010'),
-    ...                          md.Timestamp('2011')],
-    ...                    'C': [md.Timedelta('1 days'),
-    ...                          md.Timedelta('2 days')]})
-    >>> df.quantile(0.5, numeric_only=False).execute()
-    A                    1.5
-    B    2010-07-02 12:00:00
-    C        1 days 12:00:00
-    Name: 0.5, dtype: object
     """
     if isinstance(q, ENTITY_TYPE):
         q = astensor(q)
```
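Only the datetime/timedelta example was dropped (see the FIXME about ODPS rejecting `duration[ns]`); the numeric example that remains in the docstring matches plain pandas:

```python
# Plain-pandas check of the numeric quantile example kept in the docstring.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})
print(df.quantile([0.1, 0.5]))
#        a     b
# 0.1  1.3   3.7
# 0.5  2.5  55.0
```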
maxframe/dataframe/utils.py
CHANGED
```diff
@@ -26,7 +26,6 @@ import numpy as np
 import pandas as pd
 from pandas.api.extensions import ExtensionDtype
 from pandas.api.types import is_string_dtype
-from pandas.core.dtypes.cast import find_common_type
 from pandas.core.dtypes.inference import is_dict_like, is_list_like
 
 from ..core import Entity, ExecutableTuple
@@ -477,11 +476,11 @@ def build_df(df_obj, fill_value=1, size=1, ensure_string=False):
     else:
         fill_values = fill_value
 
-    from .core import SERIES_TYPE
+    from .core import INDEX_TYPE, SERIES_TYPE
 
     dtypes = (
         pd.Series([df_obj.dtype], index=[df_obj.name])
-        if isinstance(df_obj, SERIES_TYPE)
+        if isinstance(df_obj, (INDEX_TYPE, SERIES_TYPE))
         else df_obj.dtypes
     )
     for size, fill_value in zip(sizes, fill_values):
@@ -593,7 +592,7 @@ def build_series(
     return ret_series
 
 
-def infer_index_value(left_index_value, right_index_value):
+def infer_index_value(left_index_value, right_index_value, level=None):
     from .core import IndexValue
 
     if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance(
@@ -616,9 +615,7 @@ def infer_index_value(left_index_value, right_index_value):
 
     left_index = left_index_value.to_pandas()
     right_index = right_index_value.to_pandas()
-    out_index =
-        [], dtype=find_common_type([left_index.dtype, right_index.dtype])
-    )
+    out_index = left_index.join(right_index, level=level)[:0]
     return parse_index(out_index, left_index_value, right_index_value)
 
 
```
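`infer_index_value` now derives the common dtype by joining the two pandas indexes and slicing the result to zero length, instead of building an empty index with `find_common_type`. The trick is plain pandas (a sketch, assuming `to_pandas()` yields ordinary `pd.Index` objects):

```python
# Plain-pandas sketch of the new dtype inference: an empty slice of the joined
# index already carries the common dtype of the two inputs.
import pandas as pd

left = pd.Index([1, 2, 3])        # int64
right = pd.Index([2.0, 3.5])      # float64
out = left.join(right)[:0]        # empty index with the promoted dtype
print(out.dtype)                  # float64
print(len(out))                   # 0
```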
maxframe/io/objects/__init__.py
ADDED
```diff
@@ -0,0 +1,24 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    AbstractObjectIOHandler,
+    get_object_io_handler,
+    register_object_io_handler,
+)
+
+# isort: off
+from . import tensor
+
+del tensor
```