maxframe 0.1.0b5__cp37-cp37m-win32.whl → 1.0.0rc2__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/codegen.py +6 -2
- maxframe/config/config.py +38 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/__init__.py +0 -3
- maxframe/core/entity/__init__.py +1 -8
- maxframe/core/entity/objects.py +3 -45
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +6 -0
- maxframe/dataframe/datasource/read_odps_table.py +2 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/tests/__init__.py +13 -0
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +21 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +23 -0
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
- maxframe/learn/contrib/xgboost/predict.py +2 -2
- maxframe/learn/contrib/xgboost/train.py +2 -2
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +8 -4
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +388 -14
- maxframe/odpsio/tests/test_schema.py +16 -15
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/protocol.py +148 -12
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +54 -25
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +7 -2
- maxframe/serialization/serializables/core.py +158 -12
- maxframe/serialization/serializables/tests/test_serializable.py +46 -4
- maxframe/tensor/__init__.py +59 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
- maxframe/tensor/base/atleast_1d.py +1 -1
- maxframe/tensor/base/unique.py +3 -3
- maxframe/tensor/reduction/count_nonzero.py +1 -1
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +11 -2
- maxframe/utils.py +24 -13
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
- maxframe_client/__init__.py +0 -1
- maxframe_client/fetcher.py +38 -27
- maxframe_client/session/odps.py +50 -10
- maxframe_client/session/task.py +41 -20
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +49 -2
- maxframe_client/clients/spe.py +0 -104
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/dataframe/merge/tests/test_merge.py
CHANGED

@@ -19,6 +19,7 @@ import pytest
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
+from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint


 def test_merge():
@@ -30,14 +31,39 @@ def test_merge():
     mdf1 = from_pandas(df1, chunk_size=2)
     mdf2 = from_pandas(df2, chunk_size=3)

+    mapjoin = MapJoinHint()
+    dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
+    skew_join1 = SkewJoinHint()
+    skew_join2 = SkewJoinHint(columns=[0])
+    skew_join3 = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])
+    skew_join4 = SkewJoinHint(columns=[{"a": 4, "b": "test"}, {"a": 5, "b": "hello"}])
+
     parameters = [
         {},
         {"how": "left", "right_on": "x", "left_index": True},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": mapjoin,
+            "right_hint": mapjoin,
+        },
         {"how": "right", "left_on": "a", "right_index": True},
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": mapjoin,
+            "right_hint": dist_mapjoin1,
+        },
         {"how": "left", "left_on": "a", "right_on": "x"},
+        {"how": "left", "left_on": "a", "right_on": "x", "left_hint": skew_join1},
         {"how": "right", "left_on": "a", "right_index": True},
+        {"how": "right", "left_on": "a", "right_index": True, "right_hint": skew_join2},
         {"how": "right", "on": "a"},
+        {"how": "right", "on": "a", "right_hint": skew_join3},
         {"how": "inner", "on": ["a", "b"]},
+        {"how": "inner", "on": ["a", "b"], "left_hint": skew_join4},
     ]

     for kw in parameters:
@@ -213,3 +239,100 @@ def test_concat():
     mdf2 = from_pandas(df2, chunk_size=3)
     r = concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
+
+
+def test_invalid_join_hint():
+    df1 = pd.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    )
+    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
+
+    mdf1 = from_pandas(df1, chunk_size=2)
+    mdf2 = from_pandas(df2, chunk_size=3)
+
+    # type error
+    parameters = [
+        {"how": "left", "right_on": "x", "left_index": True, "left_hint": [1]},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": {"key": "value"},
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=2),
+        },
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns="a"),
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=["0", []]),
+        },
+    ]

+    for kw in parameters:
+        print(kw)
+        with pytest.raises(TypeError):
+            mdf1.merge(mdf2, **kw)
+
+    # value error
+    parameters = [
+        # mapjoin can't work together with skew join
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": MapJoinHint(),
+            "right_hint": SkewJoinHint(),
+        },
+        # a right join can't apply a skew join hint to the left frame
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": SkewJoinHint(),
+        },
+        # invalid columns
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns=["b"]),
+        },
+        # invalid index level
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[5]),
+        },
+        # unmatched skew join columns
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[{0: "value1"}, {1: "value2"}]),
+        },
+        # invalid dist_mapjoin shard_count
+        {"how": "right", "on": "a", "right_hint": DistributedMapJoinHint()},
+        # none of the hints can work with an outer join
+        {"how": "outer", "on": ["a", "b"], "left_hint": MapJoinHint()},
+        {
+            "how": "outer",
+            "on": ["a", "b"],
+            "left_hint": DistributedMapJoinHint(shard_count=5),
+        },
+        {"how": "outer", "on": ["a", "b"], "left_hint": SkewJoinHint()},
+    ]
+    for kw in parameters:
+        with pytest.raises(ValueError):
+            mdf1.merge(mdf2, **kw)
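For orientation: the hints exercised above come from the new MapJoinHint, DistributedMapJoinHint and SkewJoinHint classes added in maxframe/dataframe/merge/merge.py (+236 lines). A minimal usage sketch, assuming the classes are importable from maxframe.dataframe.merge.merge the way this test imports them; the frames and skew values are illustrative, not taken from the release:

import maxframe.dataframe as md
from maxframe.dataframe.merge.merge import (
    DistributedMapJoinHint,
    MapJoinHint,
    SkewJoinHint,
)

left = md.DataFrame({"a": [1, 2, 2, 3], "b": list("wxyz")})
right = md.DataFrame({"a": [1, 2, 3], "c": [10, 20, 30]})

# Broadcast the small right frame to every worker (map join).
r1 = left.merge(right, on="a", right_hint=MapJoinHint())

# Split the broadcast table into shards for a larger right frame.
r2 = left.merge(right, on="a", right_hint=DistributedMapJoinHint(shard_count=5))

# Declare which left-side key values are heavily skewed.
r3 = left.merge(right, on="a", left_hint=SkewJoinHint(columns=[{"a": 2}]))

As the ValueError cases show, the hints are validated when the merge is built: a map join cannot be combined with a skew join on the same merge, and none of the hints apply to outer joins.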
maxframe/dataframe/misc/apply.py
CHANGED

@@ -170,6 +170,8 @@ class ApplyOperator(
         elif self.output_types[0] == OutputType.dataframe:
             shape = [np.nan, np.nan]
             shape[1 - self.axis] = df.shape[1 - self.axis]
+            if self.axis == 1:
+                shape[1] = len(dtypes)
             shape = tuple(shape)
         else:
             shape = (df.shape[1 - self.axis],)
@@ -317,6 +319,7 @@ def df_apply(
     skip_infer=False,
     **kwds,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
     """
     Apply a function along an axis of the DataFrame.

@@ -442,20 +445,12 @@ def df_apply(
     B    27
     dtype: int64

-    >>> df.apply(np.sum, axis=1).execute()
+    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
     0    13
     1    13
     2    13
     dtype: int64

-    Returning a list-like will result in a Series
-
-    >>> df.apply(lambda x: [1, 2], axis=1).execute()
-    0    [1, 2]
-    1    [1, 2]
-    2    [1, 2]
-    dtype: object
-
     Passing ``result_type='expand'`` will expand list-like results
     to columns of a Dataframe

@@ -469,7 +464,7 @@ def df_apply(
     ``result_type='expand'``. The resulting column names
     will be the Series index.

-    >>> df.apply(lambda x:
+    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
        foo  bar
     0    1    2
     1    1    2
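The two added lines fix the inferred output shape when ``dtypes`` is supplied for an ``axis=1`` apply; the new test_apply in test_misc.py further down in this diff asserts exactly this, ``shape == (3, 2)``. A standalone sketch of the shape rule; inferred_shape is a hypothetical helper written for illustration, not part of the operator:

import numpy as np
import pandas as pd

def inferred_shape(df_shape, axis, dtypes):
    # Rows carry over from the input; with axis=1 the column count
    # now follows the declared output dtypes instead of staying NaN.
    shape = [np.nan, np.nan]
    shape[1 - axis] = df_shape[1 - axis]
    if axis == 1:
        shape[1] = len(dtypes)
    return tuple(shape)

# A 3-row, single-column input applied row-wise with two output dtypes:
print(inferred_shape((3, 1), 1, pd.Series(["int64", "int64"])))  # (3, 2)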
maxframe/dataframe/misc/describe.py
CHANGED

@@ -15,7 +15,7 @@
 import numpy as np
 import pandas as pd

-from ... import opcodes
+from ... import opcodes
 from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index


 class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ =
+    _op_type_ = opcodes.DESCRIBE

     input = KeyField("input", default=None)
     percentiles = ListField("percentiles", FieldTypes.float64, default=None)
maxframe/dataframe/misc/drop_duplicates.py
CHANGED

@@ -37,16 +37,15 @@ class DataFrameDropDuplicates(DuplicateOperand):
             shape += (3,)
         return shape

-
-    def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
+    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
         params = input_params.copy()
-        if op.ignore_index:
+        if op.ignore_index and self._output_types[0] != OutputType.index:
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
                 input_params["index_value"], op.keep, op.subset, type(op).__name__
             )
-        params["shape"] =
+        params["shape"] = self._get_shape(input_params["shape"], op)
         return params

     def __call__(self, inp, inplace=False):
@@ -105,6 +104,7 @@ def df_drop_duplicates(
 def series_drop_duplicates(
     series, keep="first", inplace=False, ignore_index=False, method="auto"
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
     """
     Return Series with duplicate values removed.

@@ -148,27 +148,6 @@ def series_drop_duplicates(
     5     hippo
     Name: animal, dtype: object

-    With the 'keep' parameter, the selection behaviour of duplicated values
-    can be changed. The value 'first' keeps the first occurrence for each
-    set of duplicated entries. The default value of keep is 'first'.
-
-    >>> s.drop_duplicates().execute()
-    0      lame
-    1       cow
-    3    beetle
-    5     hippo
-    Name: animal, dtype: object
-
-    The value 'last' for parameter 'keep' keeps the last occurrence for
-    each set of duplicated entries.
-
-    >>> s.drop_duplicates(keep='last').execute()
-    1       cow
-    3    beetle
-    4      lame
-    5     hippo
-    Name: animal, dtype: object
-
     The value ``False`` for parameter 'keep' discards all sets of
     duplicated entries. Setting the value of 'inplace' to ``True`` performs
     the operation inplace and returns ``None``.
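The new guard on ``ignore_index`` skips the RangeIndex reset when the operator outputs an Index, which has no positional row index to rebuild. For orientation, the two behaviours being modelled, shown with plain pandas:

import pandas as pd

df = pd.DataFrame({"animal": ["lame", "cow", "lame", "beetle"]})

# ignore_index=False keeps the surviving rows' original labels.
print(df.drop_duplicates().index.tolist())                   # [0, 1, 3]

# ignore_index=True renumbers the result with a fresh RangeIndex.
print(df.drop_duplicates(ignore_index=True).index.tolist())  # [0, 1, 2]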
maxframe/dataframe/misc/eval.py
CHANGED

@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
         if obj_name in self.env:
             self.referenced_vars.add(obj_name)
             return self.env[obj_name]
+        try:
+            return self.target[obj_name]
+        except KeyError:
+            pass
         raise KeyError(f"name {obj_name} is not defined")

     def visit(self, node):
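With the added fallback, a bare name in an eval expression now resolves first against local variables (``self.env``) and then against the target frame's own columns before a KeyError is raised. A sketch of the user-visible effect, assuming DataFrame.eval in maxframe mirrors the pandas API:

import maxframe.dataframe as md

df = md.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# "a" and "b" are not Python variables here; the visitor now finds
# them on the target frame instead of raising KeyError.
c = df.eval("a + b")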
maxframe/dataframe/misc/memory_usage.py
CHANGED

@@ -58,7 +58,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
         """
         if df_or_series.ndim == 1:
             # the input data is a series, a Scalar will be returned
-            return self.new_scalar([df_or_series], dtype=np.dtype(
+            return self.new_scalar([df_or_series], dtype=np.dtype(int))
         else:
             # the input data is a DataFrame, a Series will be returned
             # calculate shape of returning series given ``op.index``
@@ -71,7 +71,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
             [df_or_series],
             index_value=self._adapt_index(df_or_series.columns_value),
             shape=new_shape,
-            dtype=np.dtype(
+            dtype=np.dtype(int),
         )

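The ``dtype`` arguments here are declared as ``np.dtype(int)``, matching the integer byte counts that memory_usage reports. The pandas behaviour the operator mirrors:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

print(df.memory_usage())       # DataFrame input: one integer count per column
print(df["a"].memory_usage())  # Series input: a single integer scalar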
maxframe/dataframe/misc/pct_change.py
CHANGED

@@ -18,6 +18,7 @@ from ..utils import validate_axis
 def pct_change(
     df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
     """
     Percentage change between the current and a prior element.

@@ -50,89 +51,6 @@ def pct_change(
     DataFrame.diff : Compute the difference of two elements in a DataFrame.
     Series.shift : Shift the index by some number of periods.
     DataFrame.shift : Shift the index by some number of periods.
-
-    Examples
-    --------
-    **Series**
-
-    >>> import maxframe.dataframe as md
-
-    >>> s = md.Series([90, 91, 85])
-    >>> s.execute()
-    0    90
-    1    91
-    2    85
-    dtype: int64
-
-    >>> s.pct_change().execute()
-    0         NaN
-    1    0.011111
-    2   -0.065934
-    dtype: float64
-
-    >>> s.pct_change(periods=2).execute()
-    0         NaN
-    1         NaN
-    2   -0.055556
-    dtype: float64
-
-    See the percentage change in a Series where filling NAs with last
-    valid observation forward to next valid.
-
-    >>> s = md.Series([90, 91, None, 85])
-    >>> s.execute()
-    0    90.0
-    1    91.0
-    2     NaN
-    3    85.0
-    dtype: float64
-
-    >>> s.pct_change(fill_method='ffill').execute()
-    0         NaN
-    1    0.011111
-    2    0.000000
-    3   -0.065934
-    dtype: float64
-
-    **DataFrame**
-
-    Percentage change in French franc, Deutsche Mark, and Italian lira from
-    1980-01-01 to 1980-03-01.
-
-    >>> df = md.DataFrame({
-    ...     'FR': [4.0405, 4.0963, 4.3149],
-    ...     'GR': [1.7246, 1.7482, 1.8519],
-    ...     'IT': [804.74, 810.01, 860.13]},
-    ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
-    >>> df.execute()
-                    FR      GR      IT
-    1980-01-01  4.0405  1.7246  804.74
-    1980-02-01  4.0963  1.7482  810.01
-    1980-03-01  4.3149  1.8519  860.13
-
-    >>> df.pct_change().execute()
-                      FR        GR        IT
-    1980-01-01       NaN       NaN       NaN
-    1980-02-01  0.013810  0.013684  0.006549
-    1980-03-01  0.053365  0.059318  0.061876
-
-    Percentage of change in GOOG and APPL stock volume. Shows computing
-    the percentage change between columns.
-
-    >>> df = md.DataFrame({
-    ...     '2016': [1769950, 30586265],
-    ...     '2015': [1500923, 40912316],
-    ...     '2014': [1371819, 41403351]},
-    ...     index=['GOOG', 'APPL'])
-    >>> df.execute()
-              2016      2015      2014
-    GOOG   1769950   1500923   1371819
-    APPL  30586265  40912316  41403351
-
-    >>> df.pct_change(axis='columns').execute()
-          2016      2015      2014
-    GOOG   NaN -0.151997 -0.086016
-    APPL   NaN  0.337604  0.012002
     """

     axis = validate_axis(kwargs.pop("axis", 0))
maxframe/dataframe/misc/tests/test_misc.py
CHANGED

@@ -18,6 +18,7 @@ import pytest

 from .... import opcodes
 from ....core import OutputType
+from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
@@ -430,6 +431,28 @@ def test_case_when():
     assert isinstance(col.inputs[2].op, DataFrameGreater)


+def test_apply():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+
+    keys = [1, 2]
+
+    def f(x, keys):
+        if x["a"] in keys:
+            return [1, 0]
+        else:
+            return [0, 1]
+
+    apply_df = df[["a"]].apply(
+        f,
+        output_type="dataframe",
+        dtypes=pd.Series(["int64", "int64"]),
+        axis=1,
+        result_type="expand",
+        keys=keys,
+    )
+    assert apply_df.shape == (3, 2)
+
+
 def test_pivot_table():
     from ...groupby.aggregation import DataFrameGroupByAgg
     from ...misc.pivot_table import DataFramePivotTable
maxframe/dataframe/misc/transform.py
CHANGED

@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     0  1  2
     1  2  3
     2  3  4
-
-    Even though the resulting DataFrame must have the same length as the
-    input DataFrame, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
@@ -265,6 +250,7 @@ def series_transform(
     dtype=None,
     **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
     """
     Call ``func`` on self producing a Series with transformed values.

@@ -332,21 +318,6 @@ def series_transform(
     0  1  2
     1  2  3
     2  3  4
-
-    Even though the resulting Series must have the same length as the
-    input Series, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
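Both docstring edits drop the list-of-functions examples (``s.transform([mt.sqrt, mt.exp])``), leaving only single-callable usage documented. A minimal sketch of the remaining documented form; the frame and lambda are illustrative:

import maxframe.dataframe as md

df = md.DataFrame({"A": range(3), "B": range(1, 4)})
# A single callable must produce output of the same length as the input.
r = df.transform(lambda x: x + 1)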
maxframe/dataframe/misc/value_counts.py
CHANGED

@@ -85,6 +85,7 @@ def value_counts(
     dropna=True,
     method="auto",
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
     """
     Return a Series containing counts of unique values.

@@ -125,9 +126,8 @@ def value_counts(
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> import
-
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> import numpy as np
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts().execute()
     3.0    2
     4.0    1
@@ -138,7 +138,7 @@ def value_counts(
     With `normalize` set to `True`, returns the relative frequency by
     dividing all values by the sum of values.

-    >>> s = md.Series([3, 1, 2, 3, 4,
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts(normalize=True).execute()
     3.0    0.4
     4.0    0.2
@@ -146,19 +146,6 @@ def value_counts(
     1.0    0.2
     dtype: float64

-    **bins**
-
-    Bins can be useful for going from a continuous variable to a
-    categorical variable; instead of counting unique
-    apparitions of values, divide the index in the specified
-    number of half-open bins.
-
-    >>> s.value_counts(bins=3).execute()
-    (2.0, 3.0]      2
-    (0.996, 2.0]    2
-    (3.0, 4.0]      1
-    dtype: int64
-
     **dropna**

     With `dropna` set to `False` we can also see NaN index values.
maxframe/dataframe/missing/dropna.py
CHANGED

@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
     Empty strings are not considered NA values. ``None`` is considered an
     NA value.

-    >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
+    >>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
     >>> ser.execute()
     0       NaN
     1         2
maxframe/dataframe/missing/fillna.py
CHANGED

@@ -132,11 +132,11 @@ def fillna(
     --------
     >>> import maxframe.tensor as mt
     >>> import maxframe.dataframe as md
-    >>> df = md.DataFrame([[
-
-
-
-
+    >>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
+    ...                    [3, 4, np.nan, 1],
+    ...                    [np.nan, np.nan, np.nan, 5],
+    ...                    [np.nan, 3, np.nan, 4]],
+    ...                    columns=list('ABCD'))
     >>> df.execute()
          A    B   C  D
     0  NaN  2.0 NaN  0
maxframe/dataframe/sort/sort_values.py
CHANGED

@@ -67,6 +67,7 @@ def dataframe_sort_values(
     parallel_kind="PSRS",
     psrs_kinds=None,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
     """
     Sort by the values along either axis.

@@ -152,17 +153,6 @@ def dataframe_sort_values(
     0    A     2     0
     1    A     1     1
     3  NaN     8     4
-
-    Putting NAs first
-
-    >>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
-      col1  col2  col3
-    3  NaN     8     4
-    4    D     7     2
-    5    C     4     3
-    2    B     9     9
-    0    A     2     0
-    1    A     1     1
     """

     if na_position not in ["last", "first"]:  # pragma: no cover
maxframe/dataframe/statistics/corr.py
CHANGED

@@ -43,7 +43,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
     def __call__(self, df_or_series):
         if isinstance(df_or_series, SERIES_TYPE):
             inputs = filter_inputs([df_or_series, self.other])
-            return self.new_scalar(inputs, dtype=np.dtype(
+            return self.new_scalar(inputs, dtype=np.dtype(float))
         else:

             def _filter_numeric(obj):
@@ -60,7 +60,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
             inputs = filter_inputs([df_or_series, self.other])
             if self.axis is None:
                 dtypes = pd.Series(
-                    [np.dtype(
+                    [np.dtype(float)] * len(df_or_series.dtypes),
                     index=df_or_series.dtypes.index,
                 )
                 return self.new_dataframe(
@@ -85,7 +85,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
             return self.new_series(
                 inputs,
                 shape=shape,
-                dtype=np.dtype(
+                dtype=np.dtype(float),
                 index_value=new_index_value,
             )

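All three completed ``dtype`` arguments declare ``np.dtype(float)``, which matches how correlation results behave in pandas regardless of the input dtypes:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
print(df.corr().dtypes)       # float64 for every column
print(df["a"].corr(df["b"]))  # a float scalar, -1.0 for this data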
maxframe/dataframe/statistics/quantile.py
CHANGED

@@ -14,8 +14,9 @@

 import numpy as np
 import pandas as pd
+from pandas.core.dtypes.cast import find_common_type

-from ... import opcodes
+from ... import opcodes
 from ...core import ENTITY_TYPE
 from ...serialization.serializables import (
     AnyField,
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
 from ...tensor.statistics.quantile import quantile as tensor_quantile
 from ..core import DATAFRAME_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_empty_df,
+from ..utils import build_empty_df, parse_index, validate_axis


 class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ =
+    _op_type_ = opcodes.QUANTILE

     input = KeyField("input", default=None)
     q = AnyField("q", default=None)
@@ -259,6 +260,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):


 def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
+    # FIXME: Timedelta not supported. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
     """
     Return values at the given quantile over requested axis.

@@ -309,20 +311,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="line
            a     b
     0.1  1.3   3.7
     0.5  2.5  55.0
-
-    Specifying `numeric_only=False` will also compute the quantile of
-    datetime and timedelta data.
-
-    >>> df = md.DataFrame({'A': [1, 2],
-    ...                    'B': [md.Timestamp('2010'),
-    ...                          md.Timestamp('2011')],
-    ...                    'C': [md.Timedelta('1 days'),
-    ...                          md.Timedelta('2 days')]})
-    >>> df.quantile(0.5, numeric_only=False).execute()
-    A                    1.5
-    B    2010-07-02 12:00:00
-    C        1 days 12:00:00
-    Name: 0.5, dtype: object
     """
     if isinstance(q, ENTITY_TYPE):
         q = astensor(q)
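The new ``find_common_type`` import suggests the quantile result dtype is now derived as a common type across the participating columns, while the FIXME records that timedelta data still fails on the service side. For reference, how that pandas helper behaves (illustrative inputs):

import numpy as np
from pandas.core.dtypes.cast import find_common_type

# The narrowest dtype that can hold values of both inputs.
print(find_common_type([np.dtype("int64"), np.dtype("float32")]))  # float64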