maxframe 0.1.0b5__cp37-cp37m-win32.whl → 1.0.0rc2__cp37-cp37m-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (92)
  1. maxframe/_utils.cp37-win32.pyd +0 -0
  2. maxframe/codegen.py +6 -2
  3. maxframe/config/config.py +38 -2
  4. maxframe/config/validators.py +1 -0
  5. maxframe/conftest.py +2 -0
  6. maxframe/core/__init__.py +0 -3
  7. maxframe/core/entity/__init__.py +1 -8
  8. maxframe/core/entity/objects.py +3 -45
  9. maxframe/core/graph/core.cp37-win32.pyd +0 -0
  10. maxframe/core/graph/core.pyx +4 -4
  11. maxframe/dataframe/__init__.py +1 -1
  12. maxframe/dataframe/arithmetic/around.py +5 -17
  13. maxframe/dataframe/arithmetic/core.py +15 -7
  14. maxframe/dataframe/arithmetic/docstring.py +5 -55
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  16. maxframe/dataframe/core.py +5 -5
  17. maxframe/dataframe/datasource/date_range.py +2 -2
  18. maxframe/dataframe/datasource/read_odps_query.py +6 -0
  19. maxframe/dataframe/datasource/read_odps_table.py +2 -1
  20. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  21. maxframe/dataframe/datastore/tests/__init__.py +13 -0
  22. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  23. maxframe/dataframe/datastore/to_odps.py +21 -0
  24. maxframe/dataframe/groupby/cum.py +0 -1
  25. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  26. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  27. maxframe/dataframe/indexing/align.py +1 -1
  28. maxframe/dataframe/indexing/rename.py +3 -37
  29. maxframe/dataframe/indexing/sample.py +0 -1
  30. maxframe/dataframe/indexing/set_index.py +68 -1
  31. maxframe/dataframe/merge/merge.py +236 -2
  32. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  33. maxframe/dataframe/misc/apply.py +5 -10
  34. maxframe/dataframe/misc/case_when.py +1 -1
  35. maxframe/dataframe/misc/describe.py +2 -2
  36. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  37. maxframe/dataframe/misc/eval.py +4 -0
  38. maxframe/dataframe/misc/memory_usage.py +2 -2
  39. maxframe/dataframe/misc/pct_change.py +1 -83
  40. maxframe/dataframe/misc/tests/test_misc.py +23 -0
  41. maxframe/dataframe/misc/transform.py +1 -30
  42. maxframe/dataframe/misc/value_counts.py +4 -17
  43. maxframe/dataframe/missing/dropna.py +1 -1
  44. maxframe/dataframe/missing/fillna.py +5 -5
  45. maxframe/dataframe/sort/sort_values.py +1 -11
  46. maxframe/dataframe/statistics/corr.py +3 -3
  47. maxframe/dataframe/statistics/quantile.py +5 -17
  48. maxframe/dataframe/utils.py +4 -7
  49. maxframe/errors.py +13 -0
  50. maxframe/extension.py +12 -0
  51. maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
  52. maxframe/learn/contrib/xgboost/predict.py +2 -2
  53. maxframe/learn/contrib/xgboost/train.py +2 -2
  54. maxframe/lib/mmh3.cp37-win32.pyd +0 -0
  55. maxframe/lib/mmh3.pyi +43 -0
  56. maxframe/lib/wrapped_pickle.py +2 -1
  57. maxframe/odpsio/__init__.py +1 -1
  58. maxframe/odpsio/arrow.py +8 -4
  59. maxframe/odpsio/schema.py +10 -7
  60. maxframe/odpsio/tableio.py +388 -14
  61. maxframe/odpsio/tests/test_schema.py +16 -15
  62. maxframe/odpsio/tests/test_tableio.py +48 -21
  63. maxframe/protocol.py +148 -12
  64. maxframe/serialization/core.cp37-win32.pyd +0 -0
  65. maxframe/serialization/core.pxd +3 -0
  66. maxframe/serialization/core.pyi +3 -0
  67. maxframe/serialization/core.pyx +54 -25
  68. maxframe/serialization/exception.py +1 -1
  69. maxframe/serialization/pandas.py +7 -2
  70. maxframe/serialization/serializables/core.py +158 -12
  71. maxframe/serialization/serializables/tests/test_serializable.py +46 -4
  72. maxframe/tensor/__init__.py +59 -0
  73. maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
  74. maxframe/tensor/base/atleast_1d.py +1 -1
  75. maxframe/tensor/base/unique.py +3 -3
  76. maxframe/tensor/reduction/count_nonzero.py +1 -1
  77. maxframe/tensor/statistics/quantile.py +2 -2
  78. maxframe/tests/test_protocol.py +34 -0
  79. maxframe/tests/test_utils.py +0 -12
  80. maxframe/tests/utils.py +11 -2
  81. maxframe/utils.py +24 -13
  82. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
  83. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
  84. maxframe_client/__init__.py +0 -1
  85. maxframe_client/fetcher.py +38 -27
  86. maxframe_client/session/odps.py +50 -10
  87. maxframe_client/session/task.py +41 -20
  88. maxframe_client/tests/test_fetcher.py +21 -3
  89. maxframe_client/tests/test_session.py +49 -2
  90. maxframe_client/clients/spe.py +0 -104
  91. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +0 -0
  92. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ import pytest
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
+from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
 
 
 def test_merge():
@@ -30,14 +31,39 @@ def test_merge():
     mdf1 = from_pandas(df1, chunk_size=2)
     mdf2 = from_pandas(df2, chunk_size=3)
 
+    mapjoin = MapJoinHint()
+    dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
+    skew_join1 = SkewJoinHint()
+    skew_join2 = SkewJoinHint(columns=[0])
+    skew_join3 = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])
+    skew_join4 = SkewJoinHint(columns=[{"a": 4, "b": "test"}, {"a": 5, "b": "hello"}])
+
     parameters = [
         {},
         {"how": "left", "right_on": "x", "left_index": True},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": mapjoin,
+            "right_hint": mapjoin,
+        },
         {"how": "right", "left_on": "a", "right_index": True},
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": mapjoin,
+            "right_hint": dist_mapjoin1,
+        },
         {"how": "left", "left_on": "a", "right_on": "x"},
+        {"how": "left", "left_on": "a", "right_on": "x", "left_hint": skew_join1},
         {"how": "right", "left_on": "a", "right_index": True},
+        {"how": "right", "left_on": "a", "right_index": True, "right_hint": skew_join2},
         {"how": "right", "on": "a"},
+        {"how": "right", "on": "a", "right_hint": skew_join3},
         {"how": "inner", "on": ["a", "b"]},
+        {"how": "inner", "on": ["a", "b"], "left_hint": skew_join4},
     ]
 
     for kw in parameters:
@@ -213,3 +239,100 @@ def test_concat():
     mdf2 = from_pandas(df2, chunk_size=3)
     r = concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
+
+
+def test_invalid_join_hint():
+    df1 = pd.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    )
+    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
+
+    mdf1 = from_pandas(df1, chunk_size=2)
+    mdf2 = from_pandas(df2, chunk_size=3)
+
+    # type error
+    parameters = [
+        {"how": "left", "right_on": "x", "left_index": True, "left_hint": [1]},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": {"key": "value"},
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=2),
+        },
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns="a"),
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=["0", []]),
+        },
+    ]
+
+    for kw in parameters:
+        print(kw)
+        with pytest.raises(TypeError):
+            mdf1.merge(mdf2, **kw)
+
+    # value error
+    parameters = [
+        # mapjoin can't work together with skew join
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": MapJoinHint(),
+            "right_hint": SkewJoinHint(),
+        },
+        # a right join can't apply skew join to the left frame
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": SkewJoinHint(),
+        },
+        # invalid columns
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns=["b"]),
+        },
+        # invalid index level
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[5]),
+        },
+        # unmatched skew join columns
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[{0: "value1"}, {1: "value2"}]),
+        },
+        # invalid dist_mapjoin shard_count
+        {"how": "right", "on": "a", "right_hint": DistributedMapJoinHint()},
+        # none of the hints can work with an outer join
+        {"how": "outer", "on": ["a", "b"], "left_hint": MapJoinHint()},
+        {
+            "how": "outer",
+            "on": ["a", "b"],
+            "left_hint": DistributedMapJoinHint(shard_count=5),
+        },
+        {"how": "outer", "on": ["a", "b"], "left_hint": SkewJoinHint()},
+    ]
+    for kw in parameters:
+        with pytest.raises(ValueError):
+            mdf1.merge(mdf2, **kw)
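Together these tests pin down the new join-hint API surface in 1.0.0rc2. As a rough usage sketch (not part of the diff; the hint classes and their parameters come from the tests above, while the data and the optimizer behavior described in the comments are illustrative assumptions):

    import maxframe.dataframe as md
    from maxframe.dataframe.merge import (
        DistributedMapJoinHint,
        MapJoinHint,
        SkewJoinHint,
    )

    left = md.DataFrame({"a": list(range(8)), "b": list(range(8))})
    right = md.DataFrame({"a": list(range(8)), "x": list(range(8))})

    # Ask the engine to treat the right side as the small, broadcastable table.
    r1 = left.merge(right, on="a", right_hint=MapJoinHint())

    # Same idea, but with the broadcast side split into shards.
    r2 = left.merge(right, on="a", right_hint=DistributedMapJoinHint(shard_count=5))

    # Flag a hot key value ("a" == 4) so skewed rows can get special handling.
    r3 = left.merge(right, on="a", left_hint=SkewJoinHint(columns=[{"a": 4}]))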
@@ -170,6 +170,8 @@ class ApplyOperator(
         elif self.output_types[0] == OutputType.dataframe:
             shape = [np.nan, np.nan]
             shape[1 - self.axis] = df.shape[1 - self.axis]
+            if self.axis == 1:
+                shape[1] = len(dtypes)
             shape = tuple(shape)
         else:
             shape = (df.shape[1 - self.axis],)
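The two added lines tighten shape inference for axis=1 applies: the row count is already known from the input, and the column count can now be taken from the user-supplied dtypes instead of staying unknown. A minimal sketch of the effect (illustrative; it mirrors the new test_apply added later in this diff):

    import pandas as pd
    import maxframe.dataframe as md

    df = md.DataFrame({"a": [1, 2, 3]})
    r = df.apply(
        lambda row: [0, 1],
        axis=1,
        output_type="dataframe",
        dtypes=pd.Series(["int64", "int64"]),
        result_type="expand",
    )
    # Previously the column count stayed np.nan; with the fix it is len(dtypes).
    assert r.shape == (3, 2)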
@@ -317,6 +319,7 @@ def df_apply(
     skip_infer=False,
     **kwds,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
     """
     Apply a function along an axis of the DataFrame.
 
@@ -442,20 +445,12 @@ def df_apply(
     B    27
     dtype: int64
 
-    >>> df.apply(np.sum, axis=1).execute()
+    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
     0    13
     1    13
     2    13
     dtype: int64
 
-    Returning a list-like will result in a Series
-
-    >>> df.apply(lambda x: [1, 2], axis=1).execute()
-    0    [1, 2]
-    1    [1, 2]
-    2    [1, 2]
-    dtype: object
-
     Passing ``result_type='expand'`` will expand list-like results
     to columns of a Dataframe
@@ -469,7 +464,7 @@ def df_apply(
     ``result_type='expand'``. The resulting column names
     will be the Series index.
 
-    >>> df.apply(lambda x: md.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
+    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
        foo  bar
     0    1    2
     1    1    2
@@ -99,7 +99,7 @@ def case_when(series, caselist):
     >>> b = md.Series([0, 3, 4, 5])
 
     >>> c.case_when(caselist=[(a.gt(0), a),  # condition, replacement
-    ...                       (b.gt(0), b)])
+    ...                       (b.gt(0), b)]).execute()
     0    6
     1    3
     2    1
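Appending .execute() here is more than a cosmetic doctest fix: MaxFrame expressions are lazy, so without it the example would display an unevaluated object rather than the data shown below it. A minimal sketch of the model (illustrative):

    import maxframe.dataframe as md

    c = md.Series([6, 7, 8, 9])
    r = c.case_when(caselist=[(c.gt(7), c)])
    # r is a deferred MaxFrame Series; nothing has been computed yet.
    # Only .execute() submits the expression and returns concrete values.
    print(r.execute())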
@@ -15,7 +15,7 @@
 import numpy as np
 import pandas as pd
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
 
 
 class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.DESCRIBE
+    _op_type_ = opcodes.DESCRIBE
 
     input = KeyField("input", default=None)
     percentiles = ListField("percentiles", FieldTypes.float64, default=None)
@@ -37,16 +37,15 @@ class DataFrameDropDuplicates(DuplicateOperand):
             shape += (3,)
         return shape
 
-    @classmethod
-    def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
+    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
         params = input_params.copy()
-        if op.ignore_index:
+        if op.ignore_index and self._output_types[0] != OutputType.index:
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
                 input_params["index_value"], op.keep, op.subset, type(op).__name__
             )
-        params["shape"] = cls._get_shape(input_params["shape"], op)
+        params["shape"] = self._get_shape(input_params["shape"], op)
         return params
 
     def __call__(self, inp, inplace=False):
@@ -105,6 +104,7 @@ def df_drop_duplicates(
 def series_drop_duplicates(
     series, keep="first", inplace=False, ignore_index=False, method="auto"
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
     """
     Return Series with duplicate values removed.
 
@@ -148,27 +148,6 @@ def series_drop_duplicates(
     5     hippo
     Name: animal, dtype: object
 
-    With the 'keep' parameter, the selection behaviour of duplicated values
-    can be changed. The value 'first' keeps the first occurrence for each
-    set of duplicated entries. The default value of keep is 'first'.
-
-    >>> s.drop_duplicates().execute()
-    0      lama
-    1       cow
-    3    beetle
-    5     hippo
-    Name: animal, dtype: object
-
-    The value 'last' for parameter 'keep' keeps the last occurrence for
-    each set of duplicated entries.
-
-    >>> s.drop_duplicates(keep='last').execute()
-    1       cow
-    3    beetle
-    4      lama
-    5     hippo
-    Name: animal, dtype: object
-
     The value ``False`` for parameter 'keep' discards all sets of
     duplicated entries. Setting the value of 'inplace' to ``True`` performs
     the operation inplace and returns ``None``.
@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
         if obj_name in self.env:
             self.referenced_vars.add(obj_name)
             return self.env[obj_name]
+        try:
+            return self.target[obj_name]
+        except KeyError:
+            pass
         raise KeyError(f"name {obj_name} is not defined")
 
     def visit(self, node):
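The new try/except gives eval a second name-resolution scope: a bare identifier is looked up first in the caller-provided environment and, failing that, in the target frame itself, so column names can be referenced directly. Roughly (illustrative; assuming the usual DataFrame.eval entry point):

    import maxframe.dataframe as md

    df = md.DataFrame({"a": [1, 2], "b": [3, 4]})
    # "a" and "b" are not Python variables; with the fallback they now
    # resolve to df's columns instead of raising KeyError.
    r = df.eval("a + b")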
@@ -58,7 +58,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
         """
         if df_or_series.ndim == 1:
             # the input data is a series, a Scalar will be returned
-            return self.new_scalar([df_or_series], dtype=np.dtype(np.int_))
+            return self.new_scalar([df_or_series], dtype=np.dtype(int))
         else:
             # the input data is a DataFrame, a Series will be returned
             # calculate shape of returning series given ``op.index``
@@ -71,7 +71,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
             [df_or_series],
             index_value=self._adapt_index(df_or_series.columns_value),
             shape=new_shape,
-            dtype=np.dtype(np.int_),
+            dtype=np.dtype(int),
         )
 
 
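This and the matching corr.py hunks below swap the np.int_ / np.float_ aliases for builtin-based dtype spellings, presumably for NumPy 2.0 compatibility (np.float_ was removed there); both spellings resolve to the same dtypes:

    import numpy as np

    print(np.dtype(int))    # platform default integer, e.g. int64 (int32 on Windows)
    print(np.dtype(float))  # float64
    # np.float_ no longer exists on NumPy >= 2.0, while np.dtype(float)
    # behaves identically on both major versions.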
@@ -18,6 +18,7 @@ from ..utils import validate_axis
 def pct_change(
     df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
     """
     Percentage change between the current and a prior element.
 
@@ -50,89 +51,6 @@ def pct_change(
     DataFrame.diff : Compute the difference of two elements in a DataFrame.
     Series.shift : Shift the index by some number of periods.
     DataFrame.shift : Shift the index by some number of periods.
-
-    Examples
-    --------
-    **Series**
-
-    >>> import maxframe.dataframe as md
-
-    >>> s = md.Series([90, 91, 85])
-    >>> s.execute()
-    0    90
-    1    91
-    2    85
-    dtype: int64
-
-    >>> s.pct_change().execute()
-    0         NaN
-    1    0.011111
-    2   -0.065934
-    dtype: float64
-
-    >>> s.pct_change(periods=2).execute()
-    0         NaN
-    1         NaN
-    2   -0.055556
-    dtype: float64
-
-    See the percentage change in a Series where filling NAs with last
-    valid observation forward to next valid.
-
-    >>> s = md.Series([90, 91, None, 85])
-    >>> s.execute()
-    0    90.0
-    1    91.0
-    2     NaN
-    3    85.0
-    dtype: float64
-
-    >>> s.pct_change(fill_method='ffill').execute()
-    0         NaN
-    1    0.011111
-    2    0.000000
-    3   -0.065934
-    dtype: float64
-
-    **DataFrame**
-
-    Percentage change in French franc, Deutsche Mark, and Italian lira from
-    1980-01-01 to 1980-03-01.
-
-    >>> df = md.DataFrame({
-    ...     'FR': [4.0405, 4.0963, 4.3149],
-    ...     'GR': [1.7246, 1.7482, 1.8519],
-    ...     'IT': [804.74, 810.01, 860.13]},
-    ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
-    >>> df.execute()
-                    FR      GR      IT
-    1980-01-01  4.0405  1.7246  804.74
-    1980-02-01  4.0963  1.7482  810.01
-    1980-03-01  4.3149  1.8519  860.13
-
-    >>> df.pct_change().execute()
-                      FR        GR        IT
-    1980-01-01       NaN       NaN       NaN
-    1980-02-01  0.013810  0.013684  0.006549
-    1980-03-01  0.053365  0.059318  0.061876
-
-    Percentage of change in GOOG and APPL stock volume. Shows computing
-    the percentage change between columns.
-
-    >>> df = md.DataFrame({
-    ...     '2016': [1769950, 30586265],
-    ...     '2015': [1500923, 40912316],
-    ...     '2014': [1371819, 41403351]},
-    ...     index=['GOOG', 'APPL'])
-    >>> df.execute()
-              2016      2015      2014
-    GOOG   1769950   1500923   1371819
-    APPL  30586265  40912316  41403351
-
-    >>> df.pct_change(axis='columns').execute()
-          2016      2015      2014
-    GOOG   NaN -0.151997 -0.086016
-    APPL   NaN  0.337604  0.012002
     """
 
     axis = validate_axis(kwargs.pop("axis", 0))
@@ -18,6 +18,7 @@ import pytest
 
 from .... import opcodes
 from ....core import OutputType
+from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
@@ -430,6 +431,28 @@ def test_case_when():
     assert isinstance(col.inputs[2].op, DataFrameGreater)
 
 
+def test_apply():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+
+    keys = [1, 2]
+
+    def f(x, keys):
+        if x["a"] in keys:
+            return [1, 0]
+        else:
+            return [0, 1]
+
+    apply_df = df[["a"]].apply(
+        f,
+        output_type="dataframe",
+        dtypes=pd.Series(["int64", "int64"]),
+        axis=1,
+        result_type="expand",
+        keys=keys,
+    )
+    assert apply_df.shape == (3, 2)
+
+
 def test_pivot_table():
     from ...groupby.aggregation import DataFrameGroupByAgg
     from ...misc.pivot_table import DataFramePivotTable
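Besides the shape assertion, the new test exercises keyword forwarding: keys=keys is not an apply parameter, so it is passed through to f. The same contract holds in plain pandas, for comparison:

    import pandas as pd

    pdf = pd.DataFrame({"a": [1, 2, 3]})
    out = pdf.apply(
        lambda x, keys: [1, 0] if x["a"] in keys else [0, 1],
        axis=1,
        result_type="expand",
        keys=[1, 2],  # forwarded to the lambda, not consumed by apply
    )
    print(out.shape)  # (3, 2)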
@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     0  1  2
     1  2  3
     2  3  4
-
-    Even though the resulting DataFrame must have the same length as the
-    input DataFrame, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt       exp
-    0  0.000000  1.000000
-    1  1.000000  2.718282
-    2  1.414214  7.389056
     """
     op = TransformOperator(
         func=func,
@@ -265,6 +250,7 @@ def series_transform(
     dtype=None,
     **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
     """
     Call ``func`` on self producing a Series with transformed values.
 
@@ -332,21 +318,6 @@ def series_transform(
     0  1  2
     1  2  3
     2  3  4
-
-    Even though the resulting Series must have the same length as the
-    input Series, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt       exp
-    0  0.000000  1.000000
-    1  1.000000  2.718282
-    2  1.414214  7.389056
     """
     op = TransformOperator(
         func=func,
@@ -85,6 +85,7 @@ def value_counts(
     dropna=True,
     method="auto",
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
     """
     Return a Series containing counts of unique values.
 
@@ -125,9 +126,8 @@ def value_counts(
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> import maxframe.tensor as mt
-
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> import numpy as np
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts().execute()
     3.0    2
     4.0    1
@@ -138,7 +138,7 @@ def value_counts(
     With `normalize` set to `True`, returns the relative frequency by
     dividing all values by the sum of values.
 
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts(normalize=True).execute()
     3.0    0.4
     4.0    0.2
@@ -146,19 +146,6 @@ def value_counts(
     1.0    0.2
     dtype: float64
 
-    **bins**
-
-    Bins can be useful for going from a continuous variable to a
-    categorical variable; instead of counting unique
-    apparitions of values, divide the index in the specified
-    number of half-open bins.
-
-    >>> s.value_counts(bins=3).execute()
-    (2.0, 3.0]      2
-    (0.996, 2.0]    2
-    (3.0, 4.0]      1
-    dtype: int64
-
     **dropna**
 
     With `dropna` set to `False` we can also see NaN index values.
@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
     Empty strings are not considered NA values. ``None`` is considered an
     NA value.
 
-    >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
+    >>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
     >>> ser.execute()
     0       NaN
     1         2
@@ -132,11 +132,11 @@ def fillna(
     --------
     >>> import maxframe.tensor as mt
     >>> import maxframe.dataframe as md
-    >>> df = md.DataFrame([[mt.nan, 2, mt.nan, 0],
-    ...                    [3, 4, mt.nan, 1],
-    ...                    [mt.nan, mt.nan, mt.nan, 5],
-    ...                    [mt.nan, 3, mt.nan, 4]],
-    ...                    columns=list('ABCD'))
+    >>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
+    ...                    [3, 4, np.nan, 1],
+    ...                    [np.nan, np.nan, np.nan, 5],
+    ...                    [np.nan, 3, np.nan, 4]],
+    ...                    columns=list('ABCD'))
     >>> df.execute()
          A    B   C  D
     0  NaN  2.0 NaN  0
@@ -67,6 +67,7 @@ def dataframe_sort_values(
     parallel_kind="PSRS",
     psrs_kinds=None,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
     """
     Sort by the values along either axis.
 
@@ -152,17 +153,6 @@ def dataframe_sort_values(
     0    A     2     0
     1    A     1     1
     3  NaN     8     4
-
-    Putting NAs first
-
-    >>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
-      col1  col2  col3
-    3  NaN     8     4
-    4    D     7     2
-    5    C     4     3
-    2    B     9     9
-    0    A     2     0
-    1    A     1     1
     """
 
     if na_position not in ["last", "first"]:  # pragma: no cover
@@ -43,7 +43,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
     def __call__(self, df_or_series):
         if isinstance(df_or_series, SERIES_TYPE):
             inputs = filter_inputs([df_or_series, self.other])
-            return self.new_scalar(inputs, dtype=np.dtype(np.float_))
+            return self.new_scalar(inputs, dtype=np.dtype(float))
         else:
 
             def _filter_numeric(obj):
@@ -60,7 +60,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
             inputs = filter_inputs([df_or_series, self.other])
             if self.axis is None:
                 dtypes = pd.Series(
-                    [np.dtype(np.float_)] * len(df_or_series.dtypes),
+                    [np.dtype(float)] * len(df_or_series.dtypes),
                     index=df_or_series.dtypes.index,
                 )
                 return self.new_dataframe(
@@ -85,7 +85,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
             return self.new_series(
                 inputs,
                 shape=shape,
-                dtype=np.dtype(np.float_),
+                dtype=np.dtype(float),
                 index_value=new_index_value,
             )
 
@@ -14,8 +14,9 @@
 
 import numpy as np
 import pandas as pd
+from pandas.core.dtypes.cast import find_common_type
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...core import ENTITY_TYPE
 from ...serialization.serializables import (
     AnyField,
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
 from ...tensor.statistics.quantile import quantile as tensor_quantile
 from ..core import DATAFRAME_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_empty_df, find_common_type, parse_index, validate_axis
+from ..utils import build_empty_df, parse_index, validate_axis
 
 
 class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.QUANTILE
+    _op_type_ = opcodes.QUANTILE
 
     input = KeyField("input", default=None)
     q = AnyField("q", default=None)
@@ -259,6 +260,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):
259
260
 
260
261
 
261
262
  def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
263
+ # FIXME: Timedelta not support. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
262
264
  """
263
265
  Return values at the given quantile over requested axis.
264
266
 
@@ -309,20 +311,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="line
309
311
  a b
310
312
  0.1 1.3 3.7
311
313
  0.5 2.5 55.0
312
-
313
- Specifying `numeric_only=False` will also compute the quantile of
314
- datetime and timedelta data.
315
-
316
- >>> df = md.DataFrame({'A': [1, 2],
317
- ... 'B': [md.Timestamp('2010'),
318
- ... md.Timestamp('2011')],
319
- ... 'C': [md.Timedelta('1 days'),
320
- ... md.Timedelta('2 days')]})
321
- >>> df.quantile(0.5, numeric_only=False).execute()
322
- A 1.5
323
- B 2010-07-02 12:00:00
324
- C 1 days 12:00:00
325
- Name: 0.5, dtype: object
326
314
  """
327
315
  if isinstance(q, ENTITY_TYPE):
328
316
  q = astensor(q)