maxframe 0.1.0b5__cp311-cp311-win_amd64.whl → 1.0.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (203)
  1. maxframe/_utils.cp311-win_amd64.pyd +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@ from ..utils import validate_axis
  def pct_change(
  df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
  ):
+ # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
  """
  Percentage change between the current and a prior element.

@@ -50,89 +51,6 @@ def pct_change(
  DataFrame.diff : Compute the difference of two elements in a DataFrame.
  Series.shift : Shift the index by some number of periods.
  DataFrame.shift : Shift the index by some number of periods.
-
- Examples
- --------
- **Series**
-
- >>> import maxframe.dataframe as md
-
- >>> s = md.Series([90, 91, 85])
- >>> s.execute()
- 0 90
- 1 91
- 2 85
- dtype: int64
-
- >>> s.pct_change().execute()
- 0 NaN
- 1 0.011111
- 2 -0.065934
- dtype: float64
-
- >>> s.pct_change(periods=2).execute()
- 0 NaN
- 1 NaN
- 2 -0.055556
- dtype: float64
-
- See the percentage change in a Series where filling NAs with last
- valid observation forward to next valid.
-
- >>> s = md.Series([90, 91, None, 85])
- >>> s.execute()
- 0 90.0
- 1 91.0
- 2 NaN
- 3 85.0
- dtype: float64
-
- >>> s.pct_change(fill_method='ffill').execute()
- 0 NaN
- 1 0.011111
- 2 0.000000
- 3 -0.065934
- dtype: float64
-
- **DataFrame**
-
- Percentage change in French franc, Deutsche Mark, and Italian lira from
- 1980-01-01 to 1980-03-01.
-
- >>> df = md.DataFrame({
- ... 'FR': [4.0405, 4.0963, 4.3149],
- ... 'GR': [1.7246, 1.7482, 1.8519],
- ... 'IT': [804.74, 810.01, 860.13]},
- ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
- >>> df.execute()
- FR GR IT
- 1980-01-01 4.0405 1.7246 804.74
- 1980-02-01 4.0963 1.7482 810.01
- 1980-03-01 4.3149 1.8519 860.13
-
- >>> df.pct_change().execute()
- FR GR IT
- 1980-01-01 NaN NaN NaN
- 1980-02-01 0.013810 0.013684 0.006549
- 1980-03-01 0.053365 0.059318 0.061876
-
- Percentage of change in GOOG and APPL stock volume. Shows computing
- the percentage change between columns.
-
- >>> df = md.DataFrame({
- ... '2016': [1769950, 30586265],
- ... '2015': [1500923, 40912316],
- ... '2014': [1371819, 41403351]},
- ... index=['GOOG', 'APPL'])
- >>> df.execute()
- 2016 2015 2014
- GOOG 1769950 1500923 1371819
- APPL 30586265 40912316 41403351
-
- >>> df.pct_change(axis='columns').execute()
- 2016 2015 2014
- GOOG NaN -0.151997 -0.086016
- APPL NaN 0.337604 0.012002
  """

  axis = validate_axis(kwargs.pop("axis", 0))
@@ -18,6 +18,7 @@ import pytest

  from .... import opcodes
  from ....core import OutputType
+ from ....dataframe import DataFrame
  from ....tensor.core import TENSOR_TYPE
  from ... import eval as maxframe_eval
  from ... import get_dummies, to_numeric
@@ -348,7 +349,9 @@ def test_drop():
  def test_drop_duplicates():
  rs = np.random.RandomState(0)
  raw = pd.DataFrame(
- rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)]
+ rs.randint(1000, size=(20, 7)),
+ columns=["c" + str(i + 1) for i in range(7)],
+ index=pd.Index(range(20), name="idx"),
  )
  raw["c7"] = [f"s{j}" for j in range(20)]

@@ -360,6 +363,12 @@ def test_drop_duplicates():
  with pytest.raises(KeyError):
  df.drop_duplicates(subset="c8")

+ # check index
+ distinct_df = df.drop_duplicates()
+ assert distinct_df.index_value.name == df.index_value.name
+ assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
+ assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
+
  s = df["c7"]
  with pytest.raises(ValueError):
  s.drop_duplicates(method="unknown")
@@ -430,6 +439,28 @@ def test_case_when():
  assert isinstance(col.inputs[2].op, DataFrameGreater)


+ def test_apply():
+ df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+
+ keys = [1, 2]
+
+ def f(x, keys):
+ if x["a"] in keys:
+ return [1, 0]
+ else:
+ return [0, 1]
+
+ apply_df = df[["a"]].apply(
+ f,
+ output_type="dataframe",
+ dtypes=pd.Series(["int64", "int64"]),
+ axis=1,
+ result_type="expand",
+ keys=keys,
+ )
+ assert apply_df.shape == (3, 2)
+
+
  def test_pivot_table():
  from ...groupby.aggregation import DataFrameGroupByAgg
  from ...misc.pivot_table import DataFramePivotTable
@@ -451,7 +482,7 @@ def test_pivot_table():
  with pytest.raises(ValueError):
  df.pivot_table(values=["D", "E"], aggfunc="sum")

- t = df.pivot_table(index="A")
+ t = df.pivot_table(index=["A", "B", "C"])
  assert isinstance(t.op, DataFrameGroupByAgg)
  t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
  assert isinstance(t.op, DataFrameGroupByAgg)
@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
  0 1 2
  1 2 3
  2 3 4
-
- Even though the resulting DataFrame must have the same length as the
- input DataFrame, it is possible to provide several input functions:
-
- >>> s = md.Series(range(3))
- >>> s.execute()
- 0 0
- 1 1
- 2 2
- dtype: int64
- >>> s.transform([mt.sqrt, mt.exp]).execute()
- sqrt exp
- 0 0.000000 1.000000
- 1 1.000000 2.718282
- 2 1.414214 7.389056
  """
  op = TransformOperator(
  func=func,
@@ -265,6 +250,7 @@ def series_transform(
  dtype=None,
  **kwargs
  ):
+ # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
  """
  Call ``func`` on self producing a Series with transformed values.

@@ -332,21 +318,6 @@ def series_transform(
  0 1 2
  1 2 3
  2 3 4
-
- Even though the resulting Series must have the same length as the
- input Series, it is possible to provide several input functions:
-
- >>> s = md.Series(range(3))
- >>> s.execute()
- 0 0
- 1 1
- 2 2
- dtype: int64
- >>> s.transform([mt.sqrt, mt.exp]).execute()
- sqrt exp
- 0 0.000000 1.000000
- 1 1.000000 2.718282
- 2 1.414214 7.389056
  """
  op = TransformOperator(
  func=func,
@@ -85,6 +85,7 @@ def value_counts(
  dropna=True,
  method="auto",
  ):
+ # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
  """
  Return a Series containing counts of unique values.

@@ -125,9 +126,8 @@ def value_counts(
  Examples
  --------
  >>> import maxframe.dataframe as md
- >>> import maxframe.tensor as mt
-
- >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+ >>> import numpy as np
+ >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
  >>> s.value_counts().execute()
  3.0 2
  4.0 1
@@ -138,7 +138,7 @@ def value_counts(
  With `normalize` set to `True`, returns the relative frequency by
  dividing all values by the sum of values.

- >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+ >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
  >>> s.value_counts(normalize=True).execute()
  3.0 0.4
  4.0 0.2
@@ -146,19 +146,6 @@ def value_counts(
  1.0 0.2
  dtype: float64

- **bins**
-
- Bins can be useful for going from a continuous variable to a
- categorical variable; instead of counting unique
- apparitions of values, divide the index in the specified
- number of half-open bins.
-
- >>> s.value_counts(bins=3).execute()
- (2.0, 3.0] 2
- (0.996, 2.0] 2
- (3.0, 4.0] 1
- dtype: int64
-
  **dropna**

  With `dropna` set to `False` we can also see NaN index values.
@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
  Empty strings are not considered NA values. ``None`` is considered an
  NA value.

- >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
+ >>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
  >>> ser.execute()
  0 NaN
  1 2
@@ -132,11 +132,11 @@ def fillna(
  --------
  >>> import maxframe.tensor as mt
  >>> import maxframe.dataframe as md
- >>> df = md.DataFrame([[mt.nan, 2, mt.nan, 0],
- ... [3, 4, mt.nan, 1],
- ... [mt.nan, mt.nan, mt.nan, 5],
- ... [mt.nan, 3, mt.nan, 4]],
- ... columns=list('ABCD'))
+ >>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
+ [3, 4, np.nan, 1],
+ [np.nan, np.nan, np.nan, 5],
+ [np.nan, 3, np.nan, 4]],
+ columns=list('ABCD'))
  >>> df.execute()
  A B C D
  0 NaN 2.0 NaN 0
@@ -16,13 +16,7 @@ import numpy as np
  import pandas as pd

  from ..core import ENTITY_TYPE, OutputType
- from ..core.operator import (
- Fuse,
- FuseChunkMixin,
- Operator,
- ShuffleProxy,
- TileableOperatorMixin,
- )
+ from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
  from ..tensor.core import TENSOR_TYPE
  from ..tensor.datasource import tensor as astensor
  from .core import DATAFRAME_TYPE, SERIES_TYPE
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
  class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
  def __init__(self, sparse=None, output_types=None, **kwargs):
  super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
-
-
- class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
- __slots__ = ()
-
-
- class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
- @property
- def output_types(self):
- return self.outputs[-1].chunk.op.output_types
@@ -552,7 +552,7 @@ class ReductionCompiler:
  @enter_mode(build=True)
  def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
  from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
- from ...tensor.base import TensorWhere
+ from ...tensor.misc import TensorWhere
  from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
  from ..datasource.dataframe import DataFrameDataSource
  from ..datasource.series import SeriesDataSource
@@ -679,8 +679,8 @@ class ReductionCompiler:
  ]
  """
  from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
- from ...tensor.base import TensorWhere
  from ...tensor.datasource import Scalar
+ from ...tensor.misc import TensorWhere
  from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
  from ..datasource.dataframe import DataFrameDataSource
  from ..datasource.series import SeriesDataSource
@@ -23,6 +23,7 @@ import pytest

  from .... import dataframe as md
  from ....tensor import Tensor
+ from ....tests.utils import assert_mf_index_dtype
  from ...core import DataFrame, IndexValue, OutputType, Series
  from ...datasource.dataframe import from_pandas as from_pandas_df
  from ...datasource.series import from_pandas as from_pandas_series
@@ -111,10 +112,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
  reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()

  assert isinstance(reduction_df, Series)
- assert isinstance(
- reduction_df.index_value._index_value,
- (IndexValue.RangeIndex, IndexValue.Int64Index),
- )
+ assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
  assert reduction_df.shape == (10,)

  data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
@@ -67,6 +67,7 @@ def dataframe_sort_values(
  parallel_kind="PSRS",
  psrs_kinds=None,
  ):
+ # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
  """
  Sort by the values along either axis.

@@ -152,17 +153,6 @@ def dataframe_sort_values(
  0 A 2 0
  1 A 1 1
  3 NaN 8 4
-
- Putting NAs first
-
- >>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
- col1 col2 col3
- 3 NaN 8 4
- 4 D 7 2
- 5 C 4 3
- 2 B 9 9
- 0 A 2 0
- 1 A 1 1
  """

  if na_position not in ["last", "first"]: # pragma: no cover
@@ -43,7 +43,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
  def __call__(self, df_or_series):
  if isinstance(df_or_series, SERIES_TYPE):
  inputs = filter_inputs([df_or_series, self.other])
- return self.new_scalar(inputs, dtype=np.dtype(np.float_))
+ return self.new_scalar(inputs, dtype=np.dtype(float))
  else:

  def _filter_numeric(obj):
@@ -60,7 +60,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
  inputs = filter_inputs([df_or_series, self.other])
  if self.axis is None:
  dtypes = pd.Series(
- [np.dtype(np.float_)] * len(df_or_series.dtypes),
+ [np.dtype(float)] * len(df_or_series.dtypes),
  index=df_or_series.dtypes.index,
  )
  return self.new_dataframe(
@@ -85,7 +85,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
  return self.new_series(
  inputs,
  shape=shape,
- dtype=np.dtype(np.float_),
+ dtype=np.dtype(float),
  index_value=new_index_value,
  )
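
Note on the np.dtype(np.float_) → np.dtype(float) change in this hunk: NumPy 2.0 removed the np.float_ alias, and np.dtype(float) resolves to the same 64-bit float dtype on both NumPy 1.x and 2.x, so the swap keeps behaviour identical. A minimal check, purely illustrative and not part of the package:

    import numpy as np

    # np.dtype(float) is float64 everywhere, which is what np.float_ used to alias.
    assert np.dtype(float) == np.dtype("float64")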
@@ -14,8 +14,9 @@

  import numpy as np
  import pandas as pd
+ from pandas.core.dtypes.cast import find_common_type

- from ... import opcodes as OperandDef
+ from ... import opcodes
  from ...core import ENTITY_TYPE
  from ...serialization.serializables import (
  AnyField,
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
  from ...tensor.statistics.quantile import quantile as tensor_quantile
  from ..core import DATAFRAME_TYPE
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
- from ..utils import build_empty_df, find_common_type, parse_index, validate_axis
+ from ..utils import build_empty_df, parse_index, validate_axis


  class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
- _op_type_ = OperandDef.QUANTILE
+ _op_type_ = opcodes.QUANTILE

  input = KeyField("input", default=None)
  q = AnyField("q", default=None)
@@ -80,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
  store_index_value = False
  else:
  q_val = np.asanyarray(self.q)
- pd_index = pd.Index(q_val)
+ if q_val.ndim == 0:
+ pd_index = pd.Index(q_val.reshape(1))
+ else:
+ pd_index = pd.Index(q_val)
  name = self.q if q_val.size == 1 else None
  store_index_value = True
  tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
@@ -163,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
  store_index_value = False
  else:
  q_val = np.asanyarray(self.q)
- index_val = pd.Index(q_val)
+ if q_val.ndim == 0:
+ index_val = pd.Index(q_val.reshape(1))
+ else:
+ index_val = pd.Index(q_val)
  store_index_value = True

  # get dtype by tensor
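
The ndim == 0 guard added here works around the fact that np.asanyarray turns a scalar q into a 0-d array, which pd.Index rejects; reshaping to a length-1 array yields a one-row index instead. A small standalone illustration, assuming a scalar quantile of 0.5 (values for demonstration only):

    import numpy as np
    import pandas as pd

    q_val = np.asanyarray(0.5)             # 0-d array; pd.Index(q_val) would raise ValueError
    pd_index = pd.Index(q_val.reshape(1))  # Index([0.5], dtype='float64')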
@@ -259,6 +266,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):


  def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
+ # FIXME: Timedelta not support. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
  """
  Return values at the given quantile over requested axis.

@@ -309,20 +317,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="line
  a b
  0.1 1.3 3.7
  0.5 2.5 55.0
-
- Specifying `numeric_only=False` will also compute the quantile of
- datetime and timedelta data.
-
- >>> df = md.DataFrame({'A': [1, 2],
- ... 'B': [md.Timestamp('2010'),
- ... md.Timestamp('2011')],
- ... 'C': [md.Timedelta('1 days'),
- ... md.Timedelta('2 days')]})
- >>> df.quantile(0.5, numeric_only=False).execute()
- A 1.5
- B 2010-07-02 12:00:00
- C 1 days 12:00:00
- Name: 0.5, dtype: object
  """
  if isinstance(q, ENTITY_TYPE):
  q = astensor(q)
@@ -49,7 +49,7 @@ def test_dataframe_quantile():

  # q = 0.3, axis = 0
  r = s.quantile(0.3)
- e = raw.quantile(0.3)
+ e = raw.quantile(0.3, numeric_only=True)
  assert isinstance(r, Series)
  assert r.shape == (2,)
  assert r.dtype == e.dtype
@@ -57,7 +57,7 @@ def test_dataframe_quantile():

  # q = 0.3, axis = 1
  r = s.quantile(0.3, axis=1)
- e = raw.quantile(0.3, axis=1)
+ e = raw.quantile(0.3, numeric_only=True, axis=1)
  assert isinstance(r, Series)
  assert r.shape == e.shape
  assert r.dtype == e.dtype
@@ -65,7 +65,7 @@ def test_dataframe_quantile():

  # q = [0.3, 0.7], axis = 0
  r = s.quantile([0.3, 0.7])
- e = raw.quantile([0.3, 0.7])
+ e = raw.quantile([0.3, 0.7], numeric_only=True)
  assert isinstance(r, DataFrame)
  assert r.shape == e.shape
  pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -74,7 +74,7 @@ def test_dataframe_quantile():

  # q = [0.3, 0.7], axis = 1
  r = s.quantile([0.3, 0.7], axis=1)
- e = raw.quantile([0.3, 0.7], axis=1)
+ e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
  assert isinstance(r, DataFrame)
  assert r.shape == e.shape
  pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -13,12 +13,13 @@
  # limitations under the License.

  import pandas as pd
+ import pytest

  from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
- from ..initializer import read_pandas
+ from ..initializer import DataFrame, Series, read_pandas


- def test_from_pandas():
+ def test_read_pandas():
  df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
  assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)

@@ -27,3 +28,33 @@ def test_from_pandas():

  idx_data = pd.Index(["a", "b"])
  assert isinstance(read_pandas(idx_data), INDEX_TYPE)
+
+
+ def test_init_dataframe_from_maxframe_series():
+ s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
+
+ df = DataFrame(s, index=s.index, columns=["col1"])
+
+ assert isinstance(df, DATAFRAME_TYPE)
+ assert df.dtypes.index == ["col1"]
+
+ with pytest.raises(ValueError):
+ DataFrame(s, index=s.index, columns=[])
+
+ with pytest.raises(ValueError):
+ DataFrame(s, index=s.index, columns="col1")
+
+ with pytest.raises(ValueError):
+ DataFrame(s, index=s.index, columns="col2")
+
+
+ def test_init_dataframe_from_maxframe_dataframe():
+ df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
+
+ df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
+
+ assert isinstance(df2, DATAFRAME_TYPE)
+ assert list(df2.dtypes.index) == ["col1", "col2"]
+
+ with pytest.raises(ValueError):
+ DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])
@@ -26,7 +26,6 @@ import numpy as np
  import pandas as pd
  from pandas.api.extensions import ExtensionDtype
  from pandas.api.types import is_string_dtype
- from pandas.core.dtypes.cast import find_common_type
  from pandas.core.dtypes.inference import is_dict_like, is_list_like

  from ..core import Entity, ExecutableTuple
@@ -264,12 +263,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
  return IndexValue(_index_value=_serialize_index(index_value))


- def gen_unknown_index_value(index_value, *args):
+ def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
+ """
+ Generate new index value with the same likes of given index_value and args, but without any value.
+
+ Parameters
+ ----------
+ index_value
+ Given index value.
+ args
+ Arguments for parse_index.
+ normalize_range_index
+ If normalize range index to normal index.
+
+ Returns
+ -------
+ New created range index value.
+ """
  pd_index = index_value.to_pandas()
- if isinstance(pd_index, pd.RangeIndex):
- return parse_index(pd.RangeIndex(-1), *args)
+ if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
+ return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
  elif not isinstance(pd_index, pd.MultiIndex):
- return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
+ return parse_index(
+ pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
+ )
  else:
  i = pd.MultiIndex.from_arrays(
  [c[:0] for c in pd_index.levels], names=pd_index.names
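
To make the new normalize_range_index flag concrete, here is a rough pandas-level sketch of the two non-MultiIndex branches above. The _unknown_like helper is hypothetical and omits the parse_index wrapping used by maxframe:

    import pandas as pd

    def _unknown_like(pd_index: pd.Index, normalize_range_index: bool = False) -> pd.Index:
        # Keep a RangeIndex as an unknown-length RangeIndex unless normalization is requested.
        if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
            return pd.RangeIndex(-1, name=pd_index.name)
        # Otherwise build an empty plain Index that preserves dtype and name.
        return pd.Index([], dtype=pd_index.dtype, name=pd_index.name)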
@@ -477,11 +494,11 @@ def build_df(df_obj, fill_value=1, size=1, ensure_string=False):
  else:
  fill_values = fill_value

- from .core import SERIES_TYPE
+ from .core import INDEX_TYPE, SERIES_TYPE

  dtypes = (
  pd.Series([df_obj.dtype], index=[df_obj.name])
- if isinstance(df_obj, SERIES_TYPE)
+ if isinstance(df_obj, (INDEX_TYPE, SERIES_TYPE))
  else df_obj.dtypes
  )
  for size, fill_value in zip(sizes, fill_values):
@@ -593,7 +610,7 @@ def build_series(
  return ret_series


- def infer_index_value(left_index_value, right_index_value):
+ def infer_index_value(left_index_value, right_index_value, level=None):
  from .core import IndexValue

  if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance(
@@ -616,9 +633,7 @@ def infer_index_value(left_index_value, right_index_value):

  left_index = left_index_value.to_pandas()
  right_index = right_index_value.to_pandas()
- out_index = pd.Index(
- [], dtype=find_common_type([left_index.dtype, right_index.dtype])
- )
+ out_index = left_index.join(right_index, level=level)[:0]
  return parse_index(out_index, left_index_value, right_index_value)
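
The join(...)[:0] replacement above delegates dtype inference (and level handling for MultiIndex joins) to pandas itself, then slices the result down to an empty index of the inferred type. A pandas-only illustration with made-up values:

    import pandas as pd

    left = pd.Index([1, 2, 3])
    right = pd.Index([2.0, 3.0, 4.0])
    out = left.join(right)[:0]      # empty Index carrying the common dtype
    assert out.dtype == "float64"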
@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
  from .core import Window

  _window_has_method = pd_release_version >= (1, 3, 0)
+ _window_has_center = pd_release_version < (2, 0, 0)


  class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
  def params(self):
  p = OrderedDict()

+ args = ["min_periods", "center", "axis", "method"]
  if not _window_has_method: # pragma: no cover
- args = ["min_periods", "center", "axis"]
- else:
- args = ["min_periods", "center", "axis", "method"]
+ args = [a for a in args if a != "method"]
+ if not _window_has_center:
+ args = [a for a in args if a != "center"]

  for k in args:
  p[k] = getattr(self, k)
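
Context for the _window_has_center gate: pandas 2.0 dropped the long-deprecated center argument of expanding(), so the attribute can only be forwarded when running against pandas < 2.0. A hedged sketch of the same version check (variable names mirror the hunk but are illustrative):

    import pandas as pd

    pd_release_version = tuple(int(p) for p in pd.__version__.split(".")[:2])
    # `center` was removed from expanding() in pandas 2.0, so only keep it on older versions.
    _window_has_center = pd_release_version < (2, 0)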