maxframe 0.1.0b4__cp37-cp37m-win_amd64.whl → 1.0.0__cp37-cp37m-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of maxframe might be problematic.

Files changed (214)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp37-win_amd64.pyd +0 -0
  3. maxframe/codegen.py +56 -5
  4. maxframe/config/config.py +78 -10
  5. maxframe/config/validators.py +42 -11
  6. maxframe/conftest.py +58 -14
  7. maxframe/core/__init__.py +2 -16
  8. maxframe/core/entity/__init__.py +1 -12
  9. maxframe/core/entity/executable.py +1 -1
  10. maxframe/core/entity/objects.py +46 -45
  11. maxframe/core/entity/output_types.py +0 -3
  12. maxframe/core/entity/tests/test_objects.py +43 -0
  13. maxframe/core/entity/tileables.py +5 -78
  14. maxframe/core/graph/__init__.py +2 -2
  15. maxframe/core/graph/builder/__init__.py +0 -1
  16. maxframe/core/graph/builder/base.py +5 -4
  17. maxframe/core/graph/builder/tileable.py +4 -4
  18. maxframe/core/graph/builder/utils.py +4 -8
  19. maxframe/core/graph/core.cp37-win_amd64.pyd +0 -0
  20. maxframe/core/graph/core.pyx +4 -4
  21. maxframe/core/graph/entity.py +9 -33
  22. maxframe/core/operator/__init__.py +2 -9
  23. maxframe/core/operator/base.py +3 -5
  24. maxframe/core/operator/objects.py +0 -9
  25. maxframe/core/operator/utils.py +55 -0
  26. maxframe/dataframe/__init__.py +2 -1
  27. maxframe/dataframe/arithmetic/around.py +5 -17
  28. maxframe/dataframe/arithmetic/core.py +15 -7
  29. maxframe/dataframe/arithmetic/docstring.py +7 -33
  30. maxframe/dataframe/arithmetic/equal.py +4 -2
  31. maxframe/dataframe/arithmetic/greater.py +4 -2
  32. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  33. maxframe/dataframe/arithmetic/less.py +2 -2
  34. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  36. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  37. maxframe/dataframe/core.py +58 -12
  38. maxframe/dataframe/datasource/date_range.py +2 -2
  39. maxframe/dataframe/datasource/read_odps_query.py +120 -24
  40. maxframe/dataframe/datasource/read_odps_table.py +9 -4
  41. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  42. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  43. maxframe/dataframe/datastore/to_odps.py +28 -0
  44. maxframe/dataframe/extensions/__init__.py +5 -0
  45. maxframe/dataframe/extensions/flatjson.py +131 -0
  46. maxframe/dataframe/extensions/flatmap.py +317 -0
  47. maxframe/dataframe/extensions/reshuffle.py +1 -1
  48. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  49. maxframe/dataframe/groupby/core.py +1 -1
  50. maxframe/dataframe/groupby/cum.py +0 -1
  51. maxframe/dataframe/groupby/fill.py +4 -1
  52. maxframe/dataframe/groupby/getitem.py +6 -0
  53. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  54. maxframe/dataframe/groupby/transform.py +5 -1
  55. maxframe/dataframe/indexing/align.py +1 -1
  56. maxframe/dataframe/indexing/loc.py +6 -4
  57. maxframe/dataframe/indexing/rename.py +5 -28
  58. maxframe/dataframe/indexing/sample.py +0 -1
  59. maxframe/dataframe/indexing/set_index.py +68 -1
  60. maxframe/dataframe/initializer.py +11 -1
  61. maxframe/dataframe/merge/__init__.py +9 -1
  62. maxframe/dataframe/merge/concat.py +41 -31
  63. maxframe/dataframe/merge/merge.py +237 -3
  64. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  65. maxframe/dataframe/misc/__init__.py +4 -0
  66. maxframe/dataframe/misc/apply.py +6 -11
  67. maxframe/dataframe/misc/case_when.py +141 -0
  68. maxframe/dataframe/misc/describe.py +2 -2
  69. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  70. maxframe/dataframe/misc/eval.py +4 -0
  71. maxframe/dataframe/misc/memory_usage.py +2 -2
  72. maxframe/dataframe/misc/pct_change.py +1 -83
  73. maxframe/dataframe/misc/pivot_table.py +262 -0
  74. maxframe/dataframe/misc/tests/test_misc.py +93 -1
  75. maxframe/dataframe/misc/transform.py +1 -30
  76. maxframe/dataframe/misc/value_counts.py +4 -17
  77. maxframe/dataframe/missing/dropna.py +1 -1
  78. maxframe/dataframe/missing/fillna.py +5 -5
  79. maxframe/dataframe/operators.py +1 -17
  80. maxframe/dataframe/plotting/core.py +2 -2
  81. maxframe/dataframe/reduction/core.py +4 -3
  82. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  83. maxframe/dataframe/sort/sort_values.py +1 -11
  84. maxframe/dataframe/statistics/corr.py +3 -3
  85. maxframe/dataframe/statistics/quantile.py +13 -19
  86. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  87. maxframe/dataframe/tests/test_initializer.py +33 -2
  88. maxframe/dataframe/utils.py +33 -11
  89. maxframe/dataframe/window/expanding.py +5 -3
  90. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  91. maxframe/errors.py +13 -0
  92. maxframe/extension.py +12 -0
  93. maxframe/io/__init__.py +13 -0
  94. maxframe/io/objects/__init__.py +24 -0
  95. maxframe/io/objects/core.py +140 -0
  96. maxframe/io/objects/tensor.py +76 -0
  97. maxframe/io/objects/tests/__init__.py +13 -0
  98. maxframe/io/objects/tests/test_object_io.py +97 -0
  99. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  100. maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
  101. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  102. maxframe/io/odpsio/tableio.py +719 -0
  103. maxframe/io/odpsio/tests/__init__.py +13 -0
  104. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
  105. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  106. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  107. maxframe/io/odpsio/volumeio.py +63 -0
  108. maxframe/learn/contrib/__init__.py +3 -1
  109. maxframe/learn/contrib/graph/__init__.py +15 -0
  110. maxframe/learn/contrib/graph/connected_components.py +215 -0
  111. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  112. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  113. maxframe/learn/contrib/llm/__init__.py +16 -0
  114. maxframe/learn/contrib/llm/core.py +54 -0
  115. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  116. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  117. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  118. maxframe/learn/contrib/llm/text.py +42 -0
  119. maxframe/learn/contrib/utils.py +52 -0
  120. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  121. maxframe/learn/contrib/xgboost/classifier.py +110 -0
  122. maxframe/learn/contrib/xgboost/core.py +241 -0
  123. maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
  124. maxframe/learn/contrib/xgboost/predict.py +121 -0
  125. maxframe/learn/contrib/xgboost/regressor.py +71 -0
  126. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  127. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  128. maxframe/learn/contrib/xgboost/train.py +132 -0
  129. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  130. maxframe/learn/utils/__init__.py +15 -0
  131. maxframe/learn/utils/core.py +29 -0
  132. maxframe/lib/mmh3.cp37-win_amd64.pyd +0 -0
  133. maxframe/lib/mmh3.pyi +43 -0
  134. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  135. maxframe/lib/wrapped_pickle.py +2 -1
  136. maxframe/opcodes.py +11 -0
  137. maxframe/protocol.py +154 -27
  138. maxframe/remote/core.py +4 -8
  139. maxframe/serialization/__init__.py +1 -0
  140. maxframe/serialization/core.cp37-win_amd64.pyd +0 -0
  141. maxframe/serialization/core.pxd +3 -0
  142. maxframe/serialization/core.pyi +64 -0
  143. maxframe/serialization/core.pyx +67 -26
  144. maxframe/serialization/exception.py +1 -1
  145. maxframe/serialization/pandas.py +52 -17
  146. maxframe/serialization/serializables/core.py +180 -15
  147. maxframe/serialization/serializables/field_type.py +4 -1
  148. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  149. maxframe/serialization/tests/test_serial.py +2 -1
  150. maxframe/session.py +37 -2
  151. maxframe/tensor/__init__.py +81 -2
  152. maxframe/tensor/arithmetic/isclose.py +1 -0
  153. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  154. maxframe/tensor/core.py +5 -136
  155. maxframe/tensor/datasource/array.py +7 -2
  156. maxframe/tensor/datasource/full.py +1 -1
  157. maxframe/tensor/datasource/scalar.py +1 -1
  158. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  159. maxframe/tensor/indexing/flatnonzero.py +1 -1
  160. maxframe/tensor/indexing/getitem.py +2 -0
  161. maxframe/tensor/merge/__init__.py +2 -0
  162. maxframe/tensor/merge/concatenate.py +101 -0
  163. maxframe/tensor/merge/tests/test_merge.py +30 -1
  164. maxframe/tensor/merge/vstack.py +74 -0
  165. maxframe/tensor/{base → misc}/__init__.py +4 -0
  166. maxframe/tensor/misc/atleast_1d.py +72 -0
  167. maxframe/tensor/misc/atleast_2d.py +70 -0
  168. maxframe/tensor/misc/atleast_3d.py +85 -0
  169. maxframe/tensor/misc/tests/__init__.py +13 -0
  170. maxframe/tensor/{base → misc}/transpose.py +22 -18
  171. maxframe/tensor/misc/unique.py +205 -0
  172. maxframe/tensor/operators.py +1 -7
  173. maxframe/tensor/random/core.py +1 -1
  174. maxframe/tensor/reduction/count_nonzero.py +2 -1
  175. maxframe/tensor/reduction/mean.py +1 -0
  176. maxframe/tensor/reduction/nanmean.py +1 -0
  177. maxframe/tensor/reduction/nanvar.py +2 -0
  178. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  179. maxframe/tensor/reduction/var.py +2 -0
  180. maxframe/tensor/statistics/quantile.py +2 -2
  181. maxframe/tensor/utils.py +2 -22
  182. maxframe/tests/test_protocol.py +34 -0
  183. maxframe/tests/test_utils.py +0 -12
  184. maxframe/tests/utils.py +17 -2
  185. maxframe/typing_.py +4 -1
  186. maxframe/udf.py +62 -3
  187. maxframe/utils.py +112 -86
  188. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +4 -4
  189. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
  190. maxframe_client/__init__.py +0 -1
  191. maxframe_client/clients/framedriver.py +4 -1
  192. maxframe_client/fetcher.py +123 -54
  193. maxframe_client/session/consts.py +3 -0
  194. maxframe_client/session/graph.py +8 -2
  195. maxframe_client/session/odps.py +223 -40
  196. maxframe_client/session/task.py +108 -80
  197. maxframe_client/tests/test_fetcher.py +21 -3
  198. maxframe_client/tests/test_session.py +136 -8
  199. maxframe/core/entity/chunks.py +0 -68
  200. maxframe/core/entity/fuse.py +0 -73
  201. maxframe/core/graph/builder/chunk.py +0 -430
  202. maxframe/odpsio/tableio.py +0 -300
  203. maxframe/odpsio/volumeio.py +0 -95
  204. maxframe_client/clients/spe.py +0 -104
  205. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  206. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  207. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  208. /maxframe/tensor/{base → misc}/astype.py +0 -0
  209. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  210. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  211. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  212. /maxframe/tensor/{base → misc}/where.py +0 -0
  213. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +0 -0
  214. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -18,9 +18,11 @@ import pytest
 
 from .... import opcodes
 from ....core import OutputType
+from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
+from ...arithmetic import DataFrameGreater, DataFrameLess
 from ...core import CATEGORICAL_TYPE, DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
 from ...datasource.dataframe import from_pandas as from_pandas_df
 from ...datasource.index import from_pandas as from_pandas_index
@@ -347,7 +349,9 @@ def test_drop():
 def test_drop_duplicates():
     rs = np.random.RandomState(0)
     raw = pd.DataFrame(
-        rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)]
+        rs.randint(1000, size=(20, 7)),
+        columns=["c" + str(i + 1) for i in range(7)],
+        index=pd.Index(range(20), name="idx"),
     )
     raw["c7"] = [f"s{j}" for j in range(20)]
 
@@ -359,6 +363,12 @@ def test_drop_duplicates():
     with pytest.raises(KeyError):
         df.drop_duplicates(subset="c8")
 
+    # check index
+    distinct_df = df.drop_duplicates()
+    assert distinct_df.index_value.name == df.index_value.name
+    assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
+    assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
+
     s = df["c7"]
     with pytest.raises(ValueError):
         s.drop_duplicates(method="unknown")
@@ -405,3 +415,85 @@ def test_to_numeric():
 
     with pytest.raises(ValueError):
         _ = to_numeric([])
+
+
+def test_case_when():
+    rs = np.random.RandomState(0)
+    raw = pd.DataFrame(
+        rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)]
+    )
+    df = from_pandas_df(raw, chunk_size=8)
+
+    with pytest.raises(TypeError):
+        df.c1.case_when(df.c2)
+    with pytest.raises(ValueError):
+        df.c1.case_when([])
+    with pytest.raises(TypeError):
+        df.c1.case_when([[]])
+    with pytest.raises(ValueError):
+        df.c1.case_when([()])
+
+    col = df.c1.case_when([(df.c2 < 10, 10), (df.c2 > 20, df.c3)])
+    assert len(col.inputs) == 4
+    assert isinstance(col.inputs[1].op, DataFrameLess)
+    assert isinstance(col.inputs[2].op, DataFrameGreater)
+
+
+def test_apply():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+
+    keys = [1, 2]
+
+    def f(x, keys):
+        if x["a"] in keys:
+            return [1, 0]
+        else:
+            return [0, 1]
+
+    apply_df = df[["a"]].apply(
+        f,
+        output_type="dataframe",
+        dtypes=pd.Series(["int64", "int64"]),
+        axis=1,
+        result_type="expand",
+        keys=keys,
+    )
+    assert apply_df.shape == (3, 2)
+
+
+def test_pivot_table():
+    from ...groupby.aggregation import DataFrameGroupByAgg
+    from ...misc.pivot_table import DataFramePivotTable
+
+    raw = pd.DataFrame(
+        {
+            "A": "foo foo foo foo foo bar bar bar bar".split(),
+            "B": "one one one two two one one two two".split(),
+            "C": "small large large small small large small small large".split(),
+            "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+            "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+        }
+    )
+    df = from_pandas_df(raw, chunk_size=8)
+    with pytest.raises(ValueError):
+        df.pivot_table(index=123)
+    with pytest.raises(ValueError):
+        df.pivot_table(index=["F"])
+    with pytest.raises(ValueError):
+        df.pivot_table(values=["D", "E"], aggfunc="sum")
+
+    t = df.pivot_table(index=["A", "B", "C"])
+    assert isinstance(t.op, DataFrameGroupByAgg)
+    t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
+    assert isinstance(t.op, DataFrameGroupByAgg)
+
+    t = df.pivot_table(index=["A", "B"], values=["D", "E"], aggfunc="sum", margins=True)
+    assert isinstance(t.op, DataFramePivotTable)
+
+    t = df.pivot_table(index="A", columns=["B", "C"], aggfunc="sum")
+    assert isinstance(t.op, DataFramePivotTable)
+    assert t.shape == (np.nan, np.nan)
+
+    t = df.pivot_table(index=["A", "B"], columns="C", aggfunc="sum")
+    assert isinstance(t.op, DataFramePivotTable)
+    assert t.shape == (np.nan, np.nan)
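
The new tests above exercise Series.case_when and DataFrame.pivot_table, which follow the pandas signatures. A minimal local pandas sketch of the same calls (illustrative only, not part of this diff; Series.case_when requires pandas >= 2.2):

import pandas as pd

df = pd.DataFrame({"c1": [5, 15, 25], "c2": [1, 30, 12], "c3": [7, 8, 9]})
# case_when applies the caselist in order; rows matching no condition keep c1's values
replaced = df["c1"].case_when([(df["c2"] < 10, 10), (df["c2"] > 20, df["c3"])])

raw = pd.DataFrame(
    {
        "A": "foo foo bar bar".split(),
        "C": "small large small large".split(),
        "D": [1, 2, 3, 4],
    }
)
# pivot_table aggregates D over the A x C cross product
table = raw.pivot_table(index="A", columns="C", values="D", aggfunc="sum")
print(replaced)
print(table)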
@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):
     0  1  2
     1  2  3
     2  3  4
-
-    Even though the resulting DataFrame must have the same length as the
-    input DataFrame, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
@@ -265,6 +250,7 @@ def series_transform(
     dtype=None,
     **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
     """
     Call ``func`` on self producing a Series with transformed values.
 
@@ -332,21 +318,6 @@ def series_transform(
     0  1  2
     1  2  3
     2  3  4
-
-    Even though the resulting Series must have the same length as the
-    input Series, it is possible to provide several input functions:
-
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
@@ -85,6 +85,7 @@ def value_counts(
     dropna=True,
     method="auto",
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
     """
     Return a Series containing counts of unique values.
 
@@ -125,9 +126,8 @@ def value_counts(
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> import maxframe.tensor as mt
-
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> import numpy as np
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts().execute()
     3.0    2
     4.0    1
@@ -138,7 +138,7 @@ def value_counts(
     With `normalize` set to `True`, returns the relative frequency by
     dividing all values by the sum of values.
 
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts(normalize=True).execute()
     3.0    0.4
     4.0    0.2
@@ -146,19 +146,6 @@ def value_counts(
     1.0    0.2
     dtype: float64
 
-    **bins**
-
-    Bins can be useful for going from a continuous variable to a
-    categorical variable; instead of counting unique
-    apparitions of values, divide the index in the specified
-    number of half-open bins.
-
-    >>> s.value_counts(bins=3).execute()
-    (2.0, 3.0]      2
-    (0.996, 2.0]    2
-    (3.0, 4.0]      1
-    dtype: int64
-
     **dropna**
 
     With `dropna` set to `False` we can also see NaN index values.
@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
     Empty strings are not considered NA values. ``None`` is considered an
     NA value.
 
-    >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
+    >>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
     >>> ser.execute()
     0      NaN
     1        2
@@ -132,11 +132,11 @@ def fillna(
     Examples
     --------
     >>> import maxframe.tensor as mt
     >>> import maxframe.dataframe as md
-    >>> df = md.DataFrame([[mt.nan, 2, mt.nan, 0],
-    ...                    [3, 4, mt.nan, 1],
-    ...                    [mt.nan, mt.nan, mt.nan, 5],
-    ...                    [mt.nan, 3, mt.nan, 4]],
-    ...                   columns=list('ABCD'))
+    >>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
+                           [3, 4, np.nan, 1],
+                           [np.nan, np.nan, np.nan, 5],
+                           [np.nan, 3, np.nan, 4]],
+                          columns=list('ABCD'))
     >>> df.execute()
          A    B   C  D
     0  NaN  2.0 NaN  0
@@ -16,13 +16,7 @@ import numpy as np
 import pandas as pd
 
 from ..core import ENTITY_TYPE, OutputType
-from ..core.operator import (
-    Fuse,
-    FuseChunkMixin,
-    Operator,
-    ShuffleProxy,
-    TileableOperatorMixin,
-)
+from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
 from ..tensor.core import TENSOR_TYPE
 from ..tensor.datasource import tensor as astensor
 from .core import DATAFRAME_TYPE, SERIES_TYPE
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
 class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
     def __init__(self, sparse=None, output_types=None, **kwargs):
         super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
-
-
-class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
-    __slots__ = ()
-
-
-class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
-    @property
-    def output_types(self):
-        return self.outputs[-1].chunk.op.output_types
@@ -17,7 +17,7 @@ from collections import OrderedDict
 import pandas as pd
 
 from ...core import ENTITY_TYPE, ExecutableTuple
-from ...utils import adapt_docstring
+from ...utils import adapt_docstring, get_item_if_scalar
 
 
 class PlotAccessor:
@@ -34,7 +34,7 @@ class PlotAccessor:
             .fetch(session=session)
         )
         for p, v in zip(to_executes, executed):
-            result[p] = v
+            result[p] = get_item_if_scalar(v)
 
         data = result.pop("__object__")
         pd_kwargs = kwargs.copy()
@@ -30,7 +30,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...typing_ import TileableType
-from ...utils import pd_release_version, tokenize
+from ...utils import get_item_if_scalar, pd_release_version, tokenize
 from ..operators import DATAFRAME_TYPE, DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
@@ -552,7 +552,7 @@ class ReductionCompiler:
     @enter_mode(build=True)
     def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
         from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
-        from ...tensor.base import TensorWhere
+        from ...tensor.misc import TensorWhere
         from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
         from ..datasource.dataframe import DataFrameDataSource
         from ..datasource.series import SeriesDataSource
@@ -679,8 +679,8 @@ class ReductionCompiler:
         ]
         """
         from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
-        from ...tensor.base import TensorWhere
         from ...tensor.datasource import Scalar
+        from ...tensor.misc import TensorWhere
         from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
         from ..datasource.dataframe import DataFrameDataSource
         from ..datasource.series import SeriesDataSource
@@ -715,6 +715,7 @@ class ReductionCompiler:
         keys_to_vars = {inp.key: local_key_to_var[inp.key] for inp in t.inputs}
 
         def _interpret_var(v):
+            v = get_item_if_scalar(v)
             # get representation for variables
             if hasattr(v, "key"):
                 return keys_to_vars[v.key]
@@ -23,6 +23,7 @@ import pytest
 
 from .... import dataframe as md
 from ....tensor import Tensor
+from ....tests.utils import assert_mf_index_dtype
 from ...core import DataFrame, IndexValue, OutputType, Series
 from ...datasource.dataframe import from_pandas as from_pandas_df
 from ...datasource.series import from_pandas as from_pandas_series
@@ -111,10 +112,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
     reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()
 
     assert isinstance(reduction_df, Series)
-    assert isinstance(
-        reduction_df.index_value._index_value,
-        (IndexValue.RangeIndex, IndexValue.Int64Index),
-    )
+    assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
     assert reduction_df.shape == (10,)
 
     data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
@@ -67,6 +67,7 @@ def dataframe_sort_values(
     parallel_kind="PSRS",
     psrs_kinds=None,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
     """
     Sort by the values along either axis.
 
@@ -152,17 +153,6 @@ def dataframe_sort_values(
     0    A     2     0
     1    A     1     1
     3  NaN     8     4
-
-    Putting NAs first
-
-    >>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
-      col1  col2  col3
-    3  NaN     8     4
-    4    D     7     2
-    5    C     4     3
-    2    B     9     9
-    0    A     2     0
-    1    A     1     1
     """
 
     if na_position not in ["last", "first"]:  # pragma: no cover
@@ -43,7 +43,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
     def __call__(self, df_or_series):
         if isinstance(df_or_series, SERIES_TYPE):
             inputs = filter_inputs([df_or_series, self.other])
-            return self.new_scalar(inputs, dtype=np.dtype(np.float_))
+            return self.new_scalar(inputs, dtype=np.dtype(float))
         else:
 
             def _filter_numeric(obj):
@@ -60,7 +60,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
             inputs = filter_inputs([df_or_series, self.other])
             if self.axis is None:
                 dtypes = pd.Series(
-                    [np.dtype(np.float_)] * len(df_or_series.dtypes),
+                    [np.dtype(float)] * len(df_or_series.dtypes),
                     index=df_or_series.dtypes.index,
                 )
                 return self.new_dataframe(
@@ -85,7 +85,7 @@ class DataFrameCorr(DataFrameOperator, DataFrameOperatorMixin):
             return self.new_series(
                 inputs,
                 shape=shape,
-                dtype=np.dtype(np.float_),
+                dtype=np.dtype(float),
                 index_value=new_index_value,
             )
 
@@ -14,8 +14,9 @@
 
 import numpy as np
 import pandas as pd
+from pandas.core.dtypes.cast import find_common_type
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...core import ENTITY_TYPE
 from ...serialization.serializables import (
     AnyField,
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
 from ...tensor.statistics.quantile import quantile as tensor_quantile
 from ..core import DATAFRAME_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_empty_df, find_common_type, parse_index, validate_axis
+from ..utils import build_empty_df, parse_index, validate_axis
 
 
 class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.QUANTILE
+    _op_type_ = opcodes.QUANTILE
 
     input = KeyField("input", default=None)
     q = AnyField("q", default=None)
@@ -80,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            pd_index = pd.Index(q_val)
+            if q_val.ndim == 0:
+                pd_index = pd.Index(q_val.reshape(1))
+            else:
+                pd_index = pd.Index(q_val)
             name = self.q if q_val.size == 1 else None
             store_index_value = True
         tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
@@ -163,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            index_val = pd.Index(q_val)
+            if q_val.ndim == 0:
+                index_val = pd.Index(q_val.reshape(1))
+            else:
+                index_val = pd.Index(q_val)
             store_index_value = True
 
         # get dtype by tensor
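
Why the ndim guard above matters: wrapping a scalar q in np.asanyarray yields a 0-d array, which pd.Index cannot consume directly, so it is reshaped to a length-1 array first. A quick standalone check (illustrative, not from this diff):

import numpy as np
import pandas as pd

q_val = np.asanyarray(0.5)          # scalar quantile -> 0-d array
assert q_val.ndim == 0
idx = pd.Index(q_val.reshape(1))    # Index([0.5], dtype='float64')

q_vals = np.asanyarray([0.3, 0.7])  # list of quantiles -> 1-d array, usable as-is
idx_multi = pd.Index(q_vals)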
@@ -259,6 +266,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):
 
 
 def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
+    # FIXME: Timedelta not support. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
     """
     Return values at the given quantile over requested axis.
 
@@ -309,20 +317,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
            a     b
     0.1  1.3   3.7
     0.5  2.5  55.0
-
-    Specifying `numeric_only=False` will also compute the quantile of
-    datetime and timedelta data.
-
-    >>> df = md.DataFrame({'A': [1, 2],
-    ...                    'B': [md.Timestamp('2010'),
-    ...                          md.Timestamp('2011')],
-    ...                    'C': [md.Timedelta('1 days'),
-    ...                          md.Timedelta('2 days')]})
-    >>> df.quantile(0.5, numeric_only=False).execute()
-    A                        1.5
-    B        2010-07-02 12:00:00
-    C            1 days 12:00:00
-    Name: 0.5, dtype: object
     """
     if isinstance(q, ENTITY_TYPE):
         q = astensor(q)
@@ -49,7 +49,7 @@ def test_dataframe_quantile():
 
     # q = 0.3, axis = 0
     r = s.quantile(0.3)
-    e = raw.quantile(0.3)
+    e = raw.quantile(0.3, numeric_only=True)
     assert isinstance(r, Series)
     assert r.shape == (2,)
     assert r.dtype == e.dtype
@@ -57,7 +57,7 @@ def test_dataframe_quantile():
 
     # q = 0.3, axis = 1
     r = s.quantile(0.3, axis=1)
-    e = raw.quantile(0.3, axis=1)
+    e = raw.quantile(0.3, numeric_only=True, axis=1)
     assert isinstance(r, Series)
     assert r.shape == e.shape
     assert r.dtype == e.dtype
@@ -65,7 +65,7 @@ def test_dataframe_quantile():
 
     # q = [0.3, 0.7], axis = 0
     r = s.quantile([0.3, 0.7])
-    e = raw.quantile([0.3, 0.7])
+    e = raw.quantile([0.3, 0.7], numeric_only=True)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -74,7 +74,7 @@ def test_dataframe_quantile():
 
     # q = [0.3, 0.7], axis = 1
     r = s.quantile([0.3, 0.7], axis=1)
-    e = raw.quantile([0.3, 0.7], axis=1)
+    e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -13,12 +13,13 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import pandas as pd
16
+ import pytest
16
17
 
17
18
  from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
18
- from ..initializer import read_pandas
19
+ from ..initializer import DataFrame, Series, read_pandas
19
20
 
20
21
 
21
- def test_from_pandas():
22
+ def test_read_pandas():
22
23
  df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
23
24
  assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
24
25
 
@@ -27,3 +28,33 @@ def test_from_pandas():
27
28
 
28
29
  idx_data = pd.Index(["a", "b"])
29
30
  assert isinstance(read_pandas(idx_data), INDEX_TYPE)
31
+
32
+
33
+ def test_init_dataframe_from_maxframe_series():
34
+ s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
35
+
36
+ df = DataFrame(s, index=s.index, columns=["col1"])
37
+
38
+ assert isinstance(df, DATAFRAME_TYPE)
39
+ assert df.dtypes.index == ["col1"]
40
+
41
+ with pytest.raises(ValueError):
42
+ DataFrame(s, index=s.index, columns=[])
43
+
44
+ with pytest.raises(ValueError):
45
+ DataFrame(s, index=s.index, columns="col1")
46
+
47
+ with pytest.raises(ValueError):
48
+ DataFrame(s, index=s.index, columns="col2")
49
+
50
+
51
+ def test_init_dataframe_from_maxframe_dataframe():
52
+ df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
53
+
54
+ df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
55
+
56
+ assert isinstance(df2, DATAFRAME_TYPE)
57
+ assert list(df2.dtypes.index) == ["col1", "col2"]
58
+
59
+ with pytest.raises(ValueError):
60
+ DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])
@@ -26,7 +26,6 @@ import numpy as np
26
26
  import pandas as pd
27
27
  from pandas.api.extensions import ExtensionDtype
28
28
  from pandas.api.types import is_string_dtype
29
- from pandas.core.dtypes.cast import find_common_type
30
29
  from pandas.core.dtypes.inference import is_dict_like, is_list_like
31
30
 
32
31
  from ..core import Entity, ExecutableTuple
@@ -264,12 +263,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
264
263
  return IndexValue(_index_value=_serialize_index(index_value))
265
264
 
266
265
 
267
- def gen_unknown_index_value(index_value, *args):
266
+ def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
267
+ """
268
+ Generate new index value with the same likes of given index_value and args, but without any value.
269
+
270
+ Parameters
271
+ ----------
272
+ index_value
273
+ Given index value.
274
+ args
275
+ Arguments for parse_index.
276
+ normalize_range_index
277
+ If normalize range index to normal index.
278
+
279
+ Returns
280
+ -------
281
+ New created range index value.
282
+ """
268
283
  pd_index = index_value.to_pandas()
269
- if isinstance(pd_index, pd.RangeIndex):
270
- return parse_index(pd.RangeIndex(-1), *args)
284
+ if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
285
+ return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
271
286
  elif not isinstance(pd_index, pd.MultiIndex):
272
- return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
287
+ return parse_index(
288
+ pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
289
+ )
273
290
  else:
274
291
  i = pd.MultiIndex.from_arrays(
275
292
  [c[:0] for c in pd_index.levels], names=pd_index.names
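
For reference, a small standalone illustration (not part of this diff) of the two placeholder branches above, showing that both empty indexes now carry the original name:

import pandas as pd

named = pd.Index([1, 2, 3], name="idx")

range_placeholder = pd.RangeIndex(-1, name=named.name)                 # empty RangeIndex, name kept
plain_placeholder = pd.Index([], dtype=named.dtype, name=named.name)   # empty int64 Index, name kept

assert len(range_placeholder) == 0 and range_placeholder.name == "idx"
assert len(plain_placeholder) == 0 and plain_placeholder.name == "idx"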
@@ -477,11 +494,11 @@ def build_df(df_obj, fill_value=1, size=1, ensure_string=False):
     else:
         fill_values = fill_value
 
-    from .core import SERIES_TYPE
+    from .core import INDEX_TYPE, SERIES_TYPE
 
     dtypes = (
         pd.Series([df_obj.dtype], index=[df_obj.name])
-        if isinstance(df_obj, SERIES_TYPE)
+        if isinstance(df_obj, (INDEX_TYPE, SERIES_TYPE))
         else df_obj.dtypes
     )
     for size, fill_value in zip(sizes, fill_values):
@@ -593,7 +610,7 @@ def build_series(
     return ret_series
 
 
-def infer_index_value(left_index_value, right_index_value):
+def infer_index_value(left_index_value, right_index_value, level=None):
    from .core import IndexValue
 
     if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance(
@@ -616,9 +633,7 @@ def infer_index_value(left_index_value, right_index_value):
 
     left_index = left_index_value.to_pandas()
     right_index = right_index_value.to_pandas()
-    out_index = pd.Index(
-        [], dtype=find_common_type([left_index.dtype, right_index.dtype])
-    )
+    out_index = left_index.join(right_index, level=level)[:0]
     return parse_index(out_index, left_index_value, right_index_value)
 
 
@@ -1136,6 +1151,13 @@ def concat_on_columns(objs: List) -> Any:
     return result
 
 
+def apply_if_callable(maybe_callable, obj, **kwargs):
+    if callable(maybe_callable):
+        return maybe_callable(obj, **kwargs)
+
+    return maybe_callable
+
+
 def patch_sa_engine_execute():
     """
     pandas did not resolve compatibility issue of sqlalchemy 2.0, the issue
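
The helper added above mirrors pandas' internal apply_if_callable contract: callables are evaluated against the given object, anything else passes through unchanged. A short usage sketch (illustrative, not from this diff):

import pandas as pd

def apply_if_callable(maybe_callable, obj, **kwargs):
    # same body as the function introduced in the hunk above
    if callable(maybe_callable):
        return maybe_callable(obj, **kwargs)
    return maybe_callable

df = pd.DataFrame({"a": [1, 2, 3]})
mask = apply_if_callable(lambda d: d["a"] > 1, df)  # evaluated -> boolean Series
label = apply_if_callable("a", df)                  # returned unchanged -> "a"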
@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
 from .core import Window
 
 _window_has_method = pd_release_version >= (1, 3, 0)
+_window_has_center = pd_release_version < (2, 0, 0)
 
 
 class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
     def params(self):
         p = OrderedDict()
 
+        args = ["min_periods", "center", "axis", "method"]
         if not _window_has_method:  # pragma: no cover
-            args = ["min_periods", "center", "axis"]
-        else:
-            args = ["min_periods", "center", "axis", "method"]
+            args = [a for a in args if a != "method"]
+        if not _window_has_center:
+            args = [a for a in args if a != "center"]
 
         for k in args:
             p[k] = getattr(self, k)
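
The two _window_has_* gates above assume pd_release_version is the installed pandas version as an integer tuple; a rough sketch of deriving such a tuple (an assumption for illustration, not the package's actual implementation):

import pandas as pd

# e.g. "2.1.4" -> (2, 1, 4); pre-release suffixes are simply dropped here
pd_release_version = tuple(
    int(part) for part in pd.__version__.split(".")[:3] if part.isdigit()
)

_window_has_method = pd_release_version >= (1, 3, 0)  # Expanding(..., method=...) available
_window_has_center = pd_release_version < (2, 0, 0)   # "center" no longer forwarded on pandas >= 2.0, per the gate above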