maxframe 0.1.0b4__cp39-cp39-win32.whl → 1.0.0__cp39-cp39-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (214) hide show
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp39-win32.pyd +0 -0
  3. maxframe/codegen.py +56 -5
  4. maxframe/config/config.py +78 -10
  5. maxframe/config/validators.py +42 -11
  6. maxframe/conftest.py +58 -14
  7. maxframe/core/__init__.py +2 -16
  8. maxframe/core/entity/__init__.py +1 -12
  9. maxframe/core/entity/executable.py +1 -1
  10. maxframe/core/entity/objects.py +46 -45
  11. maxframe/core/entity/output_types.py +0 -3
  12. maxframe/core/entity/tests/test_objects.py +43 -0
  13. maxframe/core/entity/tileables.py +5 -78
  14. maxframe/core/graph/__init__.py +2 -2
  15. maxframe/core/graph/builder/__init__.py +0 -1
  16. maxframe/core/graph/builder/base.py +5 -4
  17. maxframe/core/graph/builder/tileable.py +4 -4
  18. maxframe/core/graph/builder/utils.py +4 -8
  19. maxframe/core/graph/core.cp39-win32.pyd +0 -0
  20. maxframe/core/graph/core.pyx +4 -4
  21. maxframe/core/graph/entity.py +9 -33
  22. maxframe/core/operator/__init__.py +2 -9
  23. maxframe/core/operator/base.py +3 -5
  24. maxframe/core/operator/objects.py +0 -9
  25. maxframe/core/operator/utils.py +55 -0
  26. maxframe/dataframe/__init__.py +2 -1
  27. maxframe/dataframe/arithmetic/around.py +5 -17
  28. maxframe/dataframe/arithmetic/core.py +15 -7
  29. maxframe/dataframe/arithmetic/docstring.py +7 -33
  30. maxframe/dataframe/arithmetic/equal.py +4 -2
  31. maxframe/dataframe/arithmetic/greater.py +4 -2
  32. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  33. maxframe/dataframe/arithmetic/less.py +2 -2
  34. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  36. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  37. maxframe/dataframe/core.py +58 -12
  38. maxframe/dataframe/datasource/date_range.py +2 -2
  39. maxframe/dataframe/datasource/read_odps_query.py +120 -24
  40. maxframe/dataframe/datasource/read_odps_table.py +9 -4
  41. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  42. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  43. maxframe/dataframe/datastore/to_odps.py +28 -0
  44. maxframe/dataframe/extensions/__init__.py +5 -0
  45. maxframe/dataframe/extensions/flatjson.py +131 -0
  46. maxframe/dataframe/extensions/flatmap.py +317 -0
  47. maxframe/dataframe/extensions/reshuffle.py +1 -1
  48. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  49. maxframe/dataframe/groupby/core.py +1 -1
  50. maxframe/dataframe/groupby/cum.py +0 -1
  51. maxframe/dataframe/groupby/fill.py +4 -1
  52. maxframe/dataframe/groupby/getitem.py +6 -0
  53. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  54. maxframe/dataframe/groupby/transform.py +5 -1
  55. maxframe/dataframe/indexing/align.py +1 -1
  56. maxframe/dataframe/indexing/loc.py +6 -4
  57. maxframe/dataframe/indexing/rename.py +5 -28
  58. maxframe/dataframe/indexing/sample.py +0 -1
  59. maxframe/dataframe/indexing/set_index.py +68 -1
  60. maxframe/dataframe/initializer.py +11 -1
  61. maxframe/dataframe/merge/__init__.py +9 -1
  62. maxframe/dataframe/merge/concat.py +41 -31
  63. maxframe/dataframe/merge/merge.py +237 -3
  64. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  65. maxframe/dataframe/misc/__init__.py +4 -0
  66. maxframe/dataframe/misc/apply.py +6 -11
  67. maxframe/dataframe/misc/case_when.py +141 -0
  68. maxframe/dataframe/misc/describe.py +2 -2
  69. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  70. maxframe/dataframe/misc/eval.py +4 -0
  71. maxframe/dataframe/misc/memory_usage.py +2 -2
  72. maxframe/dataframe/misc/pct_change.py +1 -83
  73. maxframe/dataframe/misc/pivot_table.py +262 -0
  74. maxframe/dataframe/misc/tests/test_misc.py +93 -1
  75. maxframe/dataframe/misc/transform.py +1 -30
  76. maxframe/dataframe/misc/value_counts.py +4 -17
  77. maxframe/dataframe/missing/dropna.py +1 -1
  78. maxframe/dataframe/missing/fillna.py +5 -5
  79. maxframe/dataframe/operators.py +1 -17
  80. maxframe/dataframe/plotting/core.py +2 -2
  81. maxframe/dataframe/reduction/core.py +4 -3
  82. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  83. maxframe/dataframe/sort/sort_values.py +1 -11
  84. maxframe/dataframe/statistics/corr.py +3 -3
  85. maxframe/dataframe/statistics/quantile.py +13 -19
  86. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  87. maxframe/dataframe/tests/test_initializer.py +33 -2
  88. maxframe/dataframe/utils.py +33 -11
  89. maxframe/dataframe/window/expanding.py +5 -3
  90. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  91. maxframe/errors.py +13 -0
  92. maxframe/extension.py +12 -0
  93. maxframe/io/__init__.py +13 -0
  94. maxframe/io/objects/__init__.py +24 -0
  95. maxframe/io/objects/core.py +140 -0
  96. maxframe/io/objects/tensor.py +76 -0
  97. maxframe/io/objects/tests/__init__.py +13 -0
  98. maxframe/io/objects/tests/test_object_io.py +97 -0
  99. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  100. maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
  101. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  102. maxframe/io/odpsio/tableio.py +719 -0
  103. maxframe/io/odpsio/tests/__init__.py +13 -0
  104. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
  105. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  106. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  107. maxframe/io/odpsio/volumeio.py +63 -0
  108. maxframe/learn/contrib/__init__.py +3 -1
  109. maxframe/learn/contrib/graph/__init__.py +15 -0
  110. maxframe/learn/contrib/graph/connected_components.py +215 -0
  111. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  112. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  113. maxframe/learn/contrib/llm/__init__.py +16 -0
  114. maxframe/learn/contrib/llm/core.py +54 -0
  115. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  116. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  117. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  118. maxframe/learn/contrib/llm/text.py +42 -0
  119. maxframe/learn/contrib/utils.py +52 -0
  120. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  121. maxframe/learn/contrib/xgboost/classifier.py +110 -0
  122. maxframe/learn/contrib/xgboost/core.py +241 -0
  123. maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
  124. maxframe/learn/contrib/xgboost/predict.py +121 -0
  125. maxframe/learn/contrib/xgboost/regressor.py +71 -0
  126. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  127. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  128. maxframe/learn/contrib/xgboost/train.py +132 -0
  129. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  130. maxframe/learn/utils/__init__.py +15 -0
  131. maxframe/learn/utils/core.py +29 -0
  132. maxframe/lib/mmh3.cp39-win32.pyd +0 -0
  133. maxframe/lib/mmh3.pyi +43 -0
  134. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  135. maxframe/lib/wrapped_pickle.py +2 -1
  136. maxframe/opcodes.py +11 -0
  137. maxframe/protocol.py +154 -27
  138. maxframe/remote/core.py +4 -8
  139. maxframe/serialization/__init__.py +1 -0
  140. maxframe/serialization/core.cp39-win32.pyd +0 -0
  141. maxframe/serialization/core.pxd +3 -0
  142. maxframe/serialization/core.pyi +64 -0
  143. maxframe/serialization/core.pyx +67 -26
  144. maxframe/serialization/exception.py +1 -1
  145. maxframe/serialization/pandas.py +52 -17
  146. maxframe/serialization/serializables/core.py +180 -15
  147. maxframe/serialization/serializables/field_type.py +4 -1
  148. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  149. maxframe/serialization/tests/test_serial.py +2 -1
  150. maxframe/session.py +37 -2
  151. maxframe/tensor/__init__.py +81 -2
  152. maxframe/tensor/arithmetic/isclose.py +1 -0
  153. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  154. maxframe/tensor/core.py +5 -136
  155. maxframe/tensor/datasource/array.py +7 -2
  156. maxframe/tensor/datasource/full.py +1 -1
  157. maxframe/tensor/datasource/scalar.py +1 -1
  158. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  159. maxframe/tensor/indexing/flatnonzero.py +1 -1
  160. maxframe/tensor/indexing/getitem.py +2 -0
  161. maxframe/tensor/merge/__init__.py +2 -0
  162. maxframe/tensor/merge/concatenate.py +101 -0
  163. maxframe/tensor/merge/tests/test_merge.py +30 -1
  164. maxframe/tensor/merge/vstack.py +74 -0
  165. maxframe/tensor/{base → misc}/__init__.py +4 -0
  166. maxframe/tensor/misc/atleast_1d.py +72 -0
  167. maxframe/tensor/misc/atleast_2d.py +70 -0
  168. maxframe/tensor/misc/atleast_3d.py +85 -0
  169. maxframe/tensor/misc/tests/__init__.py +13 -0
  170. maxframe/tensor/{base → misc}/transpose.py +22 -18
  171. maxframe/tensor/misc/unique.py +205 -0
  172. maxframe/tensor/operators.py +1 -7
  173. maxframe/tensor/random/core.py +1 -1
  174. maxframe/tensor/reduction/count_nonzero.py +2 -1
  175. maxframe/tensor/reduction/mean.py +1 -0
  176. maxframe/tensor/reduction/nanmean.py +1 -0
  177. maxframe/tensor/reduction/nanvar.py +2 -0
  178. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  179. maxframe/tensor/reduction/var.py +2 -0
  180. maxframe/tensor/statistics/quantile.py +2 -2
  181. maxframe/tensor/utils.py +2 -22
  182. maxframe/tests/test_protocol.py +34 -0
  183. maxframe/tests/test_utils.py +0 -12
  184. maxframe/tests/utils.py +17 -2
  185. maxframe/typing_.py +4 -1
  186. maxframe/udf.py +62 -3
  187. maxframe/utils.py +112 -86
  188. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  189. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
  190. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  191. maxframe_client/__init__.py +0 -1
  192. maxframe_client/clients/framedriver.py +4 -1
  193. maxframe_client/fetcher.py +123 -54
  194. maxframe_client/session/consts.py +3 -0
  195. maxframe_client/session/graph.py +8 -2
  196. maxframe_client/session/odps.py +223 -40
  197. maxframe_client/session/task.py +108 -80
  198. maxframe_client/tests/test_fetcher.py +21 -3
  199. maxframe_client/tests/test_session.py +136 -8
  200. maxframe/core/entity/chunks.py +0 -68
  201. maxframe/core/entity/fuse.py +0 -73
  202. maxframe/core/graph/builder/chunk.py +0 -430
  203. maxframe/odpsio/tableio.py +0 -300
  204. maxframe/odpsio/volumeio.py +0 -95
  205. maxframe_client/clients/spe.py +0 -104
  206. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  207. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  208. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  209. /maxframe/tensor/{base → misc}/astype.py +0 -0
  210. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  211. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  212. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  213. /maxframe/tensor/{base → misc}/where.py +0 -0
  214. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,141 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ from pandas.core.dtypes.cast import find_common_type
17
+
18
+ from ... import opcodes
19
+ from ...core import TILEABLE_TYPE
20
+ from ...serialization.serializables import FieldTypes, ListField
21
+ from ..core import SERIES_TYPE
22
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
23
+ from ..utils import apply_if_callable
24
+
25
+
26
class DataFrameCaseWhen(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator backing :func:`case_when` (``Series.case_when``).

    Holds the per-case conditions and replacements. Entries of both lists
    may be tileables (e.g. Series) or plain scalars / array-likes; only
    the tileable entries participate in the operator's input list.
    """

    _op_type_ = opcodes.CASE_WHEN

    # conditions[i] pairs with replacements[i] to form one case.
    conditions = ListField("conditions", FieldTypes.reference, default=None)
    replacements = ListField("replacements", FieldTypes.reference, default=None)

    def __init__(self, output_types=None, **kw):
        super().__init__(_output_types=output_types, **kw)

    def _set_inputs(self, inputs):
        # Re-bind the tileable entries of ``conditions``/``replacements``
        # to the freshly supplied input nodes (presumably needed when the
        # operator's inputs are replaced, e.g. after deserialization —
        # verify against the operator framework).  ``inputs`` is ordered
        # as [source series, *tileable conditions, *tileable replacements]
        # — the same order ``__call__`` builds — so skip the series and
        # consume the iterator in order; non-tileable entries are kept.
        super()._set_inputs(inputs)
        it = iter(inputs)
        next(it)
        self.conditions = [
            next(it) if isinstance(t, TILEABLE_TYPE) else t for t in self.conditions
        ]
        self.replacements = [
            next(it) if isinstance(t, TILEABLE_TYPE) else t for t in self.replacements
        ]

    def __call__(self, series):
        # The result dtype is the common dtype of the source series and
        # every replacement candidate; non-series replacements are probed
        # through ``np.array(...)`` to obtain a dtype.
        replacement_dtypes = [
            it.dtype if isinstance(it, SERIES_TYPE) else np.array(it).dtype
            for it in self.replacements
        ]
        dtype = find_common_type([series.dtype] + replacement_dtypes)

        condition_tileables = [
            it for it in self.conditions if isinstance(it, TILEABLE_TYPE)
        ]
        replacement_tileables = [
            it for it in self.replacements if isinstance(it, TILEABLE_TYPE)
        ]
        # Input order must match the consumption order in ``_set_inputs``.
        inputs = [series] + condition_tileables + replacement_tileables

        params = series.params
        params["dtype"] = dtype
        return self.new_series(inputs, **params)
64
+
65
+
66
def case_when(series, caselist):
    """
    Replace values where the conditions are True.

    Parameters
    ----------
    caselist : A list of tuples of conditions and expected replacements
        Takes the form: ``(condition0, replacement0)``,
        ``(condition1, replacement1)``, ... .
        Each ``condition`` is a 1-D boolean array-like object or a
        callable. A callable condition is evaluated on the Series and
        must produce a boolean Series or array without mutating its
        input (pandas doesn't check this). Each ``replacement`` is a
        1-D array-like object, a scalar or a callable; a callable
        replacement is evaluated on the Series and must produce a
        scalar or Series, again without mutating its input.

    Returns
    -------
    Series

    See Also
    --------
    Series.mask : Replace values where the condition is True.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> c = md.Series([6, 7, 8, 9], name='c')
    >>> a = md.Series([0, 0, 1, 2])
    >>> b = md.Series([0, 3, 4, 5])

    >>> c.case_when(caselist=[(a.gt(0), a),  # condition, replacement
    ...                       (b.gt(0), b)]).execute()
    0    6
    1    3
    2    1
    3    2
    Name: c, dtype: int64
    """
    # Validate the overall container first, then each entry in order so
    # the error messages reference the offending position.
    if not isinstance(caselist, list):
        raise TypeError(
            f"The caselist argument should be a list; instead got {type(caselist)}"
        )

    if not caselist:
        raise ValueError(
            "provide at least one boolean condition, "
            "with a corresponding replacement."
        )

    for num, entry in enumerate(caselist):
        if not isinstance(entry, tuple):
            raise TypeError(
                f"Argument {num} must be a tuple; instead got {type(entry)}."
            )
        if len(entry) != 2:
            raise ValueError(
                f"Argument {num} must have length 2; "
                "a condition and replacement; "
                f"instead got length {len(entry)}."
            )

    # Resolve callables against the source series, keeping conditions and
    # replacements in two parallel lists for the operator.
    conditions = []
    replacements = []
    for condition, replacement in caselist:
        conditions.append(apply_if_callable(condition, series))
        replacements.append(apply_if_callable(replacement, series))

    op = DataFrameCaseWhen(conditions=conditions, replacements=replacements)
    return op(series)
@@ -15,7 +15,7 @@
15
15
  import numpy as np
16
16
  import pandas as pd
17
17
 
18
- from ... import opcodes as OperandDef
18
+ from ... import opcodes
19
19
  from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
20
20
  from ..core import SERIES_TYPE
21
21
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
23
23
 
24
24
 
25
25
  class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
26
- _op_type_ = OperandDef.DESCRIBE
26
+ _op_type_ = opcodes.DESCRIBE
27
27
 
28
28
  input = KeyField("input", default=None)
29
29
  percentiles = ListField("percentiles", FieldTypes.float64, default=None)
@@ -37,16 +37,19 @@ class DataFrameDropDuplicates(DuplicateOperand):
37
37
  shape += (3,)
38
38
  return shape
39
39
 
40
- @classmethod
41
- def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
40
+ def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
42
41
  params = input_params.copy()
43
- if op.ignore_index:
42
+ if op.ignore_index and self._output_types[0] != OutputType.index:
44
43
  params["index_value"] = parse_index(pd.RangeIndex(-1))
45
44
  else:
46
45
  params["index_value"] = gen_unknown_index_value(
47
- input_params["index_value"], op.keep, op.subset, type(op).__name__
46
+ input_params["index_value"],
47
+ op.keep,
48
+ op.subset,
49
+ type(op).__name__,
50
+ normalize_range_index=True,
48
51
  )
49
- params["shape"] = cls._get_shape(input_params["shape"], op)
52
+ params["shape"] = self._get_shape(input_params["shape"], op)
50
53
  return params
51
54
 
52
55
  def __call__(self, inp, inplace=False):
@@ -151,17 +154,14 @@ def series_drop_duplicates(
151
154
  With the 'keep' parameter, the selection behaviour of duplicated values
152
155
  can be changed. The value 'first' keeps the first occurrence for each
153
156
  set of duplicated entries. The default value of keep is 'first'.
154
-
155
157
  >>> s.drop_duplicates().execute()
156
158
  0 lame
157
159
  1 cow
158
160
  3 beetle
159
161
  5 hippo
160
162
  Name: animal, dtype: object
161
-
162
163
  The value 'last' for parameter 'keep' keeps the last occurrence for
163
164
  each set of duplicated entries.
164
-
165
165
  >>> s.drop_duplicates(keep='last').execute()
166
166
  1 cow
167
167
  3 beetle
@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
120
120
  if obj_name in self.env:
121
121
  self.referenced_vars.add(obj_name)
122
122
  return self.env[obj_name]
123
+ try:
124
+ return self.target[obj_name]
125
+ except KeyError:
126
+ pass
123
127
  raise KeyError(f"name {obj_name} is not defined")
124
128
 
125
129
  def visit(self, node):
@@ -58,7 +58,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
58
58
  """
59
59
  if df_or_series.ndim == 1:
60
60
  # the input data is a series, a Scalar will be returned
61
- return self.new_scalar([df_or_series], dtype=np.dtype(np.int_))
61
+ return self.new_scalar([df_or_series], dtype=np.dtype(int))
62
62
  else:
63
63
  # the input data is a DataFrame, a Scalar will be returned
64
64
  # calculate shape of returning series given ``op.index``
@@ -71,7 +71,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
71
71
  [df_or_series],
72
72
  index_value=self._adapt_index(df_or_series.columns_value),
73
73
  shape=new_shape,
74
- dtype=np.dtype(np.int_),
74
+ dtype=np.dtype(int),
75
75
  )
76
76
 
77
77
 
@@ -18,6 +18,7 @@ from ..utils import validate_axis
18
18
  def pct_change(
19
19
  df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
20
20
  ):
21
+ # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
21
22
  """
22
23
  Percentage change between the current and a prior element.
23
24
 
@@ -50,89 +51,6 @@ def pct_change(
50
51
  DataFrame.diff : Compute the difference of two elements in a DataFrame.
51
52
  Series.shift : Shift the index by some number of periods.
52
53
  DataFrame.shift : Shift the index by some number of periods.
53
-
54
- Examples
55
- --------
56
- **Series**
57
-
58
- >>> import maxframe.dataframe as md
59
-
60
- >>> s = md.Series([90, 91, 85])
61
- >>> s.execute()
62
- 0 90
63
- 1 91
64
- 2 85
65
- dtype: int64
66
-
67
- >>> s.pct_change().execute()
68
- 0 NaN
69
- 1 0.011111
70
- 2 -0.065934
71
- dtype: float64
72
-
73
- >>> s.pct_change(periods=2).execute()
74
- 0 NaN
75
- 1 NaN
76
- 2 -0.055556
77
- dtype: float64
78
-
79
- See the percentage change in a Series where filling NAs with last
80
- valid observation forward to next valid.
81
-
82
- >>> s = md.Series([90, 91, None, 85])
83
- >>> s.execute()
84
- 0 90.0
85
- 1 91.0
86
- 2 NaN
87
- 3 85.0
88
- dtype: float64
89
-
90
- >>> s.pct_change(fill_method='ffill').execute()
91
- 0 NaN
92
- 1 0.011111
93
- 2 0.000000
94
- 3 -0.065934
95
- dtype: float64
96
-
97
- **DataFrame**
98
-
99
- Percentage change in French franc, Deutsche Mark, and Italian lira from
100
- 1980-01-01 to 1980-03-01.
101
-
102
- >>> df = md.DataFrame({
103
- ... 'FR': [4.0405, 4.0963, 4.3149],
104
- ... 'GR': [1.7246, 1.7482, 1.8519],
105
- ... 'IT': [804.74, 810.01, 860.13]},
106
- ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
107
- >>> df.execute()
108
- FR GR IT
109
- 1980-01-01 4.0405 1.7246 804.74
110
- 1980-02-01 4.0963 1.7482 810.01
111
- 1980-03-01 4.3149 1.8519 860.13
112
-
113
- >>> df.pct_change().execute()
114
- FR GR IT
115
- 1980-01-01 NaN NaN NaN
116
- 1980-02-01 0.013810 0.013684 0.006549
117
- 1980-03-01 0.053365 0.059318 0.061876
118
-
119
- Percentage of change in GOOG and APPL stock volume. Shows computing
120
- the percentage change between columns.
121
-
122
- >>> df = md.DataFrame({
123
- ... '2016': [1769950, 30586265],
124
- ... '2015': [1500923, 40912316],
125
- ... '2014': [1371819, 41403351]},
126
- ... index=['GOOG', 'APPL'])
127
- >>> df.execute()
128
- 2016 2015 2014
129
- GOOG 1769950 1500923 1371819
130
- APPL 30586265 40912316 41403351
131
-
132
- >>> df.pct_change(axis='columns').execute()
133
- 2016 2015 2014
134
- GOOG NaN -0.151997 -0.086016
135
- APPL NaN 0.337604 0.012002
136
54
  """
137
55
 
138
56
  axis = validate_axis(kwargs.pop("axis", 0))
@@ -0,0 +1,262 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ from pandas.api.types import is_list_like
18
+
19
+ from ... import opcodes
20
+ from ...core import OutputType
21
+ from ...serialization.serializables import AnyField, BoolField, StringField
22
+ from ...utils import no_default
23
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
24
+ from ..utils import build_df, parse_index
25
+
26
+
27
class DataFramePivotTable(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator backing :func:`pivot_table`.

    Stores the pivot parameters and infers as much output metadata
    (index, columns, dtypes) as can be known before execution.
    """

    _op_type_ = opcodes.PIVOT_TABLE

    values = AnyField("values", default=None)
    index = AnyField("index", default=None)
    columns = AnyField("columns", default=None)
    aggfunc = AnyField("aggfunc", default="mean")
    fill_value = AnyField("fill_value", default=None)
    margins = BoolField("margins", default=False)
    dropna = BoolField("dropna", default=True)
    margins_name = StringField("margins_name", default=None)
    sort = BoolField("sort", default=False)

    def __init__(self, **kw):
        super().__init__(**kw)
        # The result of a pivot is always a DataFrame.
        self.output_types = [OutputType.dataframe]

    def __call__(self, df):
        index_value = columns_value = dtypes = None
        if self.index is not None:
            # index is now a required field
            if len(self.index) == 1:
                # Single key: the output index carries that column's dtype.
                index_data = pd.Index(
                    [], dtype=df.dtypes[self.index[0]], name=self.index[0]
                )
            else:
                # Multiple keys: build an empty MultiIndex from the key columns.
                index_data = pd.MultiIndex.from_frame(build_df(df[self.index]))
            index_value = parse_index(index_data)

        if self.columns is None:  # output columns can be determined
            # Without a ``columns`` argument the pivot's metadata matches a
            # plain groupby-aggregate, so borrow its inferred params.
            sel_df = df
            groupby_obj = sel_df.groupby(self.index)
            if self.values:
                groupby_obj = groupby_obj[self.values]
            aggregated_df = groupby_obj.agg(self.aggfunc)
            index_value = aggregated_df.index_value
            columns_value = aggregated_df.columns_value
            dtypes = aggregated_df.dtypes
        else:
            # Output columns depend on the data values and are unknown
            # until execution.
            columns_value = dtypes = None
        # Row and column counts are both unknown before execution.
        return self.new_dataframe(
            [df],
            shape=(np.nan, np.nan),
            dtypes=dtypes,
            columns_value=columns_value,
            index_value=index_value,
        )
74
+
75
+
76
def pivot_table(
    data,
    values=None,
    index=None,
    columns=None,
    aggfunc="mean",
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name="All",
    sort=True,
):
    """
    Create a spreadsheet-style pivot table as a DataFrame.

    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.

    Parameters
    ----------
    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table index. If an array is passed,
        it is being used as the same manner as column values.
    columns : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table column. If an array is passed,
        it is being used as the same manner as column values.
    aggfunc : function, list of functions, dict, default numpy.mean
        If list of functions passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names
        (inferred from the function objects themselves)
        If dict is passed, the key is column to aggregate and value
        is function or list of functions.
    fill_value : scalar, default None
        Value to replace missing values with (in the resulting pivot table,
        after aggregation).
    margins : bool, default False
        Add all row / columns (e.g. for subtotal / grand totals).
    dropna : bool, default True
        Do not include columns whose entries are all NaN.
    margins_name : str, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.
    sort : bool, default True
        Specifies if the result should be sorted.

    Returns
    -------
    DataFrame
        An Excel style pivot table.

    See Also
    --------
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.melt: Unpivot a DataFrame from wide to long format,
        optionally leaving identifiers set.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
    ...                          "bar", "bar", "bar", "bar"],
    ...                    "B": ["one", "one", "one", "two", "two",
    ...                          "one", "one", "two", "two"],
    ...                    "C": ["small", "large", "large", "small",
    ...                          "small", "large", "small", "small",
    ...                          "large"],
    ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
    >>> df.execute()
         A    B      C  D  E
    0  foo  one  small  1  2
    1  foo  one  large  2  4
    2  foo  one  large  2  5
    3  foo  two  small  3  5
    4  foo  two  small  3  6
    5  bar  one  large  4  6
    6  bar  one  small  5  8
    7  bar  two  small  6  9
    8  bar  two  large  7  9

    This first example aggregates values by taking the sum.

    >>> table = md.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum)
    >>> table.execute()
    C        large  small
    A   B
    bar one    4.0    5.0
        two    7.0    6.0
    foo one    4.0    1.0
        two    NaN    6.0

    We can also fill missing values using the `fill_value` parameter.

    >>> table = md.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum, fill_value=0)
    >>> table.execute()
    C        large  small
    A   B
    bar one      4      5
        two      7      6
    foo one      4      1
        two      0      6

    The next example aggregates by taking the mean across multiple columns.

    >>> table = md.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': np.mean})
    >>> table.execute()
                    D         E
    A   C
    bar large  5.500000  7.500000
        small  5.500000  8.500000
    foo large  2.000000  4.500000
        small  2.333333  4.333333

    We can also calculate multiple types of aggregations for any given
    value column.

    >>> table = md.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': [min, max, np.mean]})
    >>> table.execute()
                    D   E
                 mean max      mean  min
    A   C
    bar large  5.500000  9.0  7.500000  6.0
        small  5.500000  9.0  8.500000  8.0
    foo large  2.000000  5.0  4.500000  4.0
        small  2.333333  6.0  4.333333  2.0
    """
    if index is None and columns is None:
        raise ValueError(
            "No group keys passed, need to specify at least one of index or columns"
        )

    def make_col_list(col):
        # Normalize a single existing column label to a one-element list.
        # The membership test raises TypeError for unhashable arguments
        # (e.g. when ``col`` is already a list), in which case — like for
        # labels not present in the data — the argument is returned as-is.
        try:
            if col in data.dtypes.index:
                return [col]
        except TypeError:
            return col
        return col

    values_list = make_col_list(values)
    index_list = make_col_list(index)
    columns_list = make_col_list(columns)

    # Validate each provided argument: it must be list-like after
    # normalization and every entry must name an existing column.
    name_to_attr = {"values": values_list, "index": index_list, "columns": columns_list}
    for key, val in name_to_attr.items():
        if val is None:
            continue
        if not is_list_like(val):
            raise ValueError(f"Need to specify {key} as a list-like object.")
        non_exist_key = next((c for c in val if c not in data.dtypes.index), no_default)
        if non_exist_key is not no_default:
            raise ValueError(
                f"Column {non_exist_key} specified in {key} is not a valid column."
            )

    # Without ``columns`` (and without margins) the pivot degenerates to a
    # plain groupby-aggregate, so short-circuit to that cheaper path.
    if columns is None and not margins:
        if values_list:
            data = data[index_list + values_list]
        return data.groupby(index, sort=sort).agg(aggfunc)

    op = DataFramePivotTable(
        values=values,
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=fill_value,
        margins=margins,
        dropna=dropna,
        margins_name=margins_name,
        sort=sort,
    )
    return op(data)