maxframe 0.1.0b4-cp310-cp310-win32.whl → 0.1.0b5-cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (53)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp310-win32.pyd +0 -0
  3. maxframe/codegen.py +46 -1
  4. maxframe/config/config.py +11 -1
  5. maxframe/core/graph/core.cp310-win32.pyd +0 -0
  6. maxframe/dataframe/__init__.py +1 -0
  7. maxframe/dataframe/core.py +30 -8
  8. maxframe/dataframe/datasource/read_odps_query.py +3 -1
  9. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  10. maxframe/dataframe/misc/__init__.py +4 -0
  11. maxframe/dataframe/misc/apply.py +1 -1
  12. maxframe/dataframe/misc/case_when.py +141 -0
  13. maxframe/dataframe/misc/pivot_table.py +262 -0
  14. maxframe/dataframe/misc/tests/test_misc.py +61 -0
  15. maxframe/dataframe/plotting/core.py +2 -2
  16. maxframe/dataframe/reduction/core.py +2 -1
  17. maxframe/dataframe/utils.py +7 -0
  18. maxframe/learn/contrib/utils.py +52 -0
  19. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  20. maxframe/learn/contrib/xgboost/classifier.py +86 -0
  21. maxframe/learn/contrib/xgboost/core.py +156 -0
  22. maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
  23. maxframe/learn/contrib/xgboost/predict.py +138 -0
  24. maxframe/learn/contrib/xgboost/regressor.py +78 -0
  25. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  26. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  27. maxframe/learn/contrib/xgboost/train.py +121 -0
  28. maxframe/learn/utils/__init__.py +15 -0
  29. maxframe/learn/utils/core.py +29 -0
  30. maxframe/lib/mmh3.cp310-win32.pyd +0 -0
  31. maxframe/odpsio/arrow.py +2 -3
  32. maxframe/odpsio/tableio.py +22 -0
  33. maxframe/odpsio/tests/test_schema.py +16 -11
  34. maxframe/opcodes.py +3 -0
  35. maxframe/serialization/core.cp310-win32.pyd +0 -0
  36. maxframe/serialization/core.pyi +61 -0
  37. maxframe/session.py +28 -0
  38. maxframe/tensor/__init__.py +1 -1
  39. maxframe/tensor/base/__init__.py +2 -0
  40. maxframe/tensor/base/atleast_1d.py +74 -0
  41. maxframe/tensor/base/unique.py +205 -0
  42. maxframe/tensor/datasource/array.py +4 -2
  43. maxframe/tensor/datasource/scalar.py +1 -1
  44. maxframe/udf.py +63 -3
  45. maxframe/utils.py +6 -0
  46. {maxframe-0.1.0b4.dist-info → maxframe-0.1.0b5.dist-info}/METADATA +2 -2
  47. {maxframe-0.1.0b4.dist-info → maxframe-0.1.0b5.dist-info}/RECORD +53 -36
  48. maxframe_client/fetcher.py +65 -3
  49. maxframe_client/session/odps.py +30 -1
  50. maxframe_client/session/task.py +26 -53
  51. maxframe_client/tests/test_session.py +28 -1
  52. {maxframe-0.1.0b4.dist-info → maxframe-0.1.0b5.dist-info}/WHEEL +0 -0
  53. {maxframe-0.1.0b4.dist-info → maxframe-0.1.0b5.dist-info}/top_level.txt +0 -0
maxframe/__init__.py CHANGED
@@ -13,6 +13,7 @@
  # limitations under the License.

  from . import dataframe, learn, remote, tensor
+ from .config import options
  from .session import execute, fetch, new_session, stop_server


maxframe/_utils.cp310-win32.pyd CHANGED
Binary file
maxframe/codegen.py CHANGED
@@ -16,6 +16,7 @@ import abc
  import base64
  import dataclasses
  import logging
+ from collections import defaultdict
  from enum import Enum
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union

@@ -32,7 +33,7 @@ from .protocol import DataFrameTableMeta, ResultInfo
  from .serialization import PickleContainer
  from .serialization.serializables import Serializable, StringField
  from .typing_ import PandasObjectTypes
- from .udf import MarkedFunction
+ from .udf import MarkedFunction, PythonPackOptions

  if TYPE_CHECKING:
      from odpsctx import ODPSSessionContext
@@ -75,6 +76,14 @@ class AbstractUDF(Serializable):
      def unregister(self, odps: "ODPSSessionContext"):
          raise NotImplementedError

+     @abc.abstractmethod
+     def collect_pythonpack(self) -> List[PythonPackOptions]:
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def load_pythonpack_resources(self, odps_ctx: "ODPSSessionContext") -> None:
+         raise NotImplementedError
+

  class UserCodeMixin:
      @classmethod
@@ -469,6 +478,42 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
              output_key_to_result_infos=self._context.get_tileable_result_infos(),
          )

+     def run_pythonpacks(
+         self,
+         odps_ctx: "ODPSSessionContext",
+         python_tag: str,
+         is_production: bool = False,
+         schedule_id: Optional[str] = None,
+         hints: Optional[dict] = None,
+         priority: Optional[int] = None,
+     ) -> Dict[str, PythonPackOptions]:
+         key_to_packs = defaultdict(list)
+         for udf in self._context.get_udfs():
+             for pack in udf.collect_pythonpack():
+                 key_to_packs[pack.key].append(pack)
+         distinct_packs = []
+         for packs in key_to_packs.values():
+             distinct_packs.append(packs[0])
+
+         inst_id_to_req = {}
+         for pack in distinct_packs:
+             inst = odps_ctx.run_pythonpack(
+                 requirements=pack.requirements,
+                 prefer_binary=pack.prefer_binary,
+                 pre_release=pack.pre_release,
+                 force_rebuild=pack.force_rebuild,
+                 python_tag=python_tag,
+                 is_production=is_production,
+                 schedule_id=schedule_id,
+                 hints=hints,
+                 priority=priority,
+             )
+             # fulfill instance id of pythonpacks with same keys
+             for same_pack in key_to_packs[pack.key]:
+                 same_pack.pack_instance_id = inst.id
+             inst_id_to_req[inst.id] = pack
+         return inst_id_to_req
+
      def register_udfs(self, odps_ctx: "ODPSSessionContext"):
          for udf in self._context.get_udfs():
              logger.info("[Session %s] Registering UDF %s", self._session_id, udf.name)
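The new run_pythonpacks method groups PythonPack requests by pack.key so each distinct requirement set is built only once, then writes the resulting instance id back onto every pack that shares the key. A minimal standalone sketch of that grouping pattern, using a hypothetical PackSpec in place of PythonPackOptions and a caller-supplied submit callable instead of the ODPS context:

    from collections import defaultdict
    from dataclasses import dataclass
    from typing import Callable, Dict, List, Optional


    @dataclass
    class PackSpec:
        # Hypothetical stand-in for PythonPackOptions; only the fields the
        # grouping logic touches are modeled here.
        key: str
        requirements: List[str]
        pack_instance_id: Optional[str] = None


    def assign_pack_instances(
        packs: List[PackSpec], submit: Callable[[List[str]], str]
    ) -> Dict[str, PackSpec]:
        """Build one pack per distinct key and fan the instance id back out."""
        key_to_packs: Dict[str, List[PackSpec]] = defaultdict(list)
        for pack in packs:
            key_to_packs[pack.key].append(pack)

        inst_id_to_req = {}
        for same_key in key_to_packs.values():
            inst_id = submit(same_key[0].requirements)  # one build per distinct key
            for pack in same_key:
                pack.pack_instance_id = inst_id  # mirror the id onto duplicates
            inst_id_to_req[inst_id] = same_key[0]
        return inst_id_to_req


    # Example: two UDFs asking for identical requirements share one build.
    packs = [PackSpec("k1", ["pandas"]), PackSpec("k1", ["pandas"]), PackSpec("k2", ["scipy"])]
    print(assign_pack_instances(packs, submit=lambda reqs: "inst-" + "-".join(reqs)))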
maxframe/config/config.py CHANGED
@@ -40,6 +40,7 @@ _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
  _DEFAULT_UPLOAD_BATCH_SIZE = 4096
  _DEFAULT_TEMP_LIFECYCLE = 1
  _DEFAULT_TASK_START_TIMEOUT = 60
+ _DEFAULT_LOGVIEW_HOURS = 24 * 60


  class OptionError(Exception):
@@ -296,13 +297,15 @@


  default_options = Config()
-
  default_options.register_option(
      "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
  )
  default_options.register_option(
      "python_tag", get_python_tag(), validator=is_string, remote=True
  )
+ default_options.register_option(
+     "session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
+ )
  default_options.register_option(
      "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
  )
@@ -312,6 +315,9 @@ default_options.register_option(
  )
  default_options.register_option("sql.settings", {}, validator=is_dict, remote=True)

+ default_options.register_option("is_production", False, validator=is_bool, remote=True)
+ default_options.register_option("schedule_id", "", validator=is_string, remote=True)
+
  default_options.register_option(
      "session.max_alive_seconds",
      _DEFAULT_MAX_ALIVE_SECONDS,
@@ -376,6 +382,10 @@ default_options.register_option(
      "spe.task.settings", dict(), validator=is_dict, remote=True
  )

+ default_options.register_option(
+     "pythonpack.task.settings", {}, validator=is_dict, remote=True
+ )
+
  _options_ctx_var = contextvars.ContextVar("_options_ctx_var")


maxframe/core/graph/core.cp310-win32.pyd CHANGED
Binary file
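Taken together with the from .config import options export added to maxframe/__init__.py above, the new configuration entries can be set the same way as existing ones. A hedged illustration (option names come from the config.py diff above; the values are placeholders, not recommendations):

    import maxframe as mf

    # maxframe.options and maxframe.config.options refer to the same object
    # after the new top-level export.
    from maxframe.config import options

    assert mf.options is options

    # Options registered in this release, per the config.py diff above.
    options.session.logview_hours = 7 * 24    # placeholder: keep logview links a week
    options.is_production = False             # remote=True: forwarded to the service
    options.schedule_id = ""                  # left empty outside scheduled workflows
    options.pythonpack.task.settings = {}     # extra task settings for PythonPack builds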
maxframe/dataframe/__init__.py CHANGED
@@ -46,6 +46,7 @@ from .misc.cut import cut
  from .misc.eval import maxframe_eval as eval  # pylint: disable=redefined-builtin
  from .misc.get_dummies import get_dummies
  from .misc.melt import melt
+ from .misc.pivot_table import pivot_table
  from .misc.qcut import qcut
  from .misc.to_numeric import to_numeric
  from .missing import isna, isnull, notna, notnull
maxframe/dataframe/core.py CHANGED
@@ -35,6 +35,7 @@ from ..core import (
      register_output_types,
  )
  from ..core.entity.utils import refresh_tileable_shape
+ from ..protocol import DataFrameTableMeta
  from ..serialization.serializables import (
      AnyField,
      BoolField,
@@ -59,7 +60,13 @@ from ..utils import (
      on_serialize_numpy_type,
      tokenize,
  )
- from .utils import ReprSeries, fetch_corner_data, merge_index_value, parse_index
+ from .utils import (
+     ReprSeries,
+     apply_if_callable,
+     fetch_corner_data,
+     merge_index_value,
+     parse_index,
+ )


  class IndexValue(Serializable):
@@ -616,6 +623,9 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
          if self._name is None:
              self._name = self.chunks[0].name

+     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+         pass
+
      def _to_str(self, representation=False):
          if is_build_mode() or len(self._executed_sessions) == 0:
              # in build mode, or not executed, just return representation
@@ -945,6 +955,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
          if self._name is None:
              self._name = self.chunks[0].name

+     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+         pass
+
      def _to_str(self, representation=False):
          if is_build_mode() or len(self._executed_sessions) == 0:
              # in build mode, or not executed, just return representation
@@ -978,7 +991,7 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
          return self._to_str(representation=False)

      def __repr__(self):
-         return self._to_str(representation=False)
+         return self._to_str(representation=True)

      @property
      def dtype(self):
@@ -1501,6 +1514,15 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
          refresh_index_value(self)
          refresh_dtypes(self)

+     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+         dtypes = table_meta.pd_column_dtypes
+         self._dtypes = dtypes
+         self._columns_value = parse_index(dtypes.index, store_data=True)
+         self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
+         new_shape = list(self._shape)
+         new_shape[0] = len(dtypes)
+         self._shape = tuple(new_shape)
+
      @property
      def dtypes(self):
          dt = getattr(self, "_dtypes", None)
@@ -1997,12 +2019,6 @@ class DataFrame(HasShapeTileable, _ToPandasMixin):
          Berkeley 25.0 77.0 298.15
          """

-         def apply_if_callable(maybe_callable, obj, **kwargs):
-             if callable(maybe_callable):
-                 return maybe_callable(obj, **kwargs)
-
-             return maybe_callable
-
          data = self.copy()

          for k, v in kwargs.items():
@@ -2197,6 +2213,9 @@ class CategoricalData(HasShapeTileableData, _ToPandasMixin):
              pd.Categorical(categories).categories, store_data=True
          )

+     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+         pass
+
      def _to_str(self, representation=False):
          if is_build_mode() or len(self._executed_sessions) == 0:
              # in build mode, or not executed, just return representation
@@ -2347,6 +2366,9 @@ class DataFrameOrSeriesData(HasShapeTileableData, _ToPandasMixin):
              data_params["name"] = self.chunks[0].name
          self._data_params.update(data_params)

+     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+         pass
+
      def ensure_data(self):
          from .fetch.core import DataFrameFetch

maxframe/dataframe/datasource/read_odps_query.py CHANGED
@@ -216,7 +216,9 @@ class DataFrameReadODPSQuery(
              index_value = parse_index(pd.RangeIndex(0))
          elif len(self.index_columns) == 1:
              index_value = parse_index(
-                 pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+                 pd.Index([], name=self.index_columns[0]).astype(
+                     self.index_dtypes.iloc[0]
+                 )
              )
          else:
              idx = pd.MultiIndex.from_frame(
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -82,7 +82,9 @@ class DataFrameReadODPSTable(
              index_value = parse_index(pd.RangeIndex(shape[0]))
          elif len(self.index_columns) == 1:
              index_value = parse_index(
-                 pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+                 pd.Index([], name=self.index_columns[0]).astype(
+                     self.index_dtypes.iloc[0]
+                 )
              )
          else:
              idx = pd.MultiIndex.from_frame(
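Both datasource fixes swap self.index_dtypes[0] for self.index_dtypes.iloc[0]. Assuming index_dtypes is a pandas Series keyed by index column name, [0] goes through label lookup (its positional fallback for non-integer indexes is deprecated), while .iloc[0] is unambiguously positional. A small pandas illustration of the difference:

    import numpy as np
    import pandas as pd

    # Illustrative dtype mapping for two index columns, keyed by column name.
    index_dtypes = pd.Series(
        [np.dtype("int64"), np.dtype("datetime64[ns]")], index=["order_id", "ts"]
    )

    print(index_dtypes.iloc[0])  # positional access -> dtype('int64')

    # index_dtypes[0] would try label lookup first; on a string-labelled Series
    # it only works through pandas' deprecated positional fallback (FutureWarning
    # in pandas 2.x), which is why the diff switches to .iloc[0].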
maxframe/dataframe/misc/__init__.py CHANGED
@@ -14,6 +14,7 @@

  from .apply import df_apply, series_apply
  from .astype import astype, index_astype
+ from .case_when import case_when
  from .check_monotonic import (
      check_monotonic,
      is_monotonic,
@@ -37,6 +38,7 @@ from .map import index_map, series_map
  from .melt import melt
  from .memory_usage import df_memory_usage, index_memory_usage, series_memory_usage
  from .pct_change import pct_change
+ from .pivot_table import pivot_table
  from .qcut import qcut
  from .select_dtypes import select_dtypes
  from .shift import shift, tshift
@@ -69,6 +71,7 @@ def _install():
          setattr(t, "melt", melt)
          setattr(t, "memory_usage", df_memory_usage)
          setattr(t, "pct_change", pct_change)
+         setattr(t, "pivot_table", pivot_table)
          setattr(t, "pop", df_pop)
          setattr(t, "query", df_query)
          setattr(t, "select_dtypes", select_dtypes)
@@ -81,6 +84,7 @@ def _install():
      for t in SERIES_TYPE:
          setattr(t, "apply", series_apply)
          setattr(t, "astype", astype)
+         setattr(t, "case_when", case_when)
          setattr(t, "check_monotonic", check_monotonic)
          setattr(t, "describe", describe)
          setattr(t, "diff", series_diff)
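With these registrations, pivot_table becomes a DataFrame method and case_when a Series method (the case_when implementation appears further down in this diff). The pivot_table implementation itself is not shown in this part of the diff, so the sketch below assumes it follows the familiar pandas-style signature:

    import maxframe.dataframe as md

    df = md.DataFrame(
        {
            "region": ["east", "east", "west", "west"],
            "year": [2023, 2024, 2023, 2024],
            "sales": [10, 12, 7, 9],
        }
    )

    # Assumed pandas-compatible keywords; execution is triggered explicitly.
    pivoted = df.pivot_table(values="sales", index="region", columns="year", aggfunc="sum")
    pivoted.execute()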
maxframe/dataframe/misc/apply.py CHANGED
@@ -225,7 +225,7 @@ class ApplyOperator(
              else:  # pragma: no cover
                  index_value = parse_index(infer_series.index)
          else:
-             index_value = parse_index(None, series)
+             index_value = parse_index(series.index_value)

          if output_type == OutputType.dataframe:
              if dtypes is None:
maxframe/dataframe/misc/case_when.py ADDED
@@ -0,0 +1,141 @@
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #      http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import numpy as np
+ from pandas.core.dtypes.cast import find_common_type
+
+ from ... import opcodes
+ from ...core import TILEABLE_TYPE
+ from ...serialization.serializables import FieldTypes, ListField
+ from ..core import SERIES_TYPE
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
+ from ..utils import apply_if_callable
+
+
+ class DataFrameCaseWhen(DataFrameOperator, DataFrameOperatorMixin):
+     _op_type_ = opcodes.CASE_WHEN
+
+     conditions = ListField("conditions", FieldTypes.reference, default=None)
+     replacements = ListField("replacements", FieldTypes.reference, default=None)
+
+     def __init__(self, output_types=None, **kw):
+         super().__init__(_output_types=output_types, **kw)
+
+     def _set_inputs(self, inputs):
+         super()._set_inputs(inputs)
+         it = iter(inputs)
+         next(it)
+         self.conditions = [
+             next(it) if isinstance(t, TILEABLE_TYPE) else t for t in self.conditions
+         ]
+         self.replacements = [
+             next(it) if isinstance(t, TILEABLE_TYPE) else t for t in self.replacements
+         ]
+
+     def __call__(self, series):
+         replacement_dtypes = [
+             it.dtype if isinstance(it, SERIES_TYPE) else np.array(it).dtype
+             for it in self.replacements
+         ]
+         dtype = find_common_type([series.dtype] + replacement_dtypes)
+
+         condition_tileables = [
+             it for it in self.conditions if isinstance(it, TILEABLE_TYPE)
+         ]
+         replacement_tileables = [
+             it for it in self.replacements if isinstance(it, TILEABLE_TYPE)
+         ]
+         inputs = [series] + condition_tileables + replacement_tileables
+
+         params = series.params
+         params["dtype"] = dtype
+         return self.new_series(inputs, **params)
+
+
+ def case_when(series, caselist):
+     """
+     Replace values where the conditions are True.
+
+     Parameters
+     ----------
+     caselist : A list of tuples of conditions and expected replacements
+         Takes the form: ``(condition0, replacement0)``,
+         ``(condition1, replacement1)``, ... .
+         ``condition`` should be a 1-D boolean array-like object
+         or a callable. If ``condition`` is a callable,
+         it is computed on the Series
+         and should return a boolean Series or array.
+         The callable must not change the input Series
+         (though pandas doesn`t check it). ``replacement`` should be a
+         1-D array-like object, a scalar or a callable.
+         If ``replacement`` is a callable, it is computed on the Series
+         and should return a scalar or Series. The callable
+         must not change the input Series.
+
+     Returns
+     -------
+     Series
+
+     See Also
+     --------
+     Series.mask : Replace values where the condition is True.
+
+     Examples
+     --------
+     >>> import maxframe.dataframe as md
+     >>> c = md.Series([6, 7, 8, 9], name='c')
+     >>> a = md.Series([0, 0, 1, 2])
+     >>> b = md.Series([0, 3, 4, 5])
+
+     >>> c.case_when(caselist=[(a.gt(0), a),  # condition, replacement
+     ...                       (b.gt(0), b)])
+     0    6
+     1    3
+     2    1
+     3    2
+     Name: c, dtype: int64
+     """
+     if not isinstance(caselist, list):
+         raise TypeError(
+             f"The caselist argument should be a list; instead got {type(caselist)}"
+         )
+
+     if not caselist:
+         raise ValueError(
+             "provide at least one boolean condition, "
+             "with a corresponding replacement."
+         )
+
+     for num, entry in enumerate(caselist):
+         if not isinstance(entry, tuple):
+             raise TypeError(
+                 f"Argument {num} must be a tuple; instead got {type(entry)}."
+             )
+         if len(entry) != 2:
+             raise ValueError(
+                 f"Argument {num} must have length 2; "
+                 "a condition and replacement; "
+                 f"instead got length {len(entry)}."
+             )
+     caselist = [
+         (
+             apply_if_callable(condition, series),
+             apply_if_callable(replacement, series),
+         )
+         for condition, replacement in caselist
+     ]
+     conditions = [case[0] for case in caselist]
+     replacements = [case[1] for case in caselist]
+     op = DataFrameCaseWhen(conditions=conditions, replacements=replacements)
+     return op(series)
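The docstring above also allows callables for both the condition and the replacement, evaluated against the Series. A short, hedged usage sketch of that form (executed lazily, as with other MaxFrame operations):

    import maxframe.dataframe as md

    s = md.Series([6, 7, 8, 9], name="c")

    # Callable condition per the docstring contract: it receives the Series and
    # must return a boolean mask; the replacement here is a plain scalar.
    capped = s.case_when(caselist=[(lambda x: x > 7, 0)])
    capped.execute()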