maxframe 0.1.0b3__cp37-cp37m-win_amd64.whl → 0.1.0b5__cp37-cp37m-win_amd64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (58)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp37-win_amd64.pyd +0 -0
  3. maxframe/codegen.py +46 -1
  4. maxframe/config/config.py +14 -1
  5. maxframe/core/graph/core.cp37-win_amd64.pyd +0 -0
  6. maxframe/dataframe/__init__.py +6 -0
  7. maxframe/dataframe/core.py +34 -10
  8. maxframe/dataframe/datasource/read_odps_query.py +6 -2
  9. maxframe/dataframe/datasource/read_odps_table.py +5 -1
  10. maxframe/dataframe/datastore/core.py +19 -0
  11. maxframe/dataframe/datastore/to_csv.py +2 -2
  12. maxframe/dataframe/datastore/to_odps.py +2 -2
  13. maxframe/dataframe/indexing/reset_index.py +1 -17
  14. maxframe/dataframe/misc/__init__.py +4 -0
  15. maxframe/dataframe/misc/apply.py +1 -1
  16. maxframe/dataframe/misc/case_when.py +141 -0
  17. maxframe/dataframe/misc/pivot_table.py +262 -0
  18. maxframe/dataframe/misc/tests/test_misc.py +61 -0
  19. maxframe/dataframe/plotting/core.py +2 -2
  20. maxframe/dataframe/reduction/core.py +2 -1
  21. maxframe/dataframe/utils.py +7 -0
  22. maxframe/learn/contrib/utils.py +52 -0
  23. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  24. maxframe/learn/contrib/xgboost/classifier.py +86 -0
  25. maxframe/learn/contrib/xgboost/core.py +156 -0
  26. maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
  27. maxframe/learn/contrib/xgboost/predict.py +138 -0
  28. maxframe/learn/contrib/xgboost/regressor.py +78 -0
  29. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  30. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  31. maxframe/learn/contrib/xgboost/train.py +121 -0
  32. maxframe/learn/utils/__init__.py +15 -0
  33. maxframe/learn/utils/core.py +29 -0
  34. maxframe/lib/mmh3.cp37-win_amd64.pyd +0 -0
  35. maxframe/odpsio/arrow.py +10 -6
  36. maxframe/odpsio/schema.py +18 -5
  37. maxframe/odpsio/tableio.py +22 -0
  38. maxframe/odpsio/tests/test_schema.py +41 -11
  39. maxframe/opcodes.py +8 -0
  40. maxframe/serialization/core.cp37-win_amd64.pyd +0 -0
  41. maxframe/serialization/core.pyi +61 -0
  42. maxframe/session.py +32 -2
  43. maxframe/tensor/__init__.py +1 -1
  44. maxframe/tensor/base/__init__.py +2 -0
  45. maxframe/tensor/base/atleast_1d.py +74 -0
  46. maxframe/tensor/base/unique.py +205 -0
  47. maxframe/tensor/datasource/array.py +4 -2
  48. maxframe/tensor/datasource/scalar.py +1 -1
  49. maxframe/udf.py +63 -3
  50. maxframe/utils.py +11 -0
  51. {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/METADATA +2 -2
  52. {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/RECORD +58 -40
  53. maxframe_client/fetcher.py +65 -3
  54. maxframe_client/session/odps.py +41 -11
  55. maxframe_client/session/task.py +26 -53
  56. maxframe_client/tests/test_session.py +49 -1
  57. {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/WHEEL +0 -0
  58. {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/top_level.txt +0 -0
@@ -12,16 +12,21 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import base64
16
+ import json
15
17
  from abc import ABC, abstractmethod
16
18
  from numbers import Integral
17
- from typing import Any, Dict, List, Type, Union
19
+ from typing import Any, Dict, List, Optional, Type, Union
18
20
 
21
+ import pandas as pd
19
22
  import pyarrow as pa
20
23
  from odps import ODPS
21
24
  from odps.models import ExternalVolume, PartedVolume
25
+ from odps.tunnel import TableTunnel
22
26
  from tornado import httpclient
23
27
 
24
28
  from maxframe.core import OBJECT_TYPE
29
+ from maxframe.dataframe.core import DATAFRAME_TYPE
25
30
  from maxframe.lib import wrapped_pickle as pickle
26
31
  from maxframe.odpsio import HaloTableIO, arrow_to_pandas, build_dataframe_table_meta
27
32
  from maxframe.protocol import (
@@ -31,8 +36,9 @@ from maxframe.protocol import (
31
36
  ResultInfo,
32
37
  ResultType,
33
38
  )
39
+ from maxframe.tensor.core import TENSOR_TYPE
34
40
  from maxframe.typing_ import PandasObjectTypes, TileableType
35
- from maxframe.utils import ToThreadMixin
41
+ from maxframe.utils import ToThreadMixin, deserialize_serializable
36
42
 
37
43
  _result_fetchers: Dict[ResultType, Type["ResultFetcher"]] = dict()
38
44
 
@@ -52,6 +58,14 @@ class ResultFetcher(ABC):
52
58
  def __init__(self, odps_entry: ODPS):
53
59
  self._odps_entry = odps_entry
54
60
 
61
+ @abstractmethod
62
+ async def update_tileable_meta(
63
+ self,
64
+ tileable: TileableType,
65
+ info: ResultInfo,
66
+ ) -> None:
67
+ raise NotImplementedError
68
+
55
69
  @abstractmethod
56
70
  async def fetch(
57
71
  self,
@@ -66,6 +80,13 @@ class ResultFetcher(ABC):
66
80
  class NullFetcher(ResultFetcher):
67
81
  result_type = ResultType.NULL
68
82
 
83
+ async def update_tileable_meta(
84
+ self,
85
+ tileable: TileableType,
86
+ info: ResultInfo,
87
+ ) -> None:
88
+ return
89
+
69
90
  async def fetch(
70
91
  self,
71
92
  tileable: TileableType,
@@ -79,6 +100,40 @@ class NullFetcher(ResultFetcher):
79
100
  class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
80
101
  result_type = ResultType.ODPS_TABLE
81
102
 
103
+ def _get_table_comment(self, table_name: str) -> Optional[str]:
104
+ table = self._odps_entry.get_table(table_name)
105
+ return getattr(table, "comment", None)
106
+
107
+ async def update_tileable_meta(
108
+ self,
109
+ tileable: TileableType,
110
+ info: ODPSTableResultInfo,
111
+ ) -> None:
112
+ if isinstance(tileable, DATAFRAME_TYPE) and tileable.dtypes is None:
113
+ tb_comment = await self.to_thread(
114
+ self._get_table_comment, info.full_table_name
115
+ )
116
+ if tb_comment: # pragma: no branch
117
+ comment_data = json.loads(tb_comment)
118
+
119
+ table_meta: DataFrameTableMeta = deserialize_serializable(
120
+ base64.b64decode(comment_data["table_meta"])
121
+ )
122
+ tileable.refresh_from_table_meta(table_meta)
123
+
124
+ if tileable.shape and any(pd.isna(x) for x in tileable.shape):
125
+ part_specs = [None] if not info.partition_specs else info.partition_specs
126
+ tunnel = TableTunnel(self._odps_entry)
127
+ total_records = 0
128
+ for part_spec in part_specs:
129
+ session = tunnel.create_download_session(
130
+ info.full_table_name, part_spec
131
+ )
132
+ total_records += session.count
133
+ new_shape_list = list(tileable.shape)
134
+ new_shape_list[-1] = total_records
135
+ tileable.params = {"shape": tuple(new_shape_list)}
136
+
82
137
  def _read_single_source(
83
138
  self,
84
139
  table_meta: DataFrameTableMeta,
@@ -149,6 +204,13 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
149
204
  class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
150
205
  result_type = ResultType.ODPS_VOLUME
151
206
 
207
+ async def update_tileable_meta(
208
+ self,
209
+ tileable: TileableType,
210
+ info: ODPSVolumeResultInfo,
211
+ ) -> None:
212
+ return
213
+
152
214
  async def _read_parted_volume_data(
153
215
  self, volume: PartedVolume, partition: str, file_name: str
154
216
  ) -> bytes:
@@ -197,6 +259,6 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
197
259
  info: ODPSVolumeResultInfo,
198
260
  indexes: List[Union[Integral, slice]],
199
261
  ) -> Any:
200
- if isinstance(tileable, OBJECT_TYPE):
262
+ if isinstance(tileable, (OBJECT_TYPE, TENSOR_TYPE)):
201
263
  return await self._fetch_object(info)
202
264
  raise NotImplementedError(f"Fetching {type(tileable)} not implemented")
@@ -84,6 +84,9 @@ class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
84
84
  def decref(self, tileable_keys: List[str]) -> None:
85
85
  raise NotImplementedError
86
86
 
87
+ def get_logview_address(self, dag_id=None, hours=None) -> Optional[str]:
88
+ return None
89
+
87
90
 
88
91
  class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
89
92
  _odps_entry: Optional[ODPS]
@@ -115,7 +118,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
115
118
  ):
116
119
  super().__init__(address, session_id)
117
120
  self.timeout = timeout
118
- self._odps_entry = odps_entry or ODPS.from_environments()
121
+ self._odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
119
122
  self._tileable_to_infos = weakref.WeakKeyDictionary()
120
123
 
121
124
  self._caller = self._create_caller(odps_entry, address, **kwargs)
@@ -129,6 +132,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
129
132
  async def _init(self, _address: str):
130
133
  session_info = await self.ensure_async_call(self._caller.create_session)
131
134
  self._session_id = session_info.session_id
135
+ await self._show_logview_address()
132
136
 
133
137
  def _upload_and_get_read_tileable(self, t: TileableType) -> Optional[TileableType]:
134
138
  if (
@@ -142,20 +146,23 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
142
146
  if self._odps_entry.exist_table(table_meta.table_name):
143
147
  self._odps_entry.delete_table(table_meta.table_name)
144
148
  table_name = build_temp_table_name(self.session_id, t.key)
145
- table_obj = self._odps_entry.create_table(table_name, schema)
149
+ table_obj = self._odps_entry.create_table(
150
+ table_name, schema, lifecycle=options.session.temp_table_lifecycle
151
+ )
146
152
 
147
153
  data = t.op.get_data()
148
154
  batch_size = options.session.upload_batch_size
149
155
 
150
- halo_client = HaloTableIO(self._odps_entry)
151
- with halo_client.open_writer(table_obj.full_table_name) as writer:
152
- for batch_start in range(0, len(data), batch_size):
153
- if isinstance(data, pd.Index):
154
- batch = data[batch_start : batch_start + batch_size]
155
- else:
156
- batch = data.iloc[batch_start : batch_start + batch_size]
157
- arrow_batch, _ = pandas_to_arrow(batch)
158
- writer.write(arrow_batch)
156
+ if len(data):
157
+ halo_client = HaloTableIO(self._odps_entry)
158
+ with halo_client.open_writer(table_obj.full_table_name) as writer:
159
+ for batch_start in range(0, len(data), batch_size):
160
+ if isinstance(data, pd.Index):
161
+ batch = data[batch_start : batch_start + batch_size]
162
+ else:
163
+ batch = data.iloc[batch_start : batch_start + batch_size]
164
+ arrow_batch, _ = pandas_to_arrow(batch)
165
+ writer.write(arrow_batch)
159
166
 
160
167
  read_tileable = read_odps_table(
161
168
  table_obj.full_table_name,
@@ -238,6 +245,8 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
238
245
  self._caller.submit_dag, tileable_graph, replaced_infos
239
246
  )
240
247
 
248
+ await self._show_logview_address(dag_info.dag_id)
249
+
241
250
  progress = Progress()
242
251
  profiling = Profiling()
243
252
  aio_task = asyncio.create_task(
@@ -293,6 +302,8 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
293
302
 
294
303
  for key, result_info in dag_info.tileable_to_result_infos.items():
295
304
  t = key_to_tileables[key]
305
+ fetcher = get_fetcher_cls(result_info.result_type)(self._odps_entry)
306
+ await fetcher.update_tileable_meta(t, result_info)
296
307
  self._tileable_to_infos[t] = result_info
297
308
 
298
309
  def _get_data_tileable_and_indexes(
@@ -387,6 +398,25 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
387
398
  async def get_mutable_tensor(self, name: str):
388
399
  raise NotImplementedError
389
400
 
401
+ async def get_logview_address(self, hours=None) -> Optional[str]:
402
+ return await self.get_dag_logview_address(None, hours)
403
+
404
+ async def get_dag_logview_address(self, dag_id=None, hours=None) -> Optional[str]:
405
+ return await self.ensure_async_call(
406
+ self._caller.get_logview_address, dag_id, hours
407
+ )
408
+
409
+ async def _show_logview_address(self, dag_id=None, hours=None):
410
+ identity = f"Session ID: {self._session_id}"
411
+ if dag_id:
412
+ identity += f", DAG ID: {dag_id}"
413
+
414
+ logview_addr = await self.get_dag_logview_address(dag_id, hours)
415
+ if logview_addr:
416
+ logger.info("%s, Logview: %s", identity, logview_addr)
417
+ else:
418
+ logger.info("%s, Logview address does not exist", identity)
419
+
390
420
 
391
421
  class MaxFrameRestCaller(MaxFrameServiceCaller):
392
422
  _client: FrameDriverClient
@@ -21,9 +21,8 @@ from typing import Dict, List, Optional, Type, Union
21
21
  import msgpack
22
22
  from odps import ODPS
23
23
  from odps import options as odps_options
24
- from odps import serializers
25
24
  from odps.errors import parse_instance_error
26
- from odps.models import Instance, Task
25
+ from odps.models import Instance, MaxFrameTask
27
26
 
28
27
  from maxframe.config import options
29
28
  from maxframe.core import TileableGraph
@@ -55,55 +54,6 @@ from .odps import MaxFrameServiceCaller, MaxFrameSession
55
54
  logger = logging.getLogger(__name__)
56
55
 
57
56
 
58
- class MaxFrameTask(Task):
59
- __slots__ = ("_output_format", "_major_version", "_service_endpoint")
60
- _root = "MaxFrame"
61
- _anonymous_task_name = "AnonymousMaxFrameTask"
62
-
63
- command = serializers.XMLNodeField("Command", default="CREATE_SESSION")
64
-
65
- def __init__(self, **kwargs):
66
- kwargs["name"] = kwargs.get("name") or self._anonymous_task_name
67
- self._output_format = kwargs.pop(
68
- "output_format", MAXFRAME_OUTPUT_MSGPACK_FORMAT
69
- )
70
- self._major_version = kwargs.pop("major_version", None)
71
- self._service_endpoint = kwargs.pop("service_endpoint", None)
72
- super().__init__(**kwargs)
73
-
74
- def serial(self):
75
- if self.properties is None:
76
- self.properties = dict()
77
-
78
- if odps_options.default_task_settings:
79
- settings = odps_options.default_task_settings
80
- else:
81
- settings = dict()
82
-
83
- if self._major_version is not None:
84
- settings["odps.task.major.version"] = self._major_version
85
-
86
- if "settings" in self.properties:
87
- settings.update(json.loads(self.properties["settings"]))
88
-
89
- # merge sql options
90
- sql_settings = (odps_options.sql.settings or {}).copy()
91
- sql_settings.update(options.sql.settings or {})
92
-
93
- mf_settings = dict(options.to_dict(remote_only=True).items())
94
- mf_settings["sql.settings"] = sql_settings
95
- mf_opts = {
96
- "odps.maxframe.settings": json.dumps(mf_settings),
97
- "odps.maxframe.output_format": self._output_format,
98
- "odps.service.endpoint": self._service_endpoint,
99
- }
100
- if mf_version:
101
- mf_opts["odps.maxframe.client_version"] = mf_version
102
- settings.update(mf_opts)
103
- self.properties["settings"] = json.dumps(settings)
104
- return super().serial()
105
-
106
-
107
57
  class MaxFrameInstanceCaller(MaxFrameServiceCaller):
108
58
  _instance: Optional[Instance]
109
59
 
@@ -159,13 +109,31 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
159
109
  f"Serialization format {self._output_format} not supported"
160
110
  )
161
111
 
162
- def create_session(self) -> SessionInfo:
112
+ def _create_maxframe_task(self) -> MaxFrameTask:
163
113
  task = MaxFrameTask(
164
114
  name=self._task_name,
165
115
  major_version=self._major_version,
166
- output_format=self._output_format,
167
116
  service_endpoint=self._odps_entry.endpoint,
168
117
  )
118
+
119
+ # merge sql options
120
+ sql_settings = (odps_options.sql.settings or {}).copy()
121
+ sql_settings.update(options.sql.settings or {})
122
+
123
+ mf_settings = dict(options.to_dict(remote_only=True).items())
124
+ mf_settings["sql.settings"] = sql_settings
125
+
126
+ mf_opts = {
127
+ "odps.maxframe.settings": json.dumps(mf_settings),
128
+ "odps.maxframe.output_format": self._output_format,
129
+ }
130
+ if mf_version:
131
+ mf_opts["odps.maxframe.client_version"] = mf_version
132
+ task.update_settings(mf_opts)
133
+ return task
134
+
135
+ def create_session(self) -> SessionInfo:
136
+ task = self._create_maxframe_task()
169
137
  if not self._nested:
170
138
  self._task_name = task.name
171
139
  project = self._odps_entry.get_project(self._project)
@@ -278,6 +246,11 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
278
246
  self._task_name, MAXFRAME_TASK_DECREF_METHOD, json.dumps(req_data)
279
247
  )
280
248
 
249
+ def get_logview_address(self, dag_id=None, hours=None) -> Optional[str]:
250
+ hours = hours or options.session.logview_hours
251
+ subquery_suffix = f"&subQuery={dag_id}" if dag_id else ""
252
+ return self._instance.get_logview_address(hours) + subquery_suffix
253
+
281
254
 
282
255
  class MaxFrameTaskSession(MaxFrameSession):
283
256
  schemes = [ODPS_SESSION_INSECURE_SCHEME, ODPS_SESSION_SECURE_SCHEME]
@@ -28,6 +28,7 @@ from maxframe.lib.aio import stop_isolation
28
28
  from maxframe.protocol import ResultInfo
29
29
  from maxframe.serialization import RemoteException
30
30
  from maxframe.session import new_session
31
+ from maxframe.tests.utils import tn
31
32
  from maxframe.utils import build_temp_table_name
32
33
  from maxframe_framedriver.app.tests.test_framedriver_webapp import ( # noqa: F401
33
34
  framedriver_app,
@@ -98,9 +99,12 @@ def test_simple_run_dataframe(start_mock_session):
98
99
  corner_top, corner_bottom = ExecutableTuple([df.iloc[:10], df.iloc[-10:]]).fetch()
99
100
  assert len(corner_top) == len(corner_bottom) == 10
100
101
 
101
- # check ellipsis mark in DataFrame errors
102
+ # check ellipsis mark in DataFrame reprs
102
103
  df_str_repr = str(df)
103
104
  assert ".." in df_str_repr
105
+ # check ellipsis mark in Series reprs
106
+ series_str_repr = str(df.A.execute())
107
+ assert ".." in series_str_repr
104
108
 
105
109
  key = df.key
106
110
  assert odps_entry.exist_table(
@@ -115,6 +119,26 @@ def test_simple_run_dataframe(start_mock_session):
115
119
  assert not odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
116
120
 
117
121
 
122
+ def test_run_empty_table(start_mock_session):
123
+ odps_entry = ODPS.from_environments()
124
+
125
+ table_name = tn("test_session_empty_table")
126
+ odps_entry.delete_table(table_name, if_exists=True)
127
+ empty_table = odps_entry.create_table(
128
+ table_name, "_idx_0 bigint, a double, b double", lifecycle=1
129
+ )
130
+ df = md.read_odps_table(table_name, index_col="_idx_0")
131
+ df["d"] = df["a"] + df["b"]
132
+
133
+ executed = df.execute()
134
+ assert "Index: []" in str(executed)
135
+
136
+ fetched = executed.fetch()
137
+ assert 0 == len(fetched)
138
+
139
+ empty_table.drop()
140
+
141
+
118
142
  def test_run_dataframe_with_pd_source(start_mock_session):
119
143
  odps_entry = ODPS.from_environments()
120
144
 
@@ -205,3 +229,27 @@ def test_run_remote_error(start_mock_session):
205
229
 
206
230
  with pytest.raises((ValueError, RemoteException)):
207
231
  v.execute()
232
+
233
+
234
+ def test_pivot_dataframe(start_mock_session):
235
+ pd_df = pd.DataFrame(
236
+ {
237
+ "A": "foo foo foo foo foo bar bar bar bar".split(),
238
+ "B": "one one one two two one one two two".split(),
239
+ "C": "small large large small small large small small large".split(),
240
+ "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
241
+ "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
242
+ }
243
+ )
244
+ df = md.DataFrame(pd_df)
245
+ pivot = df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc="sum")
246
+ executed = pivot.execute()
247
+ assert pivot.shape == (2, 4)
248
+ pd.testing.assert_index_equal(
249
+ pivot.dtypes.index, pd.Index(["large", "small"], name="C")
250
+ )
251
+
252
+ expected = pd_df.pivot_table(
253
+ values="D", index=["A", "B"], columns=["C"], aggfunc="sum"
254
+ )
255
+ pd.testing.assert_frame_equal(executed.to_pandas(), expected)