maxframe 1.0.0rc3__cp39-cp39-macosx_10_9_universal2.whl → 1.0.0rc4__cp39-cp39-macosx_10_9_universal2.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (57)
  1. maxframe/_utils.cpython-39-darwin.so +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +13 -1
  4. maxframe/conftest.py +43 -12
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cpython-39-darwin.so +0 -0
  7. maxframe/dataframe/arithmetic/docstring.py +26 -2
  8. maxframe/dataframe/arithmetic/equal.py +4 -2
  9. maxframe/dataframe/arithmetic/greater.py +4 -2
  10. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  11. maxframe/dataframe/arithmetic/less.py +2 -2
  12. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  13. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  14. maxframe/dataframe/core.py +2 -0
  15. maxframe/dataframe/datasource/read_odps_query.py +66 -7
  16. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  17. maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
  18. maxframe/dataframe/datastore/to_odps.py +7 -0
  19. maxframe/dataframe/extensions/__init__.py +3 -0
  20. maxframe/dataframe/extensions/flatmap.py +326 -0
  21. maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
  22. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  23. maxframe/dataframe/indexing/rename.py +11 -0
  24. maxframe/dataframe/initializer.py +11 -1
  25. maxframe/dataframe/misc/drop_duplicates.py +18 -1
  26. maxframe/dataframe/tests/test_initializer.py +33 -2
  27. maxframe/io/odpsio/schema.py +5 -3
  28. maxframe/io/odpsio/tableio.py +44 -38
  29. maxframe/io/odpsio/tests/test_schema.py +0 -4
  30. maxframe/io/odpsio/volumeio.py +9 -3
  31. maxframe/learn/contrib/__init__.py +2 -1
  32. maxframe/learn/contrib/graph/__init__.py +15 -0
  33. maxframe/learn/contrib/graph/connected_components.py +215 -0
  34. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  35. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  36. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  37. maxframe/learn/contrib/xgboost/predict.py +8 -39
  38. maxframe/learn/contrib/xgboost/train.py +4 -3
  39. maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
  40. maxframe/opcodes.py +3 -0
  41. maxframe/protocol.py +6 -1
  42. maxframe/serialization/core.cpython-39-darwin.so +0 -0
  43. maxframe/session.py +9 -2
  44. maxframe/tensor/indexing/getitem.py +2 -0
  45. maxframe/tensor/merge/concatenate.py +23 -20
  46. maxframe/tensor/merge/vstack.py +5 -1
  47. maxframe/tensor/misc/transpose.py +1 -1
  48. maxframe/utils.py +34 -12
  49. {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/METADATA +1 -1
  50. {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +57 -52
  51. {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
  52. maxframe_client/fetcher.py +10 -8
  53. maxframe_client/session/consts.py +3 -0
  54. maxframe_client/session/odps.py +84 -13
  55. maxframe_client/session/task.py +58 -20
  56. maxframe_client/tests/test_session.py +14 -2
  57. {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
@@ -18,12 +18,13 @@ import logging
18
18
  import time
19
19
  import weakref
20
20
  from numbers import Integral
21
- from typing import Dict, List, Mapping, Optional, Tuple, Union
21
+ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
22
22
  from urllib.parse import urlparse
23
23
 
24
24
  import numpy as np
25
25
  import pandas as pd
26
26
  from odps import ODPS
27
+ from odps import options as odps_options
27
28
 
28
29
  from maxframe.config import options
29
30
  from maxframe.core import Entity, TileableGraph, build_fetch, enter_mode
@@ -65,6 +66,8 @@ from maxframe.utils import (
65
66
  ToThreadMixin,
66
67
  build_session_volume_name,
67
68
  build_temp_table_name,
69
+ str_to_bool,
70
+ sync_pyodps_options,
68
71
  )
69
72
 
70
73
  from ..clients.framedriver import FrameDriverClient
@@ -76,6 +79,43 @@ logger = logging.getLogger(__name__)
76
79
 
77
80
 
78
81
  class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
82
+ def get_settings_to_upload(self) -> Dict[str, Any]:
83
+ sql_settings = (odps_options.sql.settings or {}).copy()
84
+ sql_settings.update(options.sql.settings or {})
85
+
86
+ quota_name = options.session.quota_name or getattr(
87
+ odps_options, "quota_name", None
88
+ )
89
+ lifecycle = options.session.table_lifecycle or odps_options.lifecycle
90
+ temp_lifecycle = (
91
+ options.session.temp_table_lifecycle or odps_options.temp_lifecycle
92
+ )
93
+
94
+ enable_schema = options.session.enable_schema
95
+ default_schema = options.session.default_schema
96
+ if hasattr(self, "_odps_entry"):
97
+ default_schema = default_schema or self._odps_entry.schema
98
+
99
+ # use flags in sql settings
100
+ if sql_settings.get("odps.default.schema"):
101
+ default_schema = sql_settings["odps.default.schema"]
102
+ if str_to_bool(
103
+ sql_settings.get("odps.namespace.schema") or "false"
104
+ ) or str_to_bool(
105
+ sql_settings.get("odps.sql.allow.namespace.schema") or "false"
106
+ ):
107
+ enable_schema = True
108
+
109
+ mf_settings = dict(options.to_dict(remote_only=True).items())
110
+ mf_settings["sql.settings"] = sql_settings
111
+ mf_settings["session.table_lifecycle"] = lifecycle
112
+ mf_settings["session.temp_table_lifecycle"] = temp_lifecycle
113
+ mf_settings["session.quota_name"] = quota_name
114
+ if enable_schema is not None:
115
+ mf_settings["session.enable_schema"] = enable_schema
116
+ mf_settings["session.default_schema"] = default_schema or "default"
117
+ return mf_settings
118
+
79
119
  @abc.abstractmethod
80
120
  def create_session(self) -> SessionInfo:
81
121
  raise NotImplementedError
@@ -86,7 +126,10 @@ class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
86
126
 
87
127
  @abc.abstractmethod
88
128
  def submit_dag(
89
- self, dag: TileableGraph, managed_input_infos: Dict[str, ResultInfo]
129
+ self,
130
+ dag: TileableGraph,
131
+ managed_input_infos: Dict[str, ResultInfo],
132
+ new_settings: Dict[str, Any] = None,
90
133
  ) -> DagInfo:
91
134
  raise NotImplementedError
92
135
 
@@ -140,6 +183,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
140
183
  self._tileable_to_infos = weakref.WeakKeyDictionary()
141
184
 
142
185
  self._caller = self._create_caller(odps_entry, address, **kwargs)
186
+ self._last_settings = None
143
187
 
144
188
  @classmethod
145
189
  def _create_caller(
@@ -149,13 +193,14 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
149
193
 
150
194
  async def _init(self, _address: str):
151
195
  session_info = await self.ensure_async_call(self._caller.create_session)
196
+ self._last_settings = self._caller.get_settings_to_upload()
152
197
  self._session_id = session_info.session_id
153
198
  await self._show_logview_address()
154
199
 
155
200
  def _upload_and_get_table_read_tileable(
156
201
  self, t: TileableType
157
202
  ) -> Optional[TileableType]:
158
- schema, table_meta = pandas_to_odps_schema(t, unknown_as_string=True)
203
+ table_schema, table_meta = pandas_to_odps_schema(t, unknown_as_string=True)
159
204
  if self._odps_entry.exist_table(table_meta.table_name):
160
205
  self._odps_entry.delete_table(
161
206
  table_meta.table_name, hints=options.sql.settings
@@ -163,7 +208,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
163
208
  table_name = build_temp_table_name(self.session_id, t.key)
164
209
  table_obj = self._odps_entry.create_table(
165
210
  table_name,
166
- schema,
211
+ table_schema,
167
212
  lifecycle=options.session.temp_table_lifecycle,
168
213
  hints=options.sql.settings,
169
214
  )
@@ -217,10 +262,11 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
217
262
  or t.inputs
218
263
  ):
219
264
  return None
220
- if isinstance(t.op, PandasDataSourceOperator):
221
- return self._upload_and_get_table_read_tileable(t)
222
- else:
223
- return self._upload_and_get_vol_read_tileable(t)
265
+ with sync_pyodps_options():
266
+ if isinstance(t.op, PandasDataSourceOperator):
267
+ return self._upload_and_get_table_read_tileable(t)
268
+ else:
269
+ return self._upload_and_get_vol_read_tileable(t)
224
270
 
225
271
  @enter_mode(kernel=True, build=True)
226
272
  def _scan_and_replace_local_sources(
@@ -244,7 +290,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
244
290
 
245
291
  for succ in successors:
246
292
  graph.add_edge(replaced, succ)
247
- succ.inputs = [replacements.get(t, t) for t in succ.inputs]
293
+ succ.op._set_inputs([replacements.get(t, t) for t in succ.inputs])
248
294
 
249
295
  graph.results = [replacements.get(t, t) for t in graph.results]
250
296
  return replacements
@@ -269,6 +315,24 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
269
315
  infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
270
316
  return infos
271
317
 
318
+ def _get_diff_settings(self) -> Dict[str, Any]:
319
+ new_settings = self._caller.get_settings_to_upload()
320
+ if not self._last_settings: # pragma: no cover
321
+ self._last_settings = new_settings
322
+ return new_settings
323
+
324
+ update = dict()
325
+ for k in new_settings.keys():
326
+ old_item = self._last_settings.get(k)
327
+ new_item = new_settings.get(k)
328
+ try:
329
+ if old_item != new_item:
330
+ update[k] = new_item
331
+ except: # noqa: E722 # nosec # pylint: disable=bare-except
332
+ update[k] = new_item
333
+ self._last_settings = new_settings
334
+ return update
335
+
272
336
  async def execute(self, *tileables, **kwargs) -> ExecutionInfo:
273
337
  tileables = [
274
338
  tileable.data if isinstance(tileable, Entity) else tileable
@@ -288,7 +352,10 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
288
352
 
289
353
  replaced_infos = self._get_input_infos(list(source_replacements.values()))
290
354
  dag_info = await self.ensure_async_call(
291
- self._caller.submit_dag, tileable_graph, replaced_infos
355
+ self._caller.submit_dag,
356
+ tileable_graph,
357
+ replaced_infos,
358
+ self._get_diff_settings(),
292
359
  )
293
360
 
294
361
  await self._show_logview_address(dag_info.dag_id)
@@ -498,7 +565,8 @@ class MaxFrameRestCaller(MaxFrameServiceCaller):
498
565
  _client: FrameDriverClient
499
566
  _session_id: Optional[str]
500
567
 
501
- def __init__(self, client: FrameDriverClient):
568
+ def __init__(self, odps_entry: ODPS, client: FrameDriverClient):
569
+ self._odps_entry = odps_entry
502
570
  self._client = client
503
571
  self._session_id = None
504
572
 
@@ -511,7 +579,10 @@ class MaxFrameRestCaller(MaxFrameServiceCaller):
511
579
  await self._client.delete_session(self._session_id)
512
580
 
513
581
  async def submit_dag(
514
- self, dag: TileableGraph, managed_input_infos: Dict[str, ResultInfo]
582
+ self,
583
+ dag: TileableGraph,
584
+ managed_input_infos: Dict[str, ResultInfo] = None,
585
+ new_settings: Dict[str, Any] = None,
515
586
  ) -> DagInfo:
516
587
  return await self._client.submit_dag(self._session_id, dag, managed_input_infos)
517
588
 
@@ -551,7 +622,7 @@ class MaxFrameRestSession(MaxFrameSession):
551
622
 
552
623
  @classmethod
553
624
  def _create_caller(cls, odps_entry: ODPS, address: str, **kwargs):
554
- return MaxFrameRestCaller(FrameDriverClient(address))
625
+ return MaxFrameRestCaller(odps_entry, FrameDriverClient(address))
555
626
 
556
627
 
557
628
  def register_session_schemes(overwrite: bool = False):
@@ -16,7 +16,7 @@ import base64
16
16
  import json
17
17
  import logging
18
18
  import time
19
- from typing import Dict, List, Optional, Type, Union
19
+ from typing import Any, Dict, List, Optional, Type, Union
20
20
 
21
21
  import msgpack
22
22
  from odps import ODPS
@@ -24,6 +24,12 @@ from odps import options as odps_options
24
24
  from odps.errors import parse_instance_error
25
25
  from odps.models import Instance, MaxFrameTask
26
26
 
27
+ try:
28
+ from odps.errors import EmptyTaskInfoError
29
+ except ImportError: # pragma: no cover
30
+ # todo remove when pyodps>=0.12.0 is enforced
31
+ EmptyTaskInfoError = type("EmptyTaskInfoError", (Exception,), {})
32
+
27
33
  from maxframe.config import options
28
34
  from maxframe.core import TileableGraph
29
35
  from maxframe.errors import NoTaskServerResponseError, SessionAlreadyClosedError
@@ -36,6 +42,7 @@ except ImportError:
36
42
  mf_version = None
37
43
 
38
44
  from .consts import (
45
+ EMPTY_RESPONSE_RETRY_COUNT,
39
46
  MAXFRAME_DEFAULT_PROTOCOL,
40
47
  MAXFRAME_OUTPUT_JSON_FORMAT,
41
48
  MAXFRAME_OUTPUT_MAXFRAME_FORMAT,
@@ -92,6 +99,10 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
92
99
  self._nested = True
93
100
  self._instance = odps_entry.get_instance(nested_instance_id)
94
101
 
102
+ @property
103
+ def instance(self):
104
+ return self._instance
105
+
95
106
  def _deserial_task_info_result(
96
107
  self, content: Union[bytes, str, dict], target_cls: Type[JsonSerializable]
97
108
  ):
@@ -125,16 +136,8 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
125
136
  major_version=self._major_version,
126
137
  service_endpoint=self._odps_entry.endpoint,
127
138
  )
128
-
129
- # merge sql options
130
- sql_settings = (odps_options.sql.settings or {}).copy()
131
- sql_settings.update(options.sql.settings or {})
132
-
133
- mf_settings = dict(options.to_dict(remote_only=True).items())
134
- mf_settings["sql.settings"] = sql_settings
135
-
136
139
  mf_opts = {
137
- "odps.maxframe.settings": json.dumps(mf_settings),
140
+ "odps.maxframe.settings": json.dumps(self.get_settings_to_upload()),
138
141
  "odps.maxframe.output_format": self._output_format,
139
142
  }
140
143
  if mf_version:
@@ -189,18 +192,39 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
189
192
  interval = min(max_interval, interval * 2)
190
193
 
191
194
  def _put_task_info(self, method_name: str, json_data: dict):
192
- resp_data = self._instance.put_task_info(
193
- self._task_name, method_name, json.dumps(json_data)
194
- )
195
- if not resp_data:
196
- raise NoTaskServerResponseError(f"No response for request {method_name}")
197
- return resp_data
195
+ for trial in range(EMPTY_RESPONSE_RETRY_COUNT):
196
+ try:
197
+ return self._instance.put_task_info(
198
+ self._task_name,
199
+ method_name,
200
+ json.dumps(json_data),
201
+ raise_empty=True,
202
+ )
203
+ except TypeError: # pragma: no cover
204
+ # todo remove when pyodps>=0.12.0 is enforced
205
+ resp_data = self._instance.put_task_info(
206
+ self._task_name, method_name, json.dumps(json_data)
207
+ )
208
+ if resp_data:
209
+ return resp_data
210
+ else:
211
+ raise NoTaskServerResponseError(
212
+ f"No response for request {method_name}. "
213
+ f"Instance ID: {self._instance.id}"
214
+ )
215
+ except EmptyTaskInfoError as ex:
216
+ # retry when server returns HTTP 204, which is designed for retry
217
+ if ex.code != 204 or trial >= EMPTY_RESPONSE_RETRY_COUNT - 1:
218
+ raise NoTaskServerResponseError(
219
+ f"No response for request {method_name}. "
220
+ f"Instance ID: {self._instance.id}. "
221
+ f"Request ID: {ex.request_id}"
222
+ ) from None
223
+ time.sleep(0.5)
198
224
 
199
225
  def get_session(self) -> SessionInfo:
200
226
  req_data = {"output_format": self._output_format}
201
- serialized = self._instance.put_task_info(
202
- self._task_name, MAXFRAME_TASK_GET_SESSION_METHOD, json.dumps(req_data)
203
- )
227
+ serialized = self._put_task_info(MAXFRAME_TASK_GET_SESSION_METHOD, req_data)
204
228
  info: SessionInfo = self._deserial_task_info_result(serialized, SessionInfo)
205
229
  info.session_id = self._instance.id
206
230
  return info
@@ -217,13 +241,18 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
217
241
  self,
218
242
  dag: TileableGraph,
219
243
  managed_input_infos: Optional[Dict[str, ResultInfo]] = None,
244
+ new_settings: Dict[str, Any] = None,
220
245
  ) -> DagInfo:
246
+ new_settings_value = {
247
+ "odps.maxframe.settings": json.dumps(new_settings),
248
+ }
221
249
  req_data = {
222
250
  "protocol": MAXFRAME_DEFAULT_PROTOCOL,
223
251
  "dag": base64.b64encode(serialize_serializable(dag)).decode(),
224
252
  "managed_input_infos": base64.b64encode(
225
253
  serialize_serializable(managed_input_infos)
226
254
  ).decode(),
255
+ "new_settings": json.dumps(new_settings_value),
227
256
  "output_format": self._output_format,
228
257
  }
229
258
  res = self._put_task_info(MAXFRAME_TASK_SUBMIT_DAG_METHOD, req_data)
@@ -276,7 +305,7 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
276
305
  class MaxFrameTaskSession(MaxFrameSession):
277
306
  schemes = [ODPS_SESSION_INSECURE_SCHEME, ODPS_SESSION_SECURE_SCHEME]
278
307
 
279
- _instance: Instance
308
+ _caller: MaxFrameInstanceCaller
280
309
 
281
310
  @classmethod
282
311
  def _create_caller(
@@ -296,6 +325,15 @@ class MaxFrameTaskSession(MaxFrameSession):
296
325
  **kwargs,
297
326
  )
298
327
 
328
+ @property
329
+ def closed(self) -> bool:
330
+ if super().closed:
331
+ return True
332
+ if not self._caller or not self._caller.instance:
333
+ # session not initialized yet
334
+ return False
335
+ return self._caller.instance.is_terminated()
336
+
299
337
 
300
338
  def register_session_schemes(overwrite: bool = False):
301
339
  MaxFrameTaskSession.register_schemes(overwrite=overwrite)
@@ -137,6 +137,15 @@ def test_simple_run_dataframe(start_mock_session):
137
137
  assert not odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
138
138
 
139
139
 
140
+ def test_run_and_fetch_slice(start_mock_session):
141
+ pd_df = pd.DataFrame(np.random.rand(1000, 5), columns=list("ABCDE"))
142
+ df = md.DataFrame(pd_df)
143
+ result = df.execute()
144
+
145
+ sliced = result.head(10).fetch()
146
+ assert len(sliced) == 10
147
+
148
+
140
149
  def test_run_empty_table(start_mock_session):
141
150
  odps_entry = ODPS.from_environments()
142
151
 
@@ -189,7 +198,7 @@ def test_run_dataframe_from_to_odps_table(start_mock_session):
189
198
  table_name = build_temp_table_name(start_mock_session, "tmp_save")
190
199
  table_obj = odps_entry.get_table(table_name)
191
200
  try:
192
- md.to_odps_table(md.DataFrame(pd_df), table_obj).execute().fetch()
201
+ md.to_odps_table(md.DataFrame(pd_df), table_obj, lifecycle=1).execute().fetch()
193
202
  with table_obj.open_reader() as reader:
194
203
  result_df = reader.to_pandas()
195
204
  assert len(result_df) == 10
@@ -256,7 +265,10 @@ def test_execute_with_tensor(oss_config, start_mock_session):
256
265
 
257
266
  result = (df - [1, 2]).execute().fetch()
258
267
  expected = pd_df - [1, 2]
259
- pd.testing.assert_frame_equal(result, expected)
268
+ # TODO: currently the record order in tensor reading from table is the index
269
+ # sorting order
270
+ expected.sort_index(axis=0, inplace=True)
271
+ pd.testing.assert_frame_equal(result, expected, check_like=True)
260
272
 
261
273
 
262
274
  def test_run_remote_success(oss_config, start_mock_session):