maxframe 1.0.0rc4__cp310-cp310-win32.whl → 1.1.1__cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (88)
  1. maxframe/_utils.cp310-win32.pyd +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cp310-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cp310-win32.pyd +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cp310-win32.pyd +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
@@ -14,6 +14,7 @@
14
14
 
15
15
  import abc
16
16
  import asyncio
17
+ import copy
17
18
  import logging
18
19
  import time
19
20
  import weakref
@@ -25,6 +26,7 @@ import numpy as np
25
26
  import pandas as pd
26
27
  from odps import ODPS
27
28
  from odps import options as odps_options
29
+ from odps.console import in_ipython_frontend
28
30
 
29
31
  from maxframe.config import options
30
32
  from maxframe.core import Entity, TileableGraph, build_fetch, enter_mode
@@ -82,10 +84,21 @@ class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
82
84
  def get_settings_to_upload(self) -> Dict[str, Any]:
83
85
  sql_settings = (odps_options.sql.settings or {}).copy()
84
86
  sql_settings.update(options.sql.settings or {})
85
-
86
87
  quota_name = options.session.quota_name or getattr(
87
88
  odps_options, "quota_name", None
88
89
  )
90
+ quota_settings = {
91
+ sql_settings.get("odps.task.wlm.quota", None),
92
+ options.spe.task.settings.get("odps.task.wlm.quota", None),
93
+ options.pythonpack.task.settings.get("odps.task.wlm.quota", None),
94
+ quota_name,
95
+ }.difference([None])
96
+ if len(quota_settings) >= 2:
97
+ raise ValueError(
98
+ "Quota settings are conflicting: %s" % ", ".join(sorted(quota_settings))
99
+ )
100
+ elif len(quota_settings) == 1:
101
+ quota_name = quota_settings.pop()
89
102
  lifecycle = options.session.table_lifecycle or odps_options.lifecycle
90
103
  temp_lifecycle = (
91
104
  options.session.temp_table_lifecycle or odps_options.temp_lifecycle
@@ -113,6 +126,8 @@ class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
113
126
  mf_settings["session.quota_name"] = quota_name
114
127
  if enable_schema is not None:
115
128
  mf_settings["session.enable_schema"] = enable_schema
129
+ if options.session.enable_high_availability is None:
130
+ mf_settings["session.enable_high_availability"] = not in_ipython_frontend()
116
131
  mf_settings["session.default_schema"] = default_schema or "default"
117
132
  return mf_settings
118
133
 
@@ -184,6 +199,8 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
184
199
 
185
200
  self._caller = self._create_caller(odps_entry, address, **kwargs)
186
201
  self._last_settings = None
202
+ self._pull_interval = 1 if in_ipython_frontend() else 3
203
+ self._replace_internal_host = kwargs.get("replace_internal_host", True)
187
204
 
188
205
  @classmethod
189
206
  def _create_caller(
@@ -193,7 +210,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
193
210
 
194
211
  async def _init(self, _address: str):
195
212
  session_info = await self.ensure_async_call(self._caller.create_session)
196
- self._last_settings = self._caller.get_settings_to_upload()
213
+ self._last_settings = copy.deepcopy(self._caller.get_settings_to_upload())
197
214
  self._session_id = session_info.session_id
198
215
  await self._show_logview_address()
199
216
 
@@ -250,7 +267,12 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
250
267
  self, t: TileableType
251
268
  ) -> Optional[TileableType]:
252
269
  vol_name = build_session_volume_name(self.session_id)
253
- writer = ODPSVolumeWriter(self._odps_entry, vol_name, t.key)
270
+ writer = ODPSVolumeWriter(
271
+ self._odps_entry,
272
+ vol_name,
273
+ t.key,
274
+ replace_internal_host=self._replace_internal_host,
275
+ )
254
276
  io_handler = get_object_io_handler(t)
255
277
  io_handler().write_object(writer, t, t.op.data)
256
278
  return build_fetch(t).data
@@ -318,9 +340,14 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
318
340
  def _get_diff_settings(self) -> Dict[str, Any]:
319
341
  new_settings = self._caller.get_settings_to_upload()
320
342
  if not self._last_settings: # pragma: no cover
321
- self._last_settings = new_settings
343
+ self._last_settings = copy.deepcopy(new_settings)
322
344
  return new_settings
323
345
 
346
+ if self._last_settings.get("session.quota_name", None) != new_settings.get(
347
+ "session.quota_name", None
348
+ ):
349
+ raise ValueError("Quota name cannot be changed after sessions are created")
350
+
324
351
  update = dict()
325
352
  for k in new_settings.keys():
326
353
  old_item = self._last_settings.get(k)
@@ -330,7 +357,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
330
357
  update[k] = new_item
331
358
  except: # noqa: E722 # nosec # pylint: disable=bare-except
332
359
  update[k] = new_item
333
- self._last_settings = new_settings
360
+ self._last_settings = copy.deepcopy(new_settings)
334
361
  return update
335
362
 
336
363
  async def execute(self, *tileables, **kwargs) -> ExecutionInfo:
@@ -379,18 +406,18 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
379
406
  start_time = time.time()
380
407
  session_id = dag_info.session_id
381
408
  dag_id = dag_info.dag_id
382
- wait_timeout = 10
383
409
  server_no_response_time = None
384
410
  with enter_mode(build=True, kernel=True):
385
411
  key_to_tileables = {t.key: t for t in tileables}
386
-
412
+ timeout_val = 0.1
387
413
  try:
388
414
  while True:
389
415
  elapsed_time = time.time() - start_time
416
+ next_timeout_val = min(timeout_val * 2, self._pull_interval)
390
417
  timeout_val = (
391
- min(self.timeout - elapsed_time, wait_timeout)
418
+ min(self.timeout - elapsed_time, next_timeout_val)
392
419
  if self.timeout
393
- else wait_timeout
420
+ else next_timeout_val
394
421
  )
395
422
  if timeout_val <= 0:
396
423
  raise TimeoutError("Running DAG timed out")
@@ -584,7 +611,9 @@ class MaxFrameRestCaller(MaxFrameServiceCaller):
584
611
  managed_input_infos: Dict[str, ResultInfo] = None,
585
612
  new_settings: Dict[str, Any] = None,
586
613
  ) -> DagInfo:
587
- return await self._client.submit_dag(self._session_id, dag, managed_input_infos)
614
+ return await self._client.submit_dag(
615
+ self._session_id, dag, managed_input_infos, new_settings=new_settings
616
+ )
588
617
 
589
618
  async def get_dag_info(self, dag_id: str) -> DagInfo:
590
619
  return await self._client.get_dag_info(self._session_id, dag_id)
@@ -617,7 +646,7 @@ class MaxFrameRestSession(MaxFrameSession):
617
646
  real_endpoint = address.replace(f"{parsed_endpoint.scheme}://", f"{scheme}://")
618
647
 
619
648
  super().__init__(
620
- real_endpoint, session_id, odps_entry=odps_entry, timeout=timeout
649
+ real_endpoint, session_id, odps_entry=odps_entry, timeout=timeout, **kwargs
621
650
  )
622
651
 
623
652
  @classmethod
@@ -21,15 +21,9 @@ from typing import Any, Dict, List, Optional, Type, Union
21
21
  import msgpack
22
22
  from odps import ODPS
23
23
  from odps import options as odps_options
24
- from odps.errors import parse_instance_error
24
+ from odps.errors import EmptyTaskInfoError, parse_instance_error
25
25
  from odps.models import Instance, MaxFrameTask
26
26
 
27
- try:
28
- from odps.errors import EmptyTaskInfoError
29
- except ImportError: # pragma: no cover
30
- # todo remove when pyodps>=0.12.0 is enforced
31
- EmptyTaskInfoError = type("EmptyTaskInfoError", (Exception,), {})
32
-
33
27
  from maxframe.config import options
34
28
  from maxframe.core import TileableGraph
35
29
  from maxframe.errors import NoTaskServerResponseError, SessionAlreadyClosedError
@@ -131,15 +125,14 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
131
125
  )
132
126
 
133
127
  def _create_maxframe_task(self) -> MaxFrameTask:
134
- task = MaxFrameTask(
135
- name=self._task_name,
136
- major_version=self._major_version,
137
- service_endpoint=self._odps_entry.endpoint,
138
- )
128
+ task = MaxFrameTask(name=self._task_name, major_version=self._major_version)
129
+ mf_settings = self.get_settings_to_upload()
139
130
  mf_opts = {
140
- "odps.maxframe.settings": json.dumps(self.get_settings_to_upload()),
131
+ "odps.maxframe.settings": json.dumps(mf_settings),
141
132
  "odps.maxframe.output_format": self._output_format,
142
133
  }
134
+ if mf_settings.get("session.quota_name", None):
135
+ mf_opts["odps.task.wlm.quota"] = mf_settings["session.quota_name"]
143
136
  if mf_version:
144
137
  mf_opts["odps.maxframe.client_version"] = mf_version
145
138
  task.update_settings(mf_opts)
@@ -200,18 +193,6 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
200
193
  json.dumps(json_data),
201
194
  raise_empty=True,
202
195
  )
203
- except TypeError: # pragma: no cover
204
- # todo remove when pyodps>=0.12.0 is enforced
205
- resp_data = self._instance.put_task_info(
206
- self._task_name, method_name, json.dumps(json_data)
207
- )
208
- if resp_data:
209
- return resp_data
210
- else:
211
- raise NoTaskServerResponseError(
212
- f"No response for request {method_name}. "
213
- f"Instance ID: {self._instance.id}"
214
- )
215
196
  except EmptyTaskInfoError as ex:
216
197
  # retry when server returns HTTP 204, which is designed for retry
217
198
  if ex.code != 204 or trial >= EMPTY_RESPONSE_RETRY_COUNT - 1:
@@ -11,17 +11,20 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
15
14
  import json
16
15
  import os
17
16
 
18
17
  import mock
18
+ import pytest
19
19
  from defusedxml import ElementTree
20
20
  from odps import ODPS
21
21
  from odps import options as odps_options
22
22
 
23
+ from maxframe import options
24
+ from maxframe.config import option_context
25
+
23
26
  from ...session.consts import MAXFRAME_OUTPUT_JSON_FORMAT
24
- from ...session.task import MaxFrameInstanceCaller, MaxFrameTask
27
+ from ...session.task import MaxFrameInstanceCaller, MaxFrameTask, MaxFrameTaskSession
25
28
 
26
29
  expected_file_dir = os.path.join(os.path.dirname(__file__), "expected-data")
27
30
 
@@ -53,10 +56,6 @@ def test_maxframe_instance_caller_creating_session():
53
56
  assert property_node.find("Name").text == "settings"
54
57
  setting_dict = json.loads(property_node.find("Value").text)
55
58
  assert setting_dict["odps.task.major.version"] == "test_version"
56
- assert (
57
- setting_dict["odps.service.endpoint"]
58
- == "http://100.69.248.78:8002/odps_dailyrunnew"
59
- )
60
59
 
61
60
  assert setting_dict["odps.maxframe.output_format"] == "json"
62
61
  maxframe_setting_dict = json.loads(setting_dict["odps.maxframe.settings"])
@@ -83,3 +82,33 @@ def test_maxframe_instance_caller_creating_session():
83
82
  finally:
84
83
  odps_options.priority = old_priority
85
84
  odps_options.get_priority = old_get_priority
85
+
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_session_quota_flag_valid():
89
+ def mock_create(self, task: MaxFrameTask, **kwargs):
90
+ assert task.properties["settings"]
91
+ task_settings = json.loads(task.properties["settings"])
92
+ assert task_settings["odps.task.wlm.quota"] == "session_quota"
93
+
94
+ with mock.patch.multiple(
95
+ target="maxframe_client.session.task.MaxFrameInstanceCaller",
96
+ _wait_instance_task_ready=mock.DEFAULT,
97
+ get_session=mock.DEFAULT,
98
+ get_logview_address=mock.DEFAULT,
99
+ ), mock.patch("odps.models.instances.BaseInstances.create", mock_create):
100
+ with option_context({"session.quota_name": "session_quota"}):
101
+ with pytest.raises(ValueError):
102
+ options.sql.settings["odps.task.wlm.quota"] = "session_quota2"
103
+ await MaxFrameTaskSession.init(
104
+ address="test", odps_entry=ODPS.from_environments()
105
+ )
106
+ options.sql.settings["odps.task.wlm.quota"] = "session_quota"
107
+ mf_task_session = await MaxFrameTaskSession.init(
108
+ address="test", odps_entry=ODPS.from_environments()
109
+ )
110
+ with pytest.raises(ValueError):
111
+ options.sql.settings["odps.task.wlm.quota"] = "session_quota2"
112
+ mf_task_session._get_diff_settings()
113
+ options.sql.settings["odps.task.wlm.quota"] = "session_quota"
114
+ mf_task_session._get_diff_settings()
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import time
16
- from typing import Dict
16
+ from typing import Any, Dict
17
17
 
18
18
  import mock
19
19
  import numpy as np
@@ -31,7 +31,7 @@ from maxframe.lib.aio import stop_isolation
31
31
  from maxframe.protocol import ResultInfo
32
32
  from maxframe.serialization import RemoteException
33
33
  from maxframe.session import new_session
34
- from maxframe.tests.utils import tn
34
+ from maxframe.tests.utils import ensure_table_deleted, tn
35
35
  from maxframe.utils import build_temp_table_name
36
36
  from maxframe_framedriver.app.tests.test_framedriver_webapp import ( # noqa: F401
37
37
  framedriver_app,
@@ -86,9 +86,12 @@ def test_simple_run_dataframe(start_mock_session):
86
86
  session_id: str,
87
87
  dag: TileableGraph,
88
88
  managed_input_infos: Dict[str, ResultInfo] = None,
89
+ new_settings: Dict[str, Any] = None,
89
90
  ):
90
91
  assert len(dag) == 2
91
- return await original_submit_dag(self, session_id, dag, managed_input_infos)
92
+ return await original_submit_dag(
93
+ self, session_id, dag, managed_input_infos, new_settings
94
+ )
92
95
 
93
96
  no_task_server_raised = False
94
97
  original_get_dag_info = MaxFrameRestCaller.get_dag_info
@@ -130,11 +133,10 @@ def test_simple_run_dataframe(start_mock_session):
130
133
  )
131
134
  assert odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
132
135
  del df
133
- time.sleep(5)
134
- assert not odps_entry.exist_table(
135
- build_temp_table_name(start_mock_session, intermediate_key)
136
+ ensure_table_deleted(
137
+ odps_entry, build_temp_table_name(start_mock_session, intermediate_key)
136
138
  )
137
- assert not odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
139
+ ensure_table_deleted(odps_entry, build_temp_table_name(start_mock_session, key))
138
140
 
139
141
 
140
142
  def test_run_and_fetch_slice(start_mock_session):
@@ -166,6 +168,25 @@ def test_run_empty_table(start_mock_session):
166
168
  empty_table.drop()
167
169
 
168
170
 
171
+ def test_run_odps_query_without_schema(start_mock_session):
172
+ odps_entry = ODPS.from_environments()
173
+
174
+ table_name = tn("test_query_without_schema")
175
+ odps_entry.delete_table(table_name, if_exists=True)
176
+ test_table = odps_entry.create_table(table_name, "a double, b double", lifecycle=1)
177
+
178
+ with test_table.open_writer() as writer:
179
+ writer.write([123, 456])
180
+
181
+ df = md.read_odps_query(
182
+ f"select a, b, a + b as `special: name` from {table_name}", skip_schema=True
183
+ )
184
+ executed = df.execute().fetch()
185
+ assert len(executed.dtypes) == 3
186
+
187
+ test_table.drop()
188
+
189
+
169
190
  def test_run_dataframe_with_pd_source(start_mock_session):
170
191
  odps_entry = ODPS.from_environments()
171
192
 
@@ -246,9 +267,8 @@ def test_run_and_fetch_series(start_mock_session):
246
267
  pd.testing.assert_series_equal(pd_result, result)
247
268
 
248
269
  del s1
249
- time.sleep(5)
250
- assert not odps_entry.exist_table(
251
- build_temp_table_name(start_mock_session, src_key)
270
+ ensure_table_deleted(
271
+ odps_entry, build_temp_table_name(start_mock_session, src_key)
252
272
  )
253
273
  finally:
254
274
  odps_entry.delete_table(