maxframe 2.0.0b1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 2.0.0b2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- maxframe/dataframe/datasource/read_odps_query.py +76 -16
- maxframe/dataframe/datasource/tests/test_datasource.py +47 -1
- maxframe/io/objects/tests/test_object_io.py +4 -2
- maxframe/io/odpsio/tests/test_volumeio.py +4 -15
- maxframe/io/odpsio/volumeio.py +23 -8
- maxframe/learn/contrib/xgboost/core.py +1 -0
- maxframe/tests/utils.py +5 -13
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/METADATA +1 -1
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/RECORD +882 -882
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/top_level.txt +0 -0
maxframe/dataframe/datasource/read_odps_query.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import dataclasses
+import functools
 import io
 import logging
 import re
@@ -22,6 +23,8 @@ from typing import Dict, List, MutableMapping, Optional, Tuple, Union
 import numpy as np
 import pandas as pd
 from odps import ODPS
+from odps.errors import ODPSError
+from odps.models import TableSchema
 from odps.types import Column, OdpsSchema, validate_data_type
 from odps.utils import split_sql_by_semicolon
 
@@ -245,13 +248,18 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
     return _parse_full_explain(explain_string)
 
 
-def _build_explain_sql(sql_stmt: str, no_split: bool = False) -> str:
+def _build_explain_sql(
+    sql_stmt: str, no_split: bool = False, use_output: bool = False
+) -> str:
+    clause = "EXPLAIN "
+    if use_output:
+        clause += "OUTPUT "
     if no_split:
-        return "EXPLAIN " + sql_stmt
+        return clause + sql_stmt
     sql_parts = split_sql_by_semicolon(sql_stmt)
     if not sql_parts:
         raise ValueError(f"Cannot explain SQL statement {sql_stmt}")
-    sql_parts[-1] = "EXPLAIN " + sql_parts[-1]
+    sql_parts[-1] = clause + sql_parts[-1]
     return "\n".join(sql_parts)
 
 
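Taken together, _build_explain_sql now prepends either "EXPLAIN " or "EXPLAIN OUTPUT "
to the last statement of the script. A minimal sketch of the rewriting (illustrative,
not part of the patch):

    _build_explain_sql("SELECT a FROM src")                   # -> "EXPLAIN SELECT a FROM src"
    _build_explain_sql("SELECT a FROM src", use_output=True)  # -> "EXPLAIN OUTPUT SELECT a FROM src"
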
@@ -332,6 +340,62 @@ def _check_token_in_sql(token: str, sql: str) -> bool:
     return False
 
 
+def _resolve_schema_by_explain(
+    odps_entry: ODPS,
+    query: str,
+    no_split_sql: bool = False,
+    hints: Dict[str, str] = None,
+    use_explain_output: bool = True,
+) -> OdpsSchema:
+    hints = (hints or dict()).copy()
+    hints["odps.sql.select.output.format"] = "json"
+    explain_stmt = _build_explain_sql(
+        query, no_split=no_split_sql, use_output=use_explain_output
+    )
+    inst = odps_entry.execute_sql(explain_stmt, hints=hints)
+    logger.debug("Explain output instance ID: %s", inst.id)
+    explain_str = list(inst.get_task_results().values())[0]
+    if use_explain_output:
+        if not explain_str or "nothing to explain" in explain_str:
+            raise ValueError("The SQL statement should be an instant query")
+        return TableSchema.parse(None, explain_str)
+    else:
+        return _parse_explained_schema(explain_str)
+
+
+def _resolve_query_schema(
+    odps_entry: ODPS,
+    query: str,
+    no_split_sql: bool = False,
+    hints: Dict[str, str] = None,
+    use_explain_output: Optional[bool] = None,
+) -> OdpsSchema:
+    methods = []
+    if use_explain_output is not False:
+        # None or True
+        methods.append(_resolve_schema_by_explain)
+    if use_explain_output is not True:
+        # None or False
+        methods.append(
+            functools.partial(_resolve_schema_by_explain, use_explain_output=False)
+        )
+    for idx, resolve_method in enumerate(methods):
+        try:
+            return resolve_method(
+                odps_entry, query, no_split_sql=no_split_sql, hints=hints
+            )
+        except ODPSError as ex:
+            msg = (
+                f"Failed to obtain schema from SQL explain: {ex!r}\n"
+                f"Explain instance ID: {ex.instance_id}"
+            )
+            if idx + 1 == len(methods) or "ODPS-0130161" not in str(ex):
+                exc = ValueError(msg)
+                raise exc.with_traceback(ex.__traceback__) from None
+    # will this happen?
+    raise ValueError("Failed to obtain schema from SQL explain")  # pragma: no cover
+
+
 def read_odps_query(
     query: str,
     odps_entry: ODPS = None,
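The new resolver tries EXPLAIN OUTPUT first and retries with a plain EXPLAIN only when
the service rejects the statement with ODPS-0130161 (a SQL parse error); any other
failure, or a failure of the last method, is re-raised as a ValueError carrying the
explain instance ID. A hedged usage sketch, assuming o is a configured ODPS entry:

    schema = _resolve_query_schema(o, "SELECT 1 AS a")                            # try both in order
    schema = _resolve_query_schema(o, "SELECT 1 AS a", use_explain_output=True)   # EXPLAIN OUTPUT only
    schema = _resolve_query_schema(o, "SELECT 1 AS a", use_explain_output=False)  # plain EXPLAIN only
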
@@ -371,6 +435,8 @@ def read_odps_query(
     DataFrame read from MaxCompute (ODPS) table
     """
     no_split_sql = kw.pop("no_split_sql", False)
+    # if use_explain_output is None, will try two methods.
+    use_explain_output = kw.pop("use_explain_output", None)
 
     hints = options.sql.settings.copy() or {}
     if sql_hints:
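The flag travels through **kw, so callers opt in without a signature change. A usage
sketch, assuming credentials come from environment variables and that read_odps_query
is reachable through the usual maxframe.dataframe alias:

    import maxframe.dataframe as md
    from odps import ODPS

    o = ODPS.from_environments()
    df = md.read_odps_query(
        "SELECT a, b FROM src_table",
        odps_entry=o,
        use_explain_output=None,  # default: EXPLAIN OUTPUT, then fall back to EXPLAIN
    )
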
@@ -395,19 +461,13 @@
 
     col_renames = {}
     if not skip_schema:
-
-
-
-
-
-
-
-        except BaseException as ex:
-            exc = ValueError(
-                f"Failed to obtain schema from SQL explain: {ex!r}"
-                f"\nExplain instance ID: {inst.id}"
-            )
-            raise exc.with_traceback(ex.__traceback__) from None
+        odps_schema = _resolve_query_schema(
+            odps_entry,
+            query,
+            no_split_sql=no_split_sql,
+            hints=hints,
+            use_explain_output=use_explain_output,
+        )
 
     new_columns = []
     for col in odps_schema.columns:
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED

@@ -17,11 +17,13 @@ import uuid
 from collections import OrderedDict
 from math import isinf
 
+import mock
 import numpy as np
 import pandas as pd
 import pytest
 from odps import ODPS
 from odps import types as odps_types
+from odps.errors import ODPSError
 
 from .... import tensor as mt
 from ....core import OutputType
@@ -50,6 +52,7 @@ from ..read_odps_query import (
     ColumnSchema,
     _parse_full_explain,
     _parse_simple_explain,
+    _resolve_query_schema,
     _resolve_task_sector,
 )
 from ..series import from_pandas as from_pandas_series
@@ -360,7 +363,7 @@ def test_from_odps_query():
 
     with pytest.raises(ValueError) as err_info:
         read_odps_query(
-            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} LIFECYCLE 1 "
             f"AS SELECT * FROM {table1_name}"
         )
     assert "instant query" in err_info.value.args[0]
@@ -578,3 +581,46 @@ def test_resolve_break_lines():
     for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
         assert col.name == exp_nm
         assert col.type == odps_types.validate_data_type(exp_tp)
+
+
+@pytest.mark.parametrize("use_explain_output", [None, False, True])
+def test_explain_use_explain_output(use_explain_output):
+    class MockInstance:
+        @property
+        def id(self):
+            return "mock_id"
+
+        def get_task_results(self):
+            return {"pot": """{"columns":[{"name":"a_bigint","type":"BIGINT"}]}"""}
+
+    old_execute_sql = ODPS.execute_sql
+    exec_count = 0
+
+    def new_execute_sql(self, sql, *args, **kw):
+        nonlocal exec_count
+        exec_count += 1
+
+        if use_explain_output and sql.lower().startswith("explain output select"):
+            return MockInstance()
+        elif use_explain_output is None and sql.lower().startswith("explain output"):
+            raise ODPSError("ODPS-0130161: mock error")
+        return old_execute_sql(self, sql, *args, **kw)
+
+    odps_entry = ODPS.from_environments()
+
+    with mock.patch("odps.core.ODPS.execute_sql", new=new_execute_sql):
+        with pytest.raises(ValueError):
+            _resolve_query_schema(
+                odps_entry, "not_a_sql", use_explain_output=use_explain_output
+            )
+        assert exec_count == (2 if use_explain_output is None else 1)
+
+        exec_count = 0
+        schema = _resolve_query_schema(
+            odps_entry,
+            "select cast(1 as bigint) as a_bigint",
+            use_explain_output=use_explain_output,
+        )
+        assert schema.columns[0].name == "a_bigint"
+        assert schema.columns[0].type == odps_types.bigint
+        assert exec_count == (2 if use_explain_output is None else 1)
maxframe/io/objects/tests/test_object_io.py
CHANGED

@@ -19,7 +19,7 @@ from odps import ODPS
 from ....core import OutputType
 from ....core.operator import ObjectOperatorMixin, Operator
 from ....tensor.datasource import ArrayDataSource
-from ....tests.utils import create_test_volume, tn
+from ....tests.utils import create_test_volume, get_test_unique_name, tn
 from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
 from ..core import get_object_io_handler
 
@@ -32,7 +32,9 @@ class TestObjectOp(Operator, ObjectOperatorMixin):
 
 @pytest.fixture(scope="module")
 def create_volume(oss_config):
-    with create_test_volume(
+    with create_test_volume(
+        tn("test_object_io_vol_" + get_test_unique_name(5)), oss_config
+    ) as test_vol_name:
         yield test_vol_name
 
 
maxframe/io/odpsio/tests/test_volumeio.py
CHANGED

@@ -17,13 +17,13 @@ import contextlib
 import pytest
 from odps import ODPS
 
-from ....tests.utils import create_test_volume, tn
+from ....tests.utils import create_test_volume, get_test_unique_name, tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
 
 
 @pytest.fixture
 def create_volume(request, oss_config):
-    test_vol_name = tn("test_vol_name_" + request.param)
+    test_vol_name = tn(f"test_vol_name_{get_test_unique_name(5)}_" + request.param)
     odps_entry = ODPS.from_environments()
 
     @contextlib.contextmanager
@@ -41,24 +41,13 @@ def create_volume(request, oss_config):
     except BaseException:
         pass
 
-    oss_test_dir_name = None
     if request.param == "parted":
         ctx = create_parted_volume()
     else:
         ctx = create_test_volume(test_vol_name, oss_config)
 
-
-
-        yield test_vol_name
-    finally:
-        if oss_test_dir_name is not None:
-            import oss2
-
-            keys = [
-                obj.key
-                for obj in oss2.ObjectIterator(oss_config.oss_bucket, oss_test_dir_name)
-            ]
-            oss_config.oss_bucket.batch_delete_objects(keys)
+    with ctx:
+        yield test_vol_name
 
 
 @pytest.mark.parametrize("create_volume", ["external"], indirect=True)
maxframe/io/odpsio/volumeio.py
CHANGED
@@ -14,7 +14,9 @@
 
 import inspect
 from typing import Iterator, List, Optional, Union
+from urllib.parse import urlparse
 
+import requests
 from odps import ODPS
 from odps import __version__ as pyodps_version
 
@@ -74,14 +76,27 @@ class ODPSVolumeWriter:
         self._replace_internal_host = replace_internal_host
 
     def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
-
-
-
-
-
-
+        sign_url = self._volume.get_sign_url(
+            self._volume_dir + "/" + file_name,
+            method="PUT",
+            seconds=3600,
+        )
+        if self._replace_internal_host:
+            parsed_url = urlparse(sign_url)
+            if "-internal." in parsed_url.netloc:
+                new_netloc = parsed_url.netloc.replace("-internal.", ".")
+                sign_url = sign_url.replace(parsed_url.netloc, new_netloc)
+
+        def _to_bytes(d):
+            if not isinstance(d, (bytes, bytearray)):
+                return bytes(d)
+            return d
+
+        def data_func():
             if not inspect.isgenerator(data):
-
+                yield _to_bytes(data)
             else:
                 for chunk in data:
-
+                    yield _to_bytes(chunk)
+
+        requests.put(sign_url, data=data_func())
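write_file now uploads through a pre-signed PUT URL instead of driving the volume API
directly: it signs the target path, optionally rewrites "-internal." endpoints to their
public form, normalizes the payload to bytes, and streams it with requests. A standalone
sketch of the same pattern (the URL and chunks are placeholders; the raise_for_status
check is an addition of the sketch, not of the patch):

    import requests

    def put_via_sign_url(sign_url: str, chunks) -> None:
        # requests streams an iterable body using chunked transfer encoding
        resp = requests.put(sign_url, data=iter(chunks))
        resp.raise_for_status()
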
maxframe/tests/utils.py
CHANGED
@@ -16,6 +16,7 @@ import asyncio
 import contextlib
 import functools
 import hashlib
+import logging
 import os
 import queue
 import socket
@@ -191,14 +192,8 @@ def assert_mf_index_dtype(idx_obj, dtype):
 
 @contextlib.contextmanager
 def create_test_volume(vol_name, oss_config):
-    test_vol_name = vol_name
     odps_entry = ODPS.from_environments()
 
-    try:
-        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
-    except:
-        pass
-
     oss_test_dir_name = "test_dir_" + vol_name
     if oss_config is None:
         pytest.skip("Need oss and its config to run this test")
@@ -232,17 +227,14 @@
     rolearn = oss_config.oss_rolearn
 
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(
-        test_vol_name, location=test_location, rolearn=rolearn
-    )
+    odps_entry.create_external_volume(vol_name, location=test_location, rolearn=rolearn)
 
     try:
-        yield
+        yield vol_name
     finally:
         try:
-            odps_entry.delete_volume(
-                test_vol_name, auto_remove_dir=True, recursive=True
-            )
+            logging.warning("Deleting test volume %s", vol_name)
+            odps_entry.delete_volume(vol_name, auto_remove_dir=True, recursive=True)
         except:
             pass
 
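With this refactor, create_test_volume owns both creation and deletion and yields the
volume name, so test fixtures reduce to a single with-block. A minimal usage sketch
(the fixture wiring and the name prefix are assumptions):

    from maxframe.tests.utils import create_test_volume, get_test_unique_name, tn

    def my_volume_fixture(oss_config):
        with create_test_volume(tn("test_vol_" + get_test_unique_name(5)), oss_config) as vol_name:
            yield vol_name  # the volume is deleted on exit, even if the test fails
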