maxframe 1.0.0rc4__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 1.1.1__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/config/__init__.py +1 -1
- maxframe/config/config.py +26 -0
- maxframe/config/tests/test_config.py +20 -1
- maxframe/conftest.py +17 -4
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +65 -35
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/__init__.py +1 -1
- maxframe/io/odpsio/arrow.py +51 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +80 -124
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cpython-311-aarch64-linux-gnu.so +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +70 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +12 -2
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/merge/vstack.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +42 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +573 -562
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +23 -8
- maxframe_client/session/odps.py +40 -11
- maxframe_client/session/task.py +6 -25
- maxframe_client/session/tests/test_task.py +35 -6
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/config/__init__.py
CHANGED
maxframe/config/config.py
CHANGED
@@ -28,6 +28,8 @@ except ImportError:
 
     available_timezones = lambda: all_timezones
 
+import logging
+
 from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
@@ -43,6 +45,8 @@ from .validators import (
     is_valid_cache_path,
 )
 
+logger = logging.getLogger(__name__)
+
 _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and might be removed in a future release."
 _DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
 _DEFAULT_MAX_IDLE_SECONDS = 3600
@@ -380,6 +384,9 @@ default_options.register_option(
 default_options.register_option(
     "session.enable_schema", None, validator=is_null | is_bool, remote=True
 )
+default_options.register_option(
+    "session.enable_high_availability", None, validator=is_null | is_bool, remote=True
+)
 default_options.register_option(
     "session.default_schema", None, validator=is_null | is_string, remote=True
 )
@@ -496,3 +503,22 @@ class OptionsProxy:
 
 
 options = OptionsProxy()
+
+
+def update_wlm_quota_settings(session_id: str, engine_settings: Dict[str, Any]):
+    engine_quota = engine_settings.get("odps.task.wlm.quota", None)
+    session_quota = options.session.quota_name or None
+    if engine_quota != session_quota and engine_quota:
+        logger.warning(
+            "[Session=%s] Session quota (%s) is different to SubDag engine quota (%s)",
+            session_id,
+            session_quota,
+            engine_quota,
+        )
+        # TODO(renxiang): overwrite or not overwrite
+        return
+
+    if session_quota:
+        engine_settings["odps.task.wlm.quota"] = session_quota
+    elif "odps.task.wlm.quota" in engine_settings:
+        engine_settings.pop("odps.task.wlm.quota")
maxframe/config/tests/test_config.py
CHANGED

@@ -18,7 +18,14 @@ import threading
 
 import pytest
 
-from ..config import Config, is_integer, is_string, option_context, options
+from ..config import (
+    Config,
+    is_integer,
+    is_string,
+    option_context,
+    options,
+    update_wlm_quota_settings,
+)
 
 
 def test_config_context():
@@ -101,3 +108,15 @@ def test_config_copy():
 
     target_cfg.update(src_cfg_dict)
     assert target_cfg.a.b.c == 1
+
+
+def test_update_wlm_quota_settings():
+    with option_context({}):
+        options.session.quota_name = "quota1"
+        engine_settings = {}
+        update_wlm_quota_settings("session_id", engine_settings)
+        assert engine_settings["odps.task.wlm.quota"] == "quota1"
+        options.session.quota_name = None
+        update_wlm_quota_settings("session_id", engine_settings)
+        # TODO(renxiang): overwrite or not overwrite
+        assert "odps.task.wlm.quota" in engine_settings
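Note: the new update_wlm_quota_settings helper prefers the session-level quota when there is no conflict, and backs off with a warning when the engine already carries a different quota. A minimal standalone sketch of that logic (plain arguments instead of maxframe's options object; reconcile_quota is a hypothetical name used only for illustration):

    import logging

    logger = logging.getLogger(__name__)

    def reconcile_quota(session_id, session_quota, engine_settings):
        # Conflicting non-empty engine quota: warn and leave settings untouched.
        engine_quota = engine_settings.get("odps.task.wlm.quota")
        if engine_quota and engine_quota != session_quota:
            logger.warning(
                "[Session=%s] session quota (%s) differs from engine quota (%s)",
                session_id, session_quota, engine_quota,
            )
            return
        # Otherwise the session quota wins; an empty session quota clears the key.
        if session_quota:
            engine_settings["odps.task.wlm.quota"] = session_quota
        else:
            engine_settings.pop("odps.task.wlm.quota", None)

    settings = {}
    reconcile_quota("s1", "quota1", settings)
    assert settings == {"odps.task.wlm.quota": "quota1"}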
maxframe/conftest.py
CHANGED
@@ -40,10 +40,14 @@ def _get_odps_env(test_config: ConfigParser, section_name: str) -> ODPS:
         access_id = test_config.get(section_name, "access_id")
     except NoOptionError:
         access_id = test_config.get("odps", "access_id")
+    if not access_id:
+        access_id = os.getenv("ACCESS_ID")
     try:
         secret_access_key = test_config.get(section_name, "secret_access_key")
     except NoOptionError:
         secret_access_key = test_config.get("odps", "secret_access_key")
+    if not secret_access_key:
+        secret_access_key = os.getenv("SECRET_ACCESS_KEY")
     try:
         project = test_config.get(section_name, "project")
     except NoOptionError:
@@ -119,14 +123,23 @@ def oss_config():
     old_cache_url = options.object_cache_url
 
     try:
-        oss_access_id = config.get("oss", "access_id")
-        oss_secret_access_key = config.get("oss", "secret_access_key")
+        oss_access_id = config.get("oss", "access_id") or os.getenv("ACCESS_ID")
+        oss_secret_access_key = config.get("oss", "secret_access_key") or os.getenv(
+            "SECRET_ACCESS_KEY"
+        )
         oss_bucket_name = config.get("oss", "bucket_name")
        oss_endpoint = config.get("oss", "endpoint")
         oss_rolearn = config.get("oss", "rolearn")
 
         options.service_role_arn = oss_rolearn
-
+        if "test" in oss_endpoint:
+            oss_svc_endpoint = oss_endpoint
+        else:
+            endpoint_parts = oss_endpoint.split(".", 1)
+            if "-internal" not in endpoint_parts[0]:
+                endpoint_parts[0] += "-internal"
+            oss_svc_endpoint = ".".join(endpoint_parts)
+        options.object_cache_url = f"oss://{oss_svc_endpoint}/{oss_bucket_name}"
 
         config.oss_config = (
             oss_access_id,
@@ -141,7 +154,7 @@ def oss_config():
         config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
         config.oss_rolearn = oss_rolearn
         yield config
-    except (
+    except (NoSectionError, NoOptionError, ImportError):
         return None
     finally:
         options.service_role_arn = old_role_arn
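Note: the oss_config fixture above now derives the object-cache URL from an internal variant of the configured OSS endpoint. A small sketch of that rewrite (the helper name to_internal_endpoint is illustrative only):

    def to_internal_endpoint(oss_endpoint: str) -> str:
        # Test endpoints are used verbatim; real endpoints get "-internal"
        # appended to the first host label for in-network access.
        if "test" in oss_endpoint:
            return oss_endpoint
        parts = oss_endpoint.split(".", 1)
        if "-internal" not in parts[0]:
            parts[0] += "-internal"
        return ".".join(parts)

    assert (
        to_internal_endpoint("oss-cn-hangzhou.aliyuncs.com")
        == "oss-cn-hangzhou-internal.aliyuncs.com"
    )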
maxframe/core/operator/base.py
CHANGED
@@ -86,6 +86,8 @@ class SchedulingHint(Serializable):
     # `gpu` indicates that if the operator should be executed on the GPU.
     gpu = BoolField("gpu", default=None)
     priority = Int32Field("priority", default=None)
+    expect_engine = StringField("expect_engine", default=None)
+    expect_resources = DictField("expect_resources", FieldTypes.string, default=None)
 
     @classproperty
     @lru_cache(1)
maxframe/dataframe/arithmetic/tests/test_arithmetic.py
CHANGED

@@ -22,6 +22,7 @@ import pandas as pd
 import pytest
 
 from ....core import OperatorType
+from ....tests.utils import assert_mf_index_dtype
 from ....utils import dataslots
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
@@ -164,7 +165,7 @@ def test_without_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -176,7 +177,7 @@ def test_without_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -370,7 +371,7 @@ def test_with_one_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -403,7 +404,7 @@ def test_with_all_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -433,7 +434,7 @@ def test_with_all_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns
     )
-
+    assert_mf_index_dtype(df6.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df6.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -468,7 +469,7 @@ def test_without_shuffle_and_with_one_chunk(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -501,7 +502,7 @@ def test_both_one_chunk(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -534,7 +535,7 @@ def test_with_shuffle_and_one_chunk(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -558,7 +559,7 @@ def test_on_same_dataframe(func_name, func_opts):
     pd.testing.assert_index_equal(
         df2.columns_value.to_pandas(), func_opts.func(data, data).columns
     )
-
+    assert_mf_index_dtype(df2.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df2.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -590,19 +591,19 @@ def test_dataframe_and_scalar(func_name, func_opts):
     pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
 
     pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result2.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result3.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result4.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result5.index_value.value, np.int64)
 
     if "builtin_function_or_method" not in str(type(func_opts.func)):
         # skip NotImplemented test for comparison function
@@ -679,7 +680,7 @@ def test_abs():
     pd.testing.assert_index_equal(
         df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
     )
-
+    assert_mf_index_dtype(df2.index_value.value, np.int64)
     assert df2.shape == (10, 10)
 
 
@@ -697,7 +698,7 @@ def test_not():
     pd.testing.assert_index_equal(
         df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
     )
-
+    assert_mf_index_dtype(df2.index_value.value, np.int64)
     assert df2.shape == (10, 10)
 
maxframe/dataframe/core.py
CHANGED
@@ -142,6 +142,14 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
+        @property
+        def inferred_type(self):
+            return "floating" if self.dtype.kind == "f" else "integer"
+
     class RangeIndex(IndexBase):
         _name = AnyField("name")
         _slice = SliceField("slice")
@@ -243,6 +251,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "integer"
@@ -254,6 +266,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "integer"
@@ -265,6 +281,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "floating"
@@ -1514,8 +1534,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
         refresh_index_value(self)
         refresh_dtypes(self)
 
-    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
-        dtypes = table_meta.pd_column_dtypes
+    def refresh_from_dtypes(self, dtypes: pd.Series) -> None:
         self._dtypes = dtypes
         self._columns_value = parse_index(dtypes.index, store_data=True)
         self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
@@ -1523,6 +1542,9 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
         new_shape[-1] = len(dtypes)
         self._shape = tuple(new_shape)
 
+    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+        self.refresh_from_dtypes(table_meta.pd_column_dtypes)
+
     @property
     def dtypes(self):
         dt = getattr(self, "_dtypes", None)
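Note: the repeated dtype property added to the index value classes above returns None instead of raising when the _dtype field was never set (for example on partially deserialized metadata). The pattern in isolation, as a minimal sketch:

    class _Example:
        @property
        def dtype(self):
            # getattr with a default swallows the missing-attribute case
            return getattr(self, "_dtype", None)

    e = _Example()
    assert e.dtype is None  # no AttributeError when _dtype is absent
    e._dtype = "int64"
    assert e.dtype == "int64"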
maxframe/dataframe/datasource/read_odps_query.py
CHANGED

@@ -37,6 +37,7 @@ from ...serialization.serializables import (
     SeriesField,
     StringField,
 )
+from ...utils import is_empty
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
 
@@ -57,7 +58,7 @@ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|
 _ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
 
 _SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
-_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
 
 
 @dataclasses.dataclass
@@ -180,23 +181,30 @@ def _parse_full_explain(explain_string: str) -> OdpsSchema:
 
     job_dag = jobs_sector.build_dag()
     indep_job_names = list(job_dag.iter_indep(reverse=True))
-
-
-
-
-
-
-
+    schema_signatures = dict()
+    for job_name in indep_job_names:
+        tasks_sector = jobs_sector.jobs[job_name]
+        task_dag = tasks_sector.build_dag()
+        indep_task_names = list(task_dag.iter_indep(reverse=True))
+        for task_name in indep_task_names:
+            task_sector = tasks_sector.tasks[task_name]
+            if not task_sector.schema:  # pragma: no cover
+                raise ValueError("Cannot detect output schema")
+            if task_sector.output_target != "Screen":
+                raise ValueError("The SQL statement should be an instant query")
+            sig_tuples = sorted(
+                [
+                    (c.column_alias or c.column_name, c.column_type)
+                    for c in task_sector.schema
+                ]
+            )
+            schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
+    if len(schema_signatures) != 1:
         raise ValueError("Only one final task is allowed in SQL statement")
-
-    task_sector = tasks_sector.tasks[indep_task_names[0]]
-    if not task_sector.schema:  # pragma: no cover
-        raise ValueError("Cannot detect output schema")
-    if task_sector.output_target != "Screen":
-        raise ValueError("The SQL statement should be an instant query")
+    schema = list(schema_signatures.values())[0]
     cols = [
         Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
-        for c in
+        for c in schema
     ]
     return OdpsSchema(cols)
 
@@ -209,7 +217,7 @@ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
     fields_str = fields_match.group(1)
     cols = []
     for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
-        cols.append(Column(field, validate_data_type(type_name)))
+        cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
     return OdpsSchema(cols)
 
 
@@ -243,7 +251,7 @@ class DataFrameReadODPSQuery(
         self.columns = columns
 
     def __call__(self, chunk_bytes=None, chunk_size=None):
-        if
+        if is_empty(self.index_columns):
             index_value = parse_index(pd.RangeIndex(0))
         elif len(self.index_columns) == 1:
             index_value = parse_index(
@@ -257,12 +265,18 @@ class DataFrameReadODPSQuery(
             )
             index_value = parse_index(idx)
 
-
+        if self.dtypes is not None:
+            columns_value = parse_index(self.dtypes.index, store_data=True)
+            shape = (np.nan, len(self.dtypes))
+        else:
+            columns_value = None
+            shape = (np.nan, np.nan)
+
         self.output_types = [OutputType.dataframe]
         return self.new_tileable(
             [],
             None,
-            shape=
+            shape=shape,
             dtypes=self.dtypes,
             index_value=index_value,
             columns_value=columns_value,
@@ -278,6 +292,7 @@ def read_odps_query(
     string_as_binary: bool = None,
     sql_hints: Dict[str, str] = None,
     anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
+    skip_schema: bool = False,
     **kw,
 ):
     """
@@ -298,6 +313,10 @@ def read_odps_query(
         User specified SQL hints.
     anonymous_col_prefix: str, optional
         Prefix for anonymous columns, '_anon_col_' by default.
+    skip_schema: bool, optional
+        Skip resolving output schema before execution. Once this is configured,
+        the output DataFrame cannot be inputs of other DataFrame operators
+        before execution.
 
     Returns
     -------
@@ -319,28 +338,39 @@ def read_odps_query(
 
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
-    logger.debug("Explain instance ID: %s", inst.id)
-    explain_str = list(inst.get_task_results().values())[0]
 
-    odps_schema = _parse_explained_schema(explain_str)
-
-    new_columns = []
     col_renames = {}
-
-
-
-
-
-
-
-
-
-
+    if not skip_schema:
+        inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+        logger.debug("Explain instance ID: %s", inst.id)
+        explain_str = list(inst.get_task_results().values())[0]
+
+        try:
+            odps_schema = _parse_explained_schema(explain_str)
+        except ValueError as ex:
+            exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
+            raise exc.with_traceback(ex.__traceback__) from None
+
+        new_columns = []
+        for col in odps_schema.columns:
+            anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+            if anon_match and col.name not in query:
+                new_name = anonymous_col_prefix + anon_match.group(1)
+                col_renames[col.name] = new_name
+                new_columns.append(Column(new_name, col.type))
+            else:
+                new_columns.append(col)
+
+        dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+    else:
+        dtypes = None
 
     if not index_col:
         index_dtypes = None
     else:
+        if dtypes is None:
+            raise ValueError("Cannot configure index_col when skip_schema is True")
+
         if isinstance(index_col, str):
             index_col = [index_col]
         index_col_set = set(index_col)
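Note: a usage sketch of the new skip_schema flag (the table name is a placeholder and a configured ODPS entry is assumed; the asserted behavior matches the tests further below):

    import maxframe.dataframe as md

    # Schema resolution via EXPLAIN is skipped, so dtypes/columns stay
    # unknown until the query actually runs:
    df = md.read_odps_query("SELECT * FROM my_table", skip_schema=True)
    assert df.dtypes is None

    # index_col needs a resolved schema, so combining it with
    # skip_schema=True raises ValueError.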
maxframe/dataframe/datasource/read_odps_table.py
CHANGED

@@ -34,6 +34,7 @@ from ...serialization.serializables import (
     SeriesField,
     StringField,
 )
+from ...utils import is_empty
 from ..core import DataFrame  # noqa: F401
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
@@ -76,7 +77,7 @@ class DataFrameReadODPSTable(
         self.columns = columns
 
     def __call__(self, shape, chunk_bytes=None, chunk_size=None):
-        if
+        if is_empty(self.index_columns):
             if np.isnan(shape[0]):
                 index_value = parse_index(pd.RangeIndex(0))
             else:
@@ -238,7 +239,8 @@ def read_odps_table(
         partitions = [partitions]
 
     append_partitions = append_partitions or any(
-        pt.name in (columns
+        pt.name in (columns if not is_empty(columns) else ())
+        for pt in (table.table_schema.partitions or ())
     )
     op = DataFrameReadODPSTable(
         table_name=table.full_table_name,
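Note: both datasource modules above now route their emptiness checks through the new is_empty helper (maxframe/utils.py is also touched in this release). Its actual implementation is not shown in this diff; the assumed semantics, roughly:

    def is_empty(val) -> bool:
        # Assumed behavior: no value at all, or a container with no elements.
        return val is None or len(val) == 0

    assert is_empty(None) and is_empty([]) and not is_empty(["col1"])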
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
@@ -26,7 +27,14 @@ from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
-from ...core import
+from ...core import (
+    DatetimeIndex,
+    Float64Index,
+    Index,
+    IndexValue,
+    Int64Index,
+    MultiIndex,
+)
 from ..dataframe import from_pandas as from_pandas_df
 from ..date_range import date_range
 from ..from_tensor import (
@@ -36,7 +44,12 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
-from ..read_odps_query import
+from ..read_odps_query import (
+    ColumnSchema,
+    _parse_full_explain,
+    _parse_simple_explain,
+    _resolve_task_sector,
+)
 from ..series import from_pandas as from_pandas_series
 
 ray = lazy_import("ray")
@@ -114,18 +127,22 @@ def test_from_tileable_index():
 
     for o in [df, df[0]]:
         index = o.index
-        assert isinstance(index, Int64Index)
+        assert isinstance(index, (Index, Int64Index))
         assert index.dtype == np.int64
         assert index.name == pd_df.index.name
-        assert isinstance(
+        assert isinstance(
+            index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
+        )
 
     t = mt.random.rand(10, chunk_size=6)
     index = from_tileable(t, name="new_name")
 
-    assert isinstance(index, Float64Index)
+    assert isinstance(index, (Index, Float64Index))
     assert index.dtype == np.float64
     assert index.name == "new_name"
-    assert isinstance(
+    assert isinstance(
+        index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
+    )
 
 
 def test_from_tensor():
@@ -327,7 +344,10 @@ def test_from_odps_query():
     odps_entry.write_table(test_table2, [["A", 10, 4.5]])
 
     with pytest.raises(ValueError) as err_info:
-        read_odps_query(
+        read_odps_query(
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"AS SELECT * FROM {table1_name}"
+        )
     assert "instant query" in err_info.value.args[0]
 
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
@@ -343,6 +363,10 @@ def test_from_odps_query():
         ),
     )
 
+    df = read_odps_query(query1, skip_schema=True)
+    assert df.dtypes is None
+    assert df.columns_value is None
+
     df = read_odps_query(query1, index_col="col1")
     assert df.op.query == query1
     assert df.index_value.name == "col1"
@@ -442,3 +466,31 @@ def test_resolve_simple_explain():
     assert schema.columns[0].type == odps_types.string
     assert schema.columns[1].name == "createdate"
     assert schema.columns[1].type == odps_types.bigint
+
+
+def test_resolve_conditional():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    expected_col_types = {
+        "cs1": "string",
+        "cs2": "string",
+        "ci1": "bigint",
+        "cs3": "string",
+        "cs4": "string",
+        "cs5": "string",
+        "cs6": "string",
+        "cs7": "string",
+        "cs8": "string",
+        "ci2": "int",
+        "ci3": "bigint",
+        "cs9": "string",
+    }
+
+    schema = _parse_full_explain(sector)
+    for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
+        assert col.name == exp_nm
+        assert col.type == odps_types.validate_data_type(exp_tp)