PyPI - maxframe - Versions diffs - 0.1.0b4__cp311-cp311-win32.whl → 1.0.0__cp311-cp311-win32.whl - Mend

maxframe 0.1.0b4__cp311-cp311-win32.whl → 1.0.0__cp311-cp311-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (214) hide show

maxframe/__init__.py +1 -0
maxframe/_utils.cp311-win32.pyd +0 -0
maxframe/codegen.py +56 -5
maxframe/config/config.py +78 -10
maxframe/config/validators.py +42 -11
maxframe/conftest.py +58 -14
maxframe/core/__init__.py +2 -16
maxframe/core/entity/__init__.py +1 -12
maxframe/core/entity/executable.py +1 -1
maxframe/core/entity/objects.py +46 -45
maxframe/core/entity/output_types.py +0 -3
maxframe/core/entity/tests/test_objects.py +43 -0
maxframe/core/entity/tileables.py +5 -78
maxframe/core/graph/__init__.py +2 -2
maxframe/core/graph/builder/__init__.py +0 -1
maxframe/core/graph/builder/base.py +5 -4
maxframe/core/graph/builder/tileable.py +4 -4
maxframe/core/graph/builder/utils.py +4 -8
maxframe/core/graph/core.cp311-win32.pyd +0 -0
maxframe/core/graph/core.pyx +4 -4
maxframe/core/graph/entity.py +9 -33
maxframe/core/operator/__init__.py +2 -9
maxframe/core/operator/base.py +3 -5
maxframe/core/operator/objects.py +0 -9
maxframe/core/operator/utils.py +55 -0
maxframe/dataframe/__init__.py +2 -1
maxframe/dataframe/arithmetic/around.py +5 -17
maxframe/dataframe/arithmetic/core.py +15 -7
maxframe/dataframe/arithmetic/docstring.py +7 -33
maxframe/dataframe/arithmetic/equal.py +4 -2
maxframe/dataframe/arithmetic/greater.py +4 -2
maxframe/dataframe/arithmetic/greater_equal.py +4 -2
maxframe/dataframe/arithmetic/less.py +2 -2
maxframe/dataframe/arithmetic/less_equal.py +4 -2
maxframe/dataframe/arithmetic/not_equal.py +4 -2
maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
maxframe/dataframe/core.py +58 -12
maxframe/dataframe/datasource/date_range.py +2 -2
maxframe/dataframe/datasource/read_odps_query.py +120 -24
maxframe/dataframe/datasource/read_odps_table.py +9 -4
maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
maxframe/dataframe/datastore/to_odps.py +28 -0
maxframe/dataframe/extensions/__init__.py +5 -0
maxframe/dataframe/extensions/flatjson.py +131 -0
maxframe/dataframe/extensions/flatmap.py +317 -0
maxframe/dataframe/extensions/reshuffle.py +1 -1
maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
maxframe/dataframe/groupby/core.py +1 -1
maxframe/dataframe/groupby/cum.py +0 -1
maxframe/dataframe/groupby/fill.py +4 -1
maxframe/dataframe/groupby/getitem.py +6 -0
maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
maxframe/dataframe/groupby/transform.py +5 -1
maxframe/dataframe/indexing/align.py +1 -1
maxframe/dataframe/indexing/loc.py +6 -4
maxframe/dataframe/indexing/rename.py +5 -28
maxframe/dataframe/indexing/sample.py +0 -1
maxframe/dataframe/indexing/set_index.py +68 -1
maxframe/dataframe/initializer.py +11 -1
maxframe/dataframe/merge/__init__.py +9 -1
maxframe/dataframe/merge/concat.py +41 -31
maxframe/dataframe/merge/merge.py +237 -3
maxframe/dataframe/merge/tests/test_merge.py +126 -1
maxframe/dataframe/misc/__init__.py +4 -0
maxframe/dataframe/misc/apply.py +6 -11
maxframe/dataframe/misc/case_when.py +141 -0
maxframe/dataframe/misc/describe.py +2 -2
maxframe/dataframe/misc/drop_duplicates.py +8 -8
maxframe/dataframe/misc/eval.py +4 -0
maxframe/dataframe/misc/memory_usage.py +2 -2
maxframe/dataframe/misc/pct_change.py +1 -83
maxframe/dataframe/misc/pivot_table.py +262 -0
maxframe/dataframe/misc/tests/test_misc.py +93 -1
maxframe/dataframe/misc/transform.py +1 -30
maxframe/dataframe/misc/value_counts.py +4 -17
maxframe/dataframe/missing/dropna.py +1 -1
maxframe/dataframe/missing/fillna.py +5 -5
maxframe/dataframe/operators.py +1 -17
maxframe/dataframe/plotting/core.py +2 -2
maxframe/dataframe/reduction/core.py +4 -3
maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
maxframe/dataframe/sort/sort_values.py +1 -11
maxframe/dataframe/statistics/corr.py +3 -3
maxframe/dataframe/statistics/quantile.py +13 -19
maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
maxframe/dataframe/tests/test_initializer.py +33 -2
maxframe/dataframe/utils.py +33 -11
maxframe/dataframe/window/expanding.py +5 -3
maxframe/dataframe/window/tests/test_expanding.py +2 -2
maxframe/errors.py +13 -0
maxframe/extension.py +12 -0
maxframe/io/__init__.py +13 -0
maxframe/io/objects/__init__.py +24 -0
maxframe/io/objects/core.py +140 -0
maxframe/io/objects/tensor.py +76 -0
maxframe/io/objects/tests/__init__.py +13 -0
maxframe/io/objects/tests/test_object_io.py +97 -0
maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
maxframe/{odpsio → io/odpsio}/schema.py +38 -16
maxframe/io/odpsio/tableio.py +719 -0
maxframe/io/odpsio/tests/__init__.py +13 -0
maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
maxframe/io/odpsio/volumeio.py +63 -0
maxframe/learn/contrib/__init__.py +3 -1
maxframe/learn/contrib/graph/__init__.py +15 -0
maxframe/learn/contrib/graph/connected_components.py +215 -0
maxframe/learn/contrib/graph/tests/__init__.py +13 -0
maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
maxframe/learn/contrib/llm/__init__.py +16 -0
maxframe/learn/contrib/llm/core.py +54 -0
maxframe/learn/contrib/llm/models/__init__.py +14 -0
maxframe/learn/contrib/llm/models/dashscope.py +73 -0
maxframe/learn/contrib/llm/multi_modal.py +42 -0
maxframe/learn/contrib/llm/text.py +42 -0
maxframe/learn/contrib/utils.py +52 -0
maxframe/learn/contrib/xgboost/__init__.py +26 -0
maxframe/learn/contrib/xgboost/classifier.py +110 -0
maxframe/learn/contrib/xgboost/core.py +241 -0
maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
maxframe/learn/contrib/xgboost/predict.py +121 -0
maxframe/learn/contrib/xgboost/regressor.py +71 -0
maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
maxframe/learn/contrib/xgboost/train.py +132 -0
maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
maxframe/learn/utils/__init__.py +15 -0
maxframe/learn/utils/core.py +29 -0
maxframe/lib/mmh3.cp311-win32.pyd +0 -0
maxframe/lib/mmh3.pyi +43 -0
maxframe/lib/sparse/tests/test_sparse.py +15 -15
maxframe/lib/wrapped_pickle.py +2 -1
maxframe/opcodes.py +11 -0
maxframe/protocol.py +154 -27
maxframe/remote/core.py +4 -8
maxframe/serialization/__init__.py +1 -0
maxframe/serialization/core.cp311-win32.pyd +0 -0
maxframe/serialization/core.pxd +3 -0
maxframe/serialization/core.pyi +64 -0
maxframe/serialization/core.pyx +67 -26
maxframe/serialization/exception.py +1 -1
maxframe/serialization/pandas.py +52 -17
maxframe/serialization/serializables/core.py +180 -15
maxframe/serialization/serializables/field_type.py +4 -1
maxframe/serialization/serializables/tests/test_serializable.py +54 -5
maxframe/serialization/tests/test_serial.py +2 -1
maxframe/session.py +37 -2
maxframe/tensor/__init__.py +81 -2
maxframe/tensor/arithmetic/isclose.py +1 -0
maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
maxframe/tensor/core.py +5 -136
maxframe/tensor/datasource/array.py +7 -2
maxframe/tensor/datasource/full.py +1 -1
maxframe/tensor/datasource/scalar.py +1 -1
maxframe/tensor/datasource/tests/test_datasource.py +1 -1
maxframe/tensor/indexing/flatnonzero.py +1 -1
maxframe/tensor/indexing/getitem.py +2 -0
maxframe/tensor/merge/__init__.py +2 -0
maxframe/tensor/merge/concatenate.py +101 -0
maxframe/tensor/merge/tests/test_merge.py +30 -1
maxframe/tensor/merge/vstack.py +74 -0
maxframe/tensor/{base → misc}/__init__.py +4 -0
maxframe/tensor/misc/atleast_1d.py +72 -0
maxframe/tensor/misc/atleast_2d.py +70 -0
maxframe/tensor/misc/atleast_3d.py +85 -0
maxframe/tensor/misc/tests/__init__.py +13 -0
maxframe/tensor/{base → misc}/transpose.py +22 -18
maxframe/tensor/misc/unique.py +205 -0
maxframe/tensor/operators.py +1 -7
maxframe/tensor/random/core.py +1 -1
maxframe/tensor/reduction/count_nonzero.py +2 -1
maxframe/tensor/reduction/mean.py +1 -0
maxframe/tensor/reduction/nanmean.py +1 -0
maxframe/tensor/reduction/nanvar.py +2 -0
maxframe/tensor/reduction/tests/test_reduction.py +12 -1
maxframe/tensor/reduction/var.py +2 -0
maxframe/tensor/statistics/quantile.py +2 -2
maxframe/tensor/utils.py +2 -22
maxframe/tests/test_protocol.py +34 -0
maxframe/tests/test_utils.py +0 -12
maxframe/tests/utils.py +17 -2
maxframe/typing_.py +4 -1
maxframe/udf.py +62 -3
maxframe/utils.py +112 -86
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
maxframe_client/__init__.py +0 -1
maxframe_client/clients/framedriver.py +4 -1
maxframe_client/fetcher.py +123 -54
maxframe_client/session/consts.py +3 -0
maxframe_client/session/graph.py +8 -2
maxframe_client/session/odps.py +223 -40
maxframe_client/session/task.py +108 -80
maxframe_client/tests/test_fetcher.py +21 -3
maxframe_client/tests/test_session.py +136 -8
maxframe/core/entity/chunks.py +0 -68
maxframe/core/entity/fuse.py +0 -73
maxframe/core/graph/builder/chunk.py +0 -430
maxframe/odpsio/tableio.py +0 -300
maxframe/odpsio/volumeio.py +0 -95
maxframe_client/clients/spe.py +0 -104
/maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
/maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
/maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
/maxframe/tensor/{base → misc}/astype.py +0 -0
/maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
/maxframe/tensor/{base → misc}/ravel.py +0 -0
/maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
/maxframe/tensor/{base → misc}/where.py +0 -0
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0

maxframe/dataframe/datasource/read_odps_query.py CHANGED Viewed

@@ -13,6 +13,7 @@
 # limitations under the License.
 import dataclasses
+import logging
 import re
 from typing import Dict, List, Optional, Tuple, Union
@@ -22,12 +23,14 @@ from odps import ODPS
 from odps.types import Column, OdpsSchema, validate_data_type
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...core.graph import DAG
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+logger = logging.getLogger(__name__)
+_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
 _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
 _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
 _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,7 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
+_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
 @dataclasses.dataclass
@@ -151,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
     return TaskSector(job_name, task_name, out_target, schemas)
-def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+def _parse_full_explain(explain_string: str) -> OdpsSchema:
     sectors = _split_explain_string(explain_string)
     jobs_sector = tasks_sector = None
@@ -169,27 +180,53 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
     job_dag = jobs_sector.build_dag()
     indep_job_names = list(job_dag.iter_indep(reverse=True))
-    if len(indep_job_names) > 1:  # pragma: no cover
-        raise ValueError("Only one final job is allowed in SQL statement")
-    tasks_sector = jobs_sector.jobs[indep_job_names[0]]
-    task_dag = tasks_sector.build_dag()
-    indep_task_names = list(task_dag.iter_indep(reverse=True))
-    if len(indep_task_names) > 1:  # pragma: no cover
+    schema_signatures = dict()
+    for job_name in indep_job_names:
+        tasks_sector = jobs_sector.jobs[job_name]
+        task_dag = tasks_sector.build_dag()
+        indep_task_names = list(task_dag.iter_indep(reverse=True))
+        for task_name in indep_task_names:
+            task_sector = tasks_sector.tasks[task_name]
+            if not task_sector.schema:  # pragma: no cover
+                raise ValueError("Cannot detect output schema")
+            if task_sector.output_target != "Screen":
+                raise ValueError("The SQL statement should be an instant query")
+            sig_tuples = sorted(
+                [
+                    (c.column_alias or c.column_name, c.column_type)
+                    for c in task_sector.schema
+                ]
+            )
+            schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
+    if len(schema_signatures) != 1:
         raise ValueError("Only one final task is allowed in SQL statement")
-    task_sector = tasks_sector.tasks[indep_task_names[0]]
-    if not task_sector.schema:  # pragma: no cover
-        raise ValueError("Cannot detect output schema")
-    if task_sector.output_target != "Screen":
-        raise ValueError("The SQL statement should be an instant query")
+    schema = list(schema_signatures.values())[0]
     cols = [
         Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
-        for c in task_sector.schema
+        for c in schema
     ]
     return OdpsSchema(cols)
+def _parse_simple_explain(explain_string: str) -> OdpsSchema:
+    fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
+    if not fields_match:
+        raise ValueError("Cannot detect output table schema")
+    fields_str = fields_match.group(1)
+    cols = []
+    for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
+        cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
+    return OdpsSchema(cols)
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    if explain_string.startswith("AdhocSink"):
+        return _parse_simple_explain(explain_string)
+    else:
+        return _parse_full_explain(explain_string)
 class DataFrameReadODPSQuery(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
@@ -204,6 +241,7 @@ class DataFrameReadODPSQuery(
     string_as_binary = BoolField("string_as_binary", default=None)
     index_columns = ListField("index_columns", FieldTypes.string, default=None)
     index_dtypes = SeriesField("index_dtypes", default=None)
+    column_renames = DictField("column_renames", default=None)
     def get_columns(self):
         return self.columns
@@ -216,7 +254,9 @@ class DataFrameReadODPSQuery(
             index_value = parse_index(pd.RangeIndex(0))
         elif len(self.index_columns) == 1:
             index_value = parse_index(
-                pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+                pd.Index([], name=self.index_columns[0]).astype(
+                    self.index_dtypes.iloc[0]
+                )
             )
         else:
             idx = pd.MultiIndex.from_frame(
@@ -224,12 +264,18 @@ class DataFrameReadODPSQuery(
             )
             index_value = parse_index(idx)
-        columns_value = parse_index(self.dtypes.index, store_data=True)
+        if self.dtypes is not None:
+            columns_value = parse_index(self.dtypes.index, store_data=True)
+            shape = (np.nan, len(self.dtypes))
+        else:
+            columns_value = None
+            shape = (np.nan, np.nan)
         self.output_types = [OutputType.dataframe]
         return self.new_tileable(
             [],
             None,
-            shape=(len(self.dtypes), np.nan),
+            shape=shape,
             dtypes=self.dtypes,
             index_value=index_value,
             columns_value=columns_value,
@@ -243,6 +289,9 @@ def read_odps_query(
     odps_entry: ODPS = None,
     index_col: Union[None, str, List[str]] = None,
     string_as_binary: bool = None,
+    sql_hints: Dict[str, str] = None,
+    anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
+    skip_schema: bool = False,
     **kw,
 ):
     """
@@ -257,24 +306,70 @@ def read_odps_query(
         MaxCompute SQL statement.
     index_col: Union[None, str, List[str]]
         Columns to be specified as indexes.
+    string_as_binary: bool, optional
+        Whether to convert string columns to binary.
+    sql_hints: Dict[str, str], optional
+        User specified SQL hints.
+    anonymous_col_prefix: str, optional
+        Prefix for anonymous columns, '_anon_col_' by default.
+    skip_schema: bool, optional
+        Skip resolving output schema before execution. Once this is configured,
+        the output DataFrame cannot be inputs of other DataFrame operators
+        before execution.
     Returns
     -------
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
+    hints = options.sql.settings.copy() or {}
+    if sql_hints:
+        hints.update(sql_hints)
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
+        hints["odps.namespace.schema"] = "true"
+        hints["odps.sql.allow.namespace.schema"] = "true"
+    # fixme workaround for multi-stage split process
+    hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
-    explain_str = list(inst.get_task_results().values())[0]
-    odps_schema = _parse_explained_schema(explain_str)
-    dtypes = odps_schema_to_pandas_dtypes(odps_schema)
+    col_renames = {}
+    if not skip_schema:
+        inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+        logger.debug("Explain instance ID: %s", inst.id)
+        explain_str = list(inst.get_task_results().values())[0]
+        try:
+            odps_schema = _parse_explained_schema(explain_str)
+        except ValueError as ex:
+            exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
+            raise exc.with_traceback(ex.__traceback__) from None
+        new_columns = []
+        for col in odps_schema.columns:
+            anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+            if anon_match and col.name not in query:
+                new_name = anonymous_col_prefix + anon_match.group(1)
+                col_renames[col.name] = new_name
+                new_columns.append(Column(new_name, col.type))
+            else:
+                new_columns.append(col)
+        dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+    else:
+        dtypes = None
     if not index_col:
         index_dtypes = None
     else:
+        if dtypes is None:
+            raise ValueError("Cannot configure index_col when skip_schema is True")
         if isinstance(index_col, str):
             index_col = [index_col]
         index_col_set = set(index_col)
@@ -293,5 +388,6 @@ def read_odps_query(
         string_as_binary=string_as_binary,
         index_columns=index_col,
         index_dtypes=index_dtypes,
+        column_renames=col_renames,
     )
     return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)

maxframe/dataframe/datasource/read_odps_table.py CHANGED Viewed

@@ -22,8 +22,9 @@ from odps.models import Table
 from odps.utils import to_timestamp
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
@@ -82,7 +83,9 @@ class DataFrameReadODPSTable(
                 index_value = parse_index(pd.RangeIndex(shape[0]))
         elif len(self.index_columns) == 1:
             index_value = parse_index(
-                pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+                pd.Index([], name=self.index_columns[0]).astype(
+                    self.index_dtypes.iloc[0]
+                )
             )
         else:
             idx = pd.MultiIndex.from_frame(
@@ -117,9 +120,10 @@ class DataFrameReadODPSTable(
             return self.new_tileable(
                 [],
                 None,
-                shape=shape,
+                shape=shape[:1],
                 name=getattr(index_value, "name", None),
                 names=getattr(index_value, "names", None),
+                dtype=self.index_dtypes.iloc[0],
                 index_value=index_value,
                 chunk_bytes=chunk_bytes,
                 chunk_size=chunk_size,
@@ -164,12 +168,13 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    schema = options.session.default_schema or odps_entry.schema
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
-        table = odps_entry.get_table(table_name)
+        table = odps_entry.get_table(table_name, schema=schema)
     if not table.table_schema.partitions and (
         partitions is not None or append_partitions

maxframe/dataframe/datasource/tests/test_datasource.py CHANGED Viewed

@@ -13,18 +13,28 @@
 # limitations under the License.
 import os
+import uuid
 from collections import OrderedDict
 import numpy as np
 import pandas as pd
 import pytest
 from odps import ODPS
+from odps import types as odps_types
 from .... import tensor as mt
+from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
-from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index, MultiIndex
+from ...core import (
+    DatetimeIndex,
+    Float64Index,
+    Index,
+    IndexValue,
+    Int64Index,
+    MultiIndex,
+)
 from ..dataframe import from_pandas as from_pandas_df
 from ..date_range import date_range
 from ..from_tensor import (
@@ -34,7 +44,12 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
-from ..read_odps_query import ColumnSchema, _resolve_task_sector
+from ..read_odps_query import (
+    ColumnSchema,
+    _parse_full_explain,
+    _parse_simple_explain,
+    _resolve_task_sector,
+)
 from ..series import from_pandas as from_pandas_series
 ray = lazy_import("ray")
@@ -112,18 +127,22 @@ def test_from_tileable_index():
     for o in [df, df[0]]:
         index = o.index
-        assert isinstance(index, Int64Index)
+        assert isinstance(index, (Index, Int64Index))
         assert index.dtype == np.int64
         assert index.name == pd_df.index.name
-        assert isinstance(index.index_value.value, IndexValue.Int64Index)
+        assert isinstance(
+            index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
+        )
     t = mt.random.rand(10, chunk_size=6)
     index = from_tileable(t, name="new_name")
-    assert isinstance(index, Float64Index)
+    assert isinstance(index, (Index, Float64Index))
     assert index.dtype == np.float64
     assert index.name == "new_name"
-    assert isinstance(index.index_value.value, IndexValue.Float64Index)
+    assert isinstance(
+        index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
+    )
 def test_from_tensor():
@@ -295,6 +314,15 @@ def test_from_odps_table():
         ),
     )
+    out_idx = read_odps_table(
+        test_table,
+        columns=[],
+        index_col=["col1", "col2"],
+        output_type=OutputType.index,
+    )
+    assert out_idx.names == ["col1", "col2"]
+    assert out_idx.shape == (np.nan,)
     test_table.drop()
     test_parted_table.drop()
@@ -316,7 +344,10 @@ def test_from_odps_query():
     odps_entry.write_table(test_table2, [["A", 10, 4.5]])
     with pytest.raises(ValueError) as err_info:
-        read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
+        read_odps_query(
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"AS SELECT * FROM {table1_name}"
+        )
     assert "instant query" in err_info.value.args[0]
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
@@ -332,6 +363,10 @@ def test_from_odps_query():
         ),
     )
+    df = read_odps_query(query1, skip_schema=True)
+    assert df.dtypes is None
+    assert df.columns_value is None
     df = read_odps_query(query1, index_col="col1")
     assert df.op.query == query1
     assert df.index_value.name == "col1"
@@ -387,7 +422,9 @@ def test_date_range():
 def test_resolve_task_sector():
-    input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-full.txt"
+    )
     with open(input_path, "r") as f:
         sector = f.read()
     actual_sector = _resolve_task_sector("job0", sector)
@@ -399,3 +436,61 @@ def test_resolve_task_sector():
     assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
     assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
     assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
+def test_resolve_task_odps2():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 2
+    assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
+    assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
+def test_resolve_simple_explain():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-simple.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+    schema = _parse_simple_explain(sector)
+    assert schema.columns[0].name == "memberid"
+    assert schema.columns[0].type == odps_types.string
+    assert schema.columns[1].name == "createdate"
+    assert schema.columns[1].type == odps_types.bigint
+def test_resolve_conditional():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+    expected_col_types = {
+        "cs1": "string",
+        "cs2": "string",
+        "ci1": "bigint",
+        "cs3": "string",
+        "cs4": "string",
+        "cs5": "string",
+        "cs6": "string",
+        "cs7": "string",
+        "cs8": "string",
+        "ci2": "int",
+        "ci3": "bigint",
+        "cs9": "string",
+    }
+    schema = _parse_full_explain(sector)
+    for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
+        assert col.name == exp_nm
+        assert col.type == odps_types.validate_data_type(exp_tp)

maxframe/dataframe/datastore/tests/test_to_odps.py ADDED Viewed

@@ -0,0 +1,48 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+from ... import DataFrame
+from ..to_odps import to_odps_table
+@pytest.fixture
+def df():
+    return DataFrame({"A": [1, 2], "B": [3, 4]})
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"partition_col": ["A", "C"]},
+        {"partition_col": "C"},
+        {"partition": "a=1,C=2"},
+    ],
+)
+def test_to_odps_table_validation(df, kwargs):
+    with pytest.raises(ValueError):
+        to_odps_table(df, "test_table", **kwargs)
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"partition_col": ["a", "B"]},
+        {"partition_col": "a"},
+        {"partition": "C=1,d=2"},
+    ],
+)
+def test_to_odps_table_vaild(df, kwargs):
+    to_odps_table(df, "test_table", **kwargs)

maxframe/dataframe/datastore/to_odps.py CHANGED Viewed

@@ -17,11 +17,14 @@
 import logging
 from typing import List, Optional, Union
+from odps import ODPS
 from odps.models import Table as ODPSTable
+from odps.types import PartitionSpec
 from ... import opcodes
 from ...config import options
 from ...core import OutputType
+from ...io.odpsio import build_dataframe_table_meta
 from ...serialization.serializables import (
     BoolField,
     FieldTypes,
@@ -134,8 +137,14 @@ def to_odps_table(
     --------
     """
+    odps_entry = ODPS.from_global() or ODPS.from_environments()
     if isinstance(table, ODPSTable):
         table = table.full_table_name
+    elif options.session.enable_schema and "." not in table:
+        default_schema = (
+            options.session.default_schema or odps_entry.schema or "default"
+        )
+        table = default_schema + "." + table
     if isinstance(index_label, str):
         index_label = [index_label]
@@ -147,6 +156,25 @@ def to_odps_table(
             f"index_label needs {len(df.index.nlevels)} labels "
             f"but it only have {len(index_label)}"
         )
+    table_cols = set(build_dataframe_table_meta(df).table_column_names)
+    if partition:
+        partition_intersect = (
+            set(x.lower() for x in PartitionSpec(partition).keys()) & table_cols
+        )
+        if partition_intersect:
+            raise ValueError(
+                f"Data column(s) {partition_intersect} in the dataframe"
+                " cannot be used in parameter 'partition'."
+                " Use 'partition_col' instead."
+            )
+    if partition_col:
+        partition_diff = set(x.lower() for x in partition_col) - table_cols
+        if partition_diff:
+            raise ValueError(
+                f"Partition column(s) {partition_diff}"
+                " is not the data column(s) of the input dataframe."
+            )
     op = DataFrameToODPSTable(
         dtypes=df.dtypes,

maxframe/dataframe/extensions/__init__.py CHANGED Viewed

@@ -18,6 +18,8 @@ from .accessor import (
     IndexMaxFrameAccessor,
     SeriesMaxFrameAccessor,
 )
+from .flatjson import series_flatjson
+from .flatmap import df_flatmap, series_flatmap
 from .reshuffle import DataFrameReshuffle, df_reshuffle
@@ -25,6 +27,9 @@ def _install():
     from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
     DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
+    DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
+    SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
+    SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
     if DataFrameMaxFrameAccessor._api_count:
         for t in DATAFRAME_TYPE:

maxframe 0.1.0b4__cp311-cp311-win32.whl → 1.0.0__cp311-cp311-win32.whl

Potentially problematic release.

maxframe 0.1.0b4__cp311-cp311-win32.whl → 1.0.0__cp311-cp311-win32.whl