PyPI - maxframe - Versions diffs - 1.0.0rc2__cp38-cp38-win32.whl → 1.0.0rc4__cp38-cp38-win32.whl - Mend

maxframe 1.0.0rc2__cp38-cp38-win32.whl → 1.0.0rc4__cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (134) hide show

maxframe/_utils.cp38-win32.pyd +0 -0
maxframe/codegen.py +4 -2
maxframe/config/config.py +28 -9
maxframe/config/validators.py +42 -12
maxframe/conftest.py +56 -14
maxframe/core/__init__.py +2 -13
maxframe/core/entity/__init__.py +0 -4
maxframe/core/entity/executable.py +1 -1
maxframe/core/entity/objects.py +45 -2
maxframe/core/entity/output_types.py +0 -3
maxframe/core/entity/tests/test_objects.py +43 -0
maxframe/core/entity/tileables.py +5 -78
maxframe/core/graph/__init__.py +2 -2
maxframe/core/graph/builder/__init__.py +0 -1
maxframe/core/graph/builder/base.py +5 -4
maxframe/core/graph/builder/tileable.py +4 -4
maxframe/core/graph/builder/utils.py +4 -8
maxframe/core/graph/core.cp38-win32.pyd +0 -0
maxframe/core/graph/entity.py +9 -33
maxframe/core/operator/__init__.py +2 -9
maxframe/core/operator/base.py +3 -5
maxframe/core/operator/objects.py +0 -9
maxframe/core/operator/utils.py +55 -0
maxframe/dataframe/arithmetic/docstring.py +26 -2
maxframe/dataframe/arithmetic/equal.py +4 -2
maxframe/dataframe/arithmetic/greater.py +4 -2
maxframe/dataframe/arithmetic/greater_equal.py +4 -2
maxframe/dataframe/arithmetic/less.py +2 -2
maxframe/dataframe/arithmetic/less_equal.py +4 -2
maxframe/dataframe/arithmetic/not_equal.py +4 -2
maxframe/dataframe/core.py +2 -0
maxframe/dataframe/datasource/read_odps_query.py +67 -8
maxframe/dataframe/datasource/read_odps_table.py +4 -2
maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
maxframe/dataframe/datastore/to_odps.py +8 -1
maxframe/dataframe/extensions/__init__.py +3 -0
maxframe/dataframe/extensions/flatmap.py +326 -0
maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
maxframe/dataframe/indexing/rename.py +11 -0
maxframe/dataframe/initializer.py +11 -1
maxframe/dataframe/misc/drop_duplicates.py +18 -1
maxframe/dataframe/operators.py +1 -17
maxframe/dataframe/reduction/core.py +2 -2
maxframe/dataframe/tests/test_initializer.py +33 -2
maxframe/io/objects/__init__.py +24 -0
maxframe/io/objects/core.py +140 -0
maxframe/io/objects/tensor.py +76 -0
maxframe/io/objects/tests/__init__.py +13 -0
maxframe/io/objects/tests/test_object_io.py +97 -0
maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
maxframe/{odpsio → io/odpsio}/schema.py +10 -8
maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
maxframe/io/odpsio/tests/__init__.py +13 -0
maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
maxframe/io/odpsio/volumeio.py +63 -0
maxframe/learn/contrib/__init__.py +2 -1
maxframe/learn/contrib/graph/__init__.py +15 -0
maxframe/learn/contrib/graph/connected_components.py +215 -0
maxframe/learn/contrib/graph/tests/__init__.py +13 -0
maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
maxframe/learn/contrib/xgboost/classifier.py +26 -2
maxframe/learn/contrib/xgboost/core.py +87 -2
maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
maxframe/learn/contrib/xgboost/predict.py +27 -44
maxframe/learn/contrib/xgboost/regressor.py +3 -10
maxframe/learn/contrib/xgboost/train.py +27 -16
maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
maxframe/lib/mmh3.cp38-win32.pyd +0 -0
maxframe/opcodes.py +3 -0
maxframe/protocol.py +7 -16
maxframe/remote/core.py +4 -8
maxframe/serialization/__init__.py +1 -0
maxframe/serialization/core.cp38-win32.pyd +0 -0
maxframe/session.py +9 -2
maxframe/tensor/__init__.py +10 -2
maxframe/tensor/arithmetic/isclose.py +1 -0
maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
maxframe/tensor/core.py +5 -136
maxframe/tensor/datasource/array.py +3 -0
maxframe/tensor/datasource/full.py +1 -1
maxframe/tensor/datasource/tests/test_datasource.py +1 -1
maxframe/tensor/indexing/flatnonzero.py +1 -1
maxframe/tensor/indexing/getitem.py +2 -0
maxframe/tensor/merge/__init__.py +2 -0
maxframe/tensor/merge/concatenate.py +101 -0
maxframe/tensor/merge/tests/test_merge.py +30 -1
maxframe/tensor/merge/vstack.py +74 -0
maxframe/tensor/{base → misc}/__init__.py +2 -0
maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
maxframe/tensor/misc/atleast_2d.py +70 -0
maxframe/tensor/misc/atleast_3d.py +85 -0
maxframe/tensor/misc/tests/__init__.py +13 -0
maxframe/tensor/{base → misc}/transpose.py +22 -18
maxframe/tensor/operators.py +1 -7
maxframe/tensor/random/core.py +1 -1
maxframe/tensor/reduction/count_nonzero.py +1 -0
maxframe/tensor/reduction/mean.py +1 -0
maxframe/tensor/reduction/nanmean.py +1 -0
maxframe/tensor/reduction/nanvar.py +2 -0
maxframe/tensor/reduction/tests/test_reduction.py +12 -1
maxframe/tensor/reduction/var.py +2 -0
maxframe/tensor/utils.py +2 -22
maxframe/typing_.py +4 -1
maxframe/udf.py +8 -9
maxframe/utils.py +49 -73
maxframe-1.0.0rc4.dist-info/METADATA +104 -0
{maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
{maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
maxframe_client/fetcher.py +33 -50
maxframe_client/session/consts.py +3 -0
maxframe_client/session/graph.py +8 -2
maxframe_client/session/odps.py +134 -27
maxframe_client/session/task.py +58 -20
maxframe_client/tests/test_fetcher.py +1 -1
maxframe_client/tests/test_session.py +27 -3
maxframe/core/entity/chunks.py +0 -68
maxframe/core/entity/fuse.py +0 -73
maxframe/core/graph/builder/chunk.py +0 -430
maxframe/odpsio/volumeio.py +0 -95
maxframe-1.0.0rc2.dist-info/METADATA +0 -177
/maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
/maxframe/{tensor/base/tests → io}/__init__.py +0 -0
/maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
/maxframe/tensor/{base → misc}/astype.py +0 -0
/maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
/maxframe/tensor/{base → misc}/ravel.py +0 -0
/maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
/maxframe/tensor/{base → misc}/unique.py +0 -0
/maxframe/tensor/{base → misc}/where.py +0 -0
{maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0

maxframe/core/graph/builder/base.py CHANGED Viewed

@@ -14,10 +14,10 @@
 from abc import ABC, abstractmethod
-from typing import Generator, List, Set, Union
+from typing import Generator, List, Set
 from ....typing_ import EntityType
-from ..entity import ChunkGraph, EntityGraph, TileableGraph
+from ..entity import EntityGraph
 def _default_inputs_selector(inputs: List[EntityType]) -> List[EntityType]:
@@ -43,7 +43,7 @@ class AbstractGraphBuilder(ABC):
     def _add_nodes(
         self,
-        graph: Union[ChunkGraph, TileableGraph],
+        graph: EntityGraph,
         nodes: List[EntityType],
         visited: Set,
     ):
@@ -75,7 +75,7 @@ class AbstractGraphBuilder(ABC):
                         nodes.append(out)
     @abstractmethod
-    def build(self) -> Generator[Union[EntityGraph, ChunkGraph], None, None]:
+    def build(self) -> Generator[EntityGraph, None, None]:
         """
         Build a entity graph.
@@ -84,3 +84,4 @@ class AbstractGraphBuilder(ABC):
         graph : EntityGraph
             Entity graph.
         """
+        raise NotImplementedError

maxframe/core/graph/builder/tileable.py CHANGED Viewed

@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Generator, Union
+from typing import Generator
 from ...mode import enter_mode
-from ..entity import ChunkGraph, TileableGraph
+from ..entity import TileableGraph
 from .base import AbstractGraphBuilder
@@ -26,9 +26,9 @@ class TileableGraphBuilder(AbstractGraphBuilder):
         super().__init__(graph=graph)
     @enter_mode(build=True, kernel=True)
-    def _build(self) -> Union[TileableGraph, ChunkGraph]:
+    def _build(self) -> TileableGraph:
         self._add_nodes(self._graph, list(self._graph.result_tileables), set())
         return self._graph
-    def build(self) -> Generator[Union[TileableGraph, ChunkGraph], None, None]:
+    def build(self) -> Generator[TileableGraph, None, None]:
         yield self._build()

maxframe/core/graph/builder/utils.py CHANGED Viewed

@@ -13,12 +13,11 @@
 # limitations under the License.
 import itertools
-from typing import List, Union
+from typing import List
 from ....typing_ import TileableType
 from ...mode import enter_mode
-from ..entity import ChunkGraph, TileableGraph
-from .chunk import ChunkGraphBuilder
+from ..entity import EntityGraph, TileableGraph
 from .tileable import TileableGraphBuilder
@@ -28,14 +27,11 @@ def build_graph(
     tile: bool = False,
     fuse_enabled: bool = True,
     **chunk_graph_build_kwargs
-) -> Union[TileableGraph, ChunkGraph]:
+) -> EntityGraph:
     tileables = list(itertools.chain(*(tileable.op.outputs for tileable in tileables)))
     tileable_graph = TileableGraph(tileables)
     tileable_graph_builder = TileableGraphBuilder(tileable_graph)
     tileable_graph = next(tileable_graph_builder.build())
     if not tile:
         return tileable_graph
-    chunk_graph_builder = ChunkGraphBuilder(
-        tileable_graph, fuse_enabled=fuse_enabled, **chunk_graph_build_kwargs
-    )
-    return next(chunk_graph_builder.build())
+    raise NotImplementedError

maxframe/core/graph/core.cp38-win32.pyd CHANGED Viewed

Binary file

maxframe/core/graph/entity.py CHANGED Viewed

@@ -13,9 +13,9 @@
 # limitations under the License.
 from abc import ABCMeta, abstractmethod
-from typing import Dict, Iterable, List, Union
+from typing import Dict, Iterable, List
-from ...core import Chunk, Tileable
+from ...core import Tileable
 from ...serialization.core import buffered
 from ...serialization.serializables import BoolField, DictField, ListField, Serializable
 from ...serialization.serializables.core import SerializableSerializer
@@ -97,26 +97,6 @@ class TileableGraph(EntityGraph, Iterable[Tileable]):
         return self._logic_key
-class ChunkGraph(EntityGraph, Iterable[Chunk]):
-    _result_chunks: List[Chunk]
-    def __init__(self, result_chunks: List[Chunk] = None):
-        super().__init__()
-        self._result_chunks = result_chunks
-    @property
-    def result_chunks(self):
-        return self._result_chunks
-    @property
-    def results(self):
-        return self._result_chunks
-    @results.setter
-    def results(self, new_results):
-        self._result_chunks = new_results
 class SerializableGraph(Serializable):
     _is_chunk = BoolField("is_chunk")
     # TODO(qinxuye): remove this logic when we handle fetch elegantly,
@@ -132,12 +112,11 @@ class SerializableGraph(Serializable):
     _results = ListField("results")
     @classmethod
-    def from_graph(cls, graph: Union[TileableGraph, ChunkGraph]) -> "SerializableGraph":
+    def from_graph(cls, graph: EntityGraph) -> "SerializableGraph":
         from ..operator import Fetch
-        is_chunk = isinstance(graph, ChunkGraph)
         return SerializableGraph(
-            _is_chunk=is_chunk,
+            _is_chunk=False,
             _fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)],
             _nodes=graph._nodes,
             _predecessors=graph._predecessors,
@@ -145,9 +124,8 @@ class SerializableGraph(Serializable):
             _results=graph.results,
         )
-    def to_graph(self) -> Union[TileableGraph, ChunkGraph]:
-        graph_cls = ChunkGraph if self._is_chunk else TileableGraph
-        graph = graph_cls(self._results)
+    def to_graph(self) -> EntityGraph:
+        graph = TileableGraph(self._results)
         graph._nodes.update(self._nodes)
         graph._predecessors.update(self._predecessors)
         graph._successors.update(self._successors)
@@ -156,14 +134,12 @@ class SerializableGraph(Serializable):
 class GraphSerializer(SerializableSerializer):
     @buffered
-    def serial(self, obj: Union[TileableGraph, ChunkGraph], context: Dict):
+    def serial(self, obj: EntityGraph, context: Dict):
         serializable_graph = SerializableGraph.from_graph(obj)
         return [], [serializable_graph], False
-    def deserial(
-        self, serialized: List, context: Dict, subs: List
-    ) -> Union[TileableGraph, ChunkGraph]:
-        serializable_graph: SerializableGraph = subs[0]
+    def deserial(self, serialized: List, context: Dict, subs: List) -> TileableGraph:
+        serializable_graph: EntityGraph = subs[0]
         return serializable_graph.to_graph()

maxframe/core/operator/__init__.py CHANGED Viewed

@@ -22,13 +22,6 @@ from .base import (
 )
 from .core import TileableOperatorMixin, estimate_size, execute
 from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType
-from .fuse import Fuse, FuseChunkMixin
-from .objects import (
-    MergeDictOperator,
-    ObjectFetch,
-    ObjectFuseChunk,
-    ObjectFuseChunkMixin,
-    ObjectOperator,
-    ObjectOperatorMixin,
-)
+from .objects import MergeDictOperator, ObjectFetch, ObjectOperator, ObjectOperatorMixin
 from .shuffle import MapReduceOperator, ShuffleProxy
+from .utils import add_fetch_builder, build_fetch

maxframe/core/operator/base.py CHANGED Viewed

@@ -12,11 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import functools
 import weakref
 from copy import deepcopy
 from enum import Enum
-from functools import partial
+from functools import lru_cache, partial
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 from ...serialization.core import Placeholder
@@ -37,7 +36,6 @@ from ...serialization.serializables.core import SerializableSerializer
 from ...typing_ import OperatorType
 from ...utils import AttributeDict, classproperty, get_user_call_point, tokenize
 from ..base import Base
-from ..entity.chunks import Chunk
 from ..entity.core import ENTITY_TYPE, Entity, EntityData
 from ..entity.output_types import OutputType
 from ..entity.tileables import Tileable
@@ -90,7 +88,7 @@ class SchedulingHint(Serializable):
     priority = Int32Field("priority", default=None)
     @classproperty
-    @functools.lru_cache(1)
+    @lru_cache(1)
     def all_hint_names(cls):
         return list(cls._FIELDS)
@@ -341,7 +339,7 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
             raise ValueError("Outputs' size exceeds limitation")
     @property
-    def outputs(self) -> List[Union[Chunk, Tileable]]:
+    def outputs(self) -> List[Tileable]:
         outputs = self._outputs
         if outputs:
             return [ref() for ref in outputs]

maxframe/core/operator/objects.py CHANGED Viewed

@@ -17,7 +17,6 @@ from ..entity import OutputType, register_fetch_class
 from .base import Operator
 from .core import TileableOperatorMixin
 from .fetch import Fetch, FetchMixin
-from .fuse import Fuse, FuseChunkMixin
 class ObjectOperator(Operator):
@@ -28,14 +27,6 @@ class ObjectOperatorMixin(TileableOperatorMixin):
     _output_type_ = OutputType.object
-class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperatorMixin):
-    __slots__ = ()
-class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse):
-    pass
 class ObjectFetch(FetchMixin, ObjectOperatorMixin, Fetch):
     _output_type_ = OutputType.object

maxframe/core/operator/utils.py ADDED Viewed

@@ -0,0 +1,55 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...typing_ import EntityType, TileableType
+from ..entity import TILEABLE_TYPE
+def build_fetch_tileable(tileable: TileableType) -> TileableType:
+    if tileable.is_coarse():
+        chunks = None
+    else:
+        chunks = []
+        for c in tileable.chunks:
+            fetch_chunk = build_fetch(c, index=c.index)
+            chunks.append(fetch_chunk)
+    tileable_op = tileable.op
+    params = tileable.params.copy()
+    new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id)
+    return new_op.new_tileables(
+        None,
+        chunks=chunks,
+        nsplits=tileable.nsplits,
+        _key=tileable.key,
+        _id=tileable.id,
+        **params,
+    )[0]
+_type_to_builder = [
+    (TILEABLE_TYPE, build_fetch_tileable),
+]
+def build_fetch(entity: EntityType, **kw) -> EntityType:
+    for entity_types, func in _type_to_builder:
+        if isinstance(entity, entity_types):
+            return func(entity, **kw)
+    raise TypeError(f"Type {type(entity)} not supported")
+def add_fetch_builder(entity_type, builder_func):
+    _type_to_builder.append((entity_type, builder_func))

maxframe/dataframe/arithmetic/docstring.py CHANGED Viewed

@@ -185,7 +185,6 @@ e    NaN
 dtype: float64
 """
-# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
 _flex_comp_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
@@ -291,7 +290,7 @@ C   True    False
 Compare to a DataFrame of different shape.
->>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}},
+>>> other = md.DataFrame({{'revenue': [300, 250, 100, 150]}},
 ...                      index=['A', 'B', 'C', 'D'])
 >>> other.execute()
    revenue
@@ -306,6 +305,31 @@ A  False    False
 B  False    False
 C  False     True
 D  False    False
+Compare to a MultiIndex by level.
+>>> df_multindex = md.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
+...                              'revenue': [100, 250, 300, 200, 175, 225]}},
+...                             index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+...                                    ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex.execute()
+      cost  revenue
+Q1 A   250      100
+   B   150      250
+   C   100      300
+Q2 A   150      200
+   B   300      175
+   C   220      225
+>>> df.le(df_multindex, level=1).execute()
+       cost  revenue
+Q1 A   True     True
+   B   True     True
+   C   True     True
+Q2 A  False     True
+   B   True    False
+   C   True    False
 """

maxframe/dataframe/arithmetic/equal.py CHANGED Viewed

@@ -51,6 +51,8 @@ dtype: bool
 @bin_compare_doc("Equal to", equiv="==", series_example=_eq_example)
-def eq(df, other, axis="columns", level=None):
-    op = DataFrameEqual(axis=axis, level=level, lhs=df, rhs=other)
+def eq(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)

maxframe/dataframe/arithmetic/greater.py CHANGED Viewed

@@ -52,6 +52,8 @@ dtype: bool
 @bin_compare_doc("Greater than", equiv=">", series_example=_gt_example)
-def gt(df, other, axis="columns", level=None):
-    op = DataFrameGreater(axis=axis, level=level, lhs=df, rhs=other)
+def gt(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameGreater(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)

maxframe/dataframe/arithmetic/greater_equal.py CHANGED Viewed

@@ -52,6 +52,8 @@ dtype: bool
 @bin_compare_doc("Greater than or equal to", equiv=">=", series_example=_ge_example)
-def ge(df, other, axis="columns", level=None):
-    op = DataFrameGreaterEqual(axis=axis, level=level, lhs=df, rhs=other)
+def ge(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameGreaterEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)

maxframe/dataframe/arithmetic/less.py CHANGED Viewed

@@ -52,6 +52,6 @@ dtype: bool
 @bin_compare_doc("Less than", equiv="<", series_example=_lt_example)
-def lt(df, other, axis="columns", level=None):
-    op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other)
+def lt(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value)
     return op(df, other)

maxframe/dataframe/arithmetic/less_equal.py CHANGED Viewed

@@ -52,6 +52,8 @@ dtype: bool
 @bin_compare_doc("Less than or equal to", equiv="<=", series_example=_le_example)
-def le(df, other, axis="columns", level=None):
-    op = DataFrameLessEqual(axis=axis, level=level, lhs=df, rhs=other)
+def le(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameLessEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)

maxframe/dataframe/arithmetic/not_equal.py CHANGED Viewed

@@ -51,6 +51,8 @@ dtype: bool
 @bin_compare_doc("Not equal to", equiv="!=", series_example=_ne_example)
-def ne(df, other, axis="columns", level=None):
-    op = DataFrameNotEqual(axis=axis, level=level, lhs=df, rhs=other)
+def ne(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameNotEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)

maxframe/dataframe/core.py CHANGED Viewed

@@ -1666,6 +1666,8 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
             raise NotImplementedError
         corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])
+        if corner_data is None:
+            return
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")

maxframe/dataframe/datasource/read_odps_query.py CHANGED Viewed

@@ -13,6 +13,7 @@
 # limitations under the License.
 import dataclasses
+import logging
 import re
 from typing import Dict, List, Optional, Tuple, Union
@@ -22,12 +23,14 @@ from odps import ODPS
 from odps.types import Column, OdpsSchema, validate_data_type
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...core.graph import DAG
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+logger = logging.getLogger(__name__)
+_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
 _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
 _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
 _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,8 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
-_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
+_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")
 @dataclasses.dataclass
@@ -152,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
     return TaskSector(job_name, task_name, out_target, schemas)
-def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+def _parse_full_explain(explain_string: str) -> OdpsSchema:
     sectors = _split_explain_string(explain_string)
     jobs_sector = tasks_sector = None
@@ -191,6 +201,25 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
     return OdpsSchema(cols)
+def _parse_simple_explain(explain_string: str) -> OdpsSchema:
+    fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
+    if not fields_match:
+        raise ValueError("Cannot detect output table schema")
+    fields_str = fields_match.group(1)
+    cols = []
+    for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
+        cols.append(Column(field, validate_data_type(type_name)))
+    return OdpsSchema(cols)
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    if explain_string.startswith("AdhocSink"):
+        return _parse_simple_explain(explain_string)
+    else:
+        return _parse_full_explain(explain_string)
 class DataFrameReadODPSQuery(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
@@ -205,6 +234,7 @@ class DataFrameReadODPSQuery(
     string_as_binary = BoolField("string_as_binary", default=None)
     index_columns = ListField("index_columns", FieldTypes.string, default=None)
     index_dtypes = SeriesField("index_dtypes", default=None)
+    column_renames = DictField("column_renames", default=None)
     def get_columns(self):
         return self.columns
@@ -246,6 +276,8 @@ def read_odps_query(
     odps_entry: ODPS = None,
     index_col: Union[None, str, List[str]] = None,
     string_as_binary: bool = None,
+    sql_hints: Dict[str, str] = None,
+    anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
     **kw,
 ):
     """
@@ -260,25 +292,51 @@ def read_odps_query(
         MaxCompute SQL statement.
     index_col: Union[None, str, List[str]]
         Columns to be specified as indexes.
+    string_as_binary: bool, optional
+        Whether to convert string columns to binary.
+    sql_hints: Dict[str, str], optional
+        User specified SQL hints.
+    anonymous_col_prefix: str, optional
+        Prefix for anonymous columns, '_anon_col_' by default.
     Returns
     -------
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
+    hints = options.sql.settings.copy() or {}
+    if sql_hints:
+        hints.update(sql_hints)
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
+        hints["odps.namespace.schema"] = "true"
+        hints["odps.sql.allow.namespace.schema"] = "true"
+    # fixme workaround for multi-stage split process
+    hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
+    inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+    logger.debug("Explain instance ID: %s", inst.id)
     explain_str = list(inst.get_task_results().values())[0]
     odps_schema = _parse_explained_schema(explain_str)
+    new_columns = []
+    col_renames = {}
     for col in odps_schema.columns:
-        if _ANONYMOUS_COL_REGEX.match(col.name) and col.name not in query:
-            raise ValueError("Need to specify names for all columns in SELECT clause.")
+        anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+        if anon_match and col.name not in query:
+            new_name = anonymous_col_prefix + anon_match.group(1)
+            col_renames[col.name] = new_name
+            new_columns.append(Column(new_name, col.type))
+        else:
+            new_columns.append(col)
-    dtypes = odps_schema_to_pandas_dtypes(odps_schema)
+    dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
     if not index_col:
         index_dtypes = None
@@ -301,5 +359,6 @@ def read_odps_query(
         string_as_binary=string_as_binary,
         index_columns=index_col,
         index_dtypes=index_dtypes,
+        column_renames=col_renames,
     )
     return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)

maxframe/dataframe/datasource/read_odps_table.py CHANGED Viewed

@@ -22,8 +22,9 @@ from odps.models import Table
 from odps.utils import to_timestamp
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
@@ -167,12 +168,13 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    schema = options.session.default_schema or odps_entry.schema
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
-        table = odps_entry.get_table(table_name)
+        table = odps_entry.get_table(table_name, schema=schema)
     if not table.table_schema.partitions and (
         partitions is not None or append_partitions

maxframe 1.0.0rc2__cp38-cp38-win32.whl → 1.0.0rc4__cp38-cp38-win32.whl

Potentially problematic release.

maxframe 1.0.0rc2__cp38-cp38-win32.whl → 1.0.0rc4__cp38-cp38-win32.whl