maxframe 1.0.0rc2__cp38-cp38-win32.whl → 1.0.0rc4__cp38-cp38-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp38-win32.pyd +0 -0
- maxframe/codegen.py +4 -2
- maxframe/config/config.py +28 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +56 -14
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp38-win32.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/core.py +2 -0
- maxframe/dataframe/datasource/read_odps_query.py +67 -8
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
- maxframe/dataframe/datastore/to_odps.py +8 -1
- maxframe/dataframe/extensions/__init__.py +3 -0
- maxframe/dataframe/extensions/flatmap.py +326 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/misc/drop_duplicates.py +18 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +10 -8
- maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +2 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +27 -44
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -16
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp38-win32.pyd +0 -0
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +7 -16
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp38-win32.pyd +0 -0
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +49 -73
- maxframe-1.0.0rc4.dist-info/METADATA +104 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +33 -50
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +134 -27
- maxframe_client/session/task.py +58 -20
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +27 -3
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- maxframe-1.0.0rc2.dist-info/METADATA +0 -177
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
|
@@ -14,10 +14,10 @@
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
from abc import ABC, abstractmethod
|
|
17
|
-
from typing import Generator, List, Set
|
|
17
|
+
from typing import Generator, List, Set
|
|
18
18
|
|
|
19
19
|
from ....typing_ import EntityType
|
|
20
|
-
from ..entity import
|
|
20
|
+
from ..entity import EntityGraph
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def _default_inputs_selector(inputs: List[EntityType]) -> List[EntityType]:
|
|
@@ -43,7 +43,7 @@ class AbstractGraphBuilder(ABC):
|
|
|
43
43
|
|
|
44
44
|
def _add_nodes(
|
|
45
45
|
self,
|
|
46
|
-
graph:
|
|
46
|
+
graph: EntityGraph,
|
|
47
47
|
nodes: List[EntityType],
|
|
48
48
|
visited: Set,
|
|
49
49
|
):
|
|
@@ -75,7 +75,7 @@ class AbstractGraphBuilder(ABC):
|
|
|
75
75
|
nodes.append(out)
|
|
76
76
|
|
|
77
77
|
@abstractmethod
|
|
78
|
-
def build(self) -> Generator[
|
|
78
|
+
def build(self) -> Generator[EntityGraph, None, None]:
|
|
79
79
|
"""
|
|
80
80
|
Build a entity graph.
|
|
81
81
|
|
|
@@ -84,3 +84,4 @@ class AbstractGraphBuilder(ABC):
|
|
|
84
84
|
graph : EntityGraph
|
|
85
85
|
Entity graph.
|
|
86
86
|
"""
|
|
87
|
+
raise NotImplementedError
|
|
@@ -12,10 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from typing import Generator
|
|
15
|
+
from typing import Generator
|
|
16
16
|
|
|
17
17
|
from ...mode import enter_mode
|
|
18
|
-
from ..entity import
|
|
18
|
+
from ..entity import TileableGraph
|
|
19
19
|
from .base import AbstractGraphBuilder
|
|
20
20
|
|
|
21
21
|
|
|
@@ -26,9 +26,9 @@ class TileableGraphBuilder(AbstractGraphBuilder):
|
|
|
26
26
|
super().__init__(graph=graph)
|
|
27
27
|
|
|
28
28
|
@enter_mode(build=True, kernel=True)
|
|
29
|
-
def _build(self) ->
|
|
29
|
+
def _build(self) -> TileableGraph:
|
|
30
30
|
self._add_nodes(self._graph, list(self._graph.result_tileables), set())
|
|
31
31
|
return self._graph
|
|
32
32
|
|
|
33
|
-
def build(self) -> Generator[
|
|
33
|
+
def build(self) -> Generator[TileableGraph, None, None]:
|
|
34
34
|
yield self._build()
|
|
@@ -13,12 +13,11 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import itertools
|
|
16
|
-
from typing import List
|
|
16
|
+
from typing import List
|
|
17
17
|
|
|
18
18
|
from ....typing_ import TileableType
|
|
19
19
|
from ...mode import enter_mode
|
|
20
|
-
from ..entity import
|
|
21
|
-
from .chunk import ChunkGraphBuilder
|
|
20
|
+
from ..entity import EntityGraph, TileableGraph
|
|
22
21
|
from .tileable import TileableGraphBuilder
|
|
23
22
|
|
|
24
23
|
|
|
@@ -28,14 +27,11 @@ def build_graph(
|
|
|
28
27
|
tile: bool = False,
|
|
29
28
|
fuse_enabled: bool = True,
|
|
30
29
|
**chunk_graph_build_kwargs
|
|
31
|
-
) ->
|
|
30
|
+
) -> EntityGraph:
|
|
32
31
|
tileables = list(itertools.chain(*(tileable.op.outputs for tileable in tileables)))
|
|
33
32
|
tileable_graph = TileableGraph(tileables)
|
|
34
33
|
tileable_graph_builder = TileableGraphBuilder(tileable_graph)
|
|
35
34
|
tileable_graph = next(tileable_graph_builder.build())
|
|
36
35
|
if not tile:
|
|
37
36
|
return tileable_graph
|
|
38
|
-
|
|
39
|
-
tileable_graph, fuse_enabled=fuse_enabled, **chunk_graph_build_kwargs
|
|
40
|
-
)
|
|
41
|
-
return next(chunk_graph_builder.build())
|
|
37
|
+
raise NotImplementedError
|
|
Binary file
|
maxframe/core/graph/entity.py
CHANGED
|
@@ -13,9 +13,9 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from abc import ABCMeta, abstractmethod
|
|
16
|
-
from typing import Dict, Iterable, List
|
|
16
|
+
from typing import Dict, Iterable, List
|
|
17
17
|
|
|
18
|
-
from ...core import
|
|
18
|
+
from ...core import Tileable
|
|
19
19
|
from ...serialization.core import buffered
|
|
20
20
|
from ...serialization.serializables import BoolField, DictField, ListField, Serializable
|
|
21
21
|
from ...serialization.serializables.core import SerializableSerializer
|
|
@@ -97,26 +97,6 @@ class TileableGraph(EntityGraph, Iterable[Tileable]):
|
|
|
97
97
|
return self._logic_key
|
|
98
98
|
|
|
99
99
|
|
|
100
|
-
class ChunkGraph(EntityGraph, Iterable[Chunk]):
|
|
101
|
-
_result_chunks: List[Chunk]
|
|
102
|
-
|
|
103
|
-
def __init__(self, result_chunks: List[Chunk] = None):
|
|
104
|
-
super().__init__()
|
|
105
|
-
self._result_chunks = result_chunks
|
|
106
|
-
|
|
107
|
-
@property
|
|
108
|
-
def result_chunks(self):
|
|
109
|
-
return self._result_chunks
|
|
110
|
-
|
|
111
|
-
@property
|
|
112
|
-
def results(self):
|
|
113
|
-
return self._result_chunks
|
|
114
|
-
|
|
115
|
-
@results.setter
|
|
116
|
-
def results(self, new_results):
|
|
117
|
-
self._result_chunks = new_results
|
|
118
|
-
|
|
119
|
-
|
|
120
100
|
class SerializableGraph(Serializable):
|
|
121
101
|
_is_chunk = BoolField("is_chunk")
|
|
122
102
|
# TODO(qinxuye): remove this logic when we handle fetch elegantly,
|
|
@@ -132,12 +112,11 @@ class SerializableGraph(Serializable):
|
|
|
132
112
|
_results = ListField("results")
|
|
133
113
|
|
|
134
114
|
@classmethod
|
|
135
|
-
def from_graph(cls, graph:
|
|
115
|
+
def from_graph(cls, graph: EntityGraph) -> "SerializableGraph":
|
|
136
116
|
from ..operator import Fetch
|
|
137
117
|
|
|
138
|
-
is_chunk = isinstance(graph, ChunkGraph)
|
|
139
118
|
return SerializableGraph(
|
|
140
|
-
_is_chunk=
|
|
119
|
+
_is_chunk=False,
|
|
141
120
|
_fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)],
|
|
142
121
|
_nodes=graph._nodes,
|
|
143
122
|
_predecessors=graph._predecessors,
|
|
@@ -145,9 +124,8 @@ class SerializableGraph(Serializable):
|
|
|
145
124
|
_results=graph.results,
|
|
146
125
|
)
|
|
147
126
|
|
|
148
|
-
def to_graph(self) ->
|
|
149
|
-
|
|
150
|
-
graph = graph_cls(self._results)
|
|
127
|
+
def to_graph(self) -> EntityGraph:
|
|
128
|
+
graph = TileableGraph(self._results)
|
|
151
129
|
graph._nodes.update(self._nodes)
|
|
152
130
|
graph._predecessors.update(self._predecessors)
|
|
153
131
|
graph._successors.update(self._successors)
|
|
@@ -156,14 +134,12 @@ class SerializableGraph(Serializable):
|
|
|
156
134
|
|
|
157
135
|
class GraphSerializer(SerializableSerializer):
|
|
158
136
|
@buffered
|
|
159
|
-
def serial(self, obj:
|
|
137
|
+
def serial(self, obj: EntityGraph, context: Dict):
|
|
160
138
|
serializable_graph = SerializableGraph.from_graph(obj)
|
|
161
139
|
return [], [serializable_graph], False
|
|
162
140
|
|
|
163
|
-
def deserial(
|
|
164
|
-
|
|
165
|
-
) -> Union[TileableGraph, ChunkGraph]:
|
|
166
|
-
serializable_graph: SerializableGraph = subs[0]
|
|
141
|
+
def deserial(self, serialized: List, context: Dict, subs: List) -> TileableGraph:
|
|
142
|
+
serializable_graph: EntityGraph = subs[0]
|
|
167
143
|
return serializable_graph.to_graph()
|
|
168
144
|
|
|
169
145
|
|
|
@@ -22,13 +22,6 @@ from .base import (
|
|
|
22
22
|
)
|
|
23
23
|
from .core import TileableOperatorMixin, estimate_size, execute
|
|
24
24
|
from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType
|
|
25
|
-
from .
|
|
26
|
-
from .objects import (
|
|
27
|
-
MergeDictOperator,
|
|
28
|
-
ObjectFetch,
|
|
29
|
-
ObjectFuseChunk,
|
|
30
|
-
ObjectFuseChunkMixin,
|
|
31
|
-
ObjectOperator,
|
|
32
|
-
ObjectOperatorMixin,
|
|
33
|
-
)
|
|
25
|
+
from .objects import MergeDictOperator, ObjectFetch, ObjectOperator, ObjectOperatorMixin
|
|
34
26
|
from .shuffle import MapReduceOperator, ShuffleProxy
|
|
27
|
+
from .utils import add_fetch_builder, build_fetch
|
maxframe/core/operator/base.py
CHANGED
|
@@ -12,11 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import functools
|
|
16
15
|
import weakref
|
|
17
16
|
from copy import deepcopy
|
|
18
17
|
from enum import Enum
|
|
19
|
-
from functools import partial
|
|
18
|
+
from functools import lru_cache, partial
|
|
20
19
|
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
|
21
20
|
|
|
22
21
|
from ...serialization.core import Placeholder
|
|
@@ -37,7 +36,6 @@ from ...serialization.serializables.core import SerializableSerializer
|
|
|
37
36
|
from ...typing_ import OperatorType
|
|
38
37
|
from ...utils import AttributeDict, classproperty, get_user_call_point, tokenize
|
|
39
38
|
from ..base import Base
|
|
40
|
-
from ..entity.chunks import Chunk
|
|
41
39
|
from ..entity.core import ENTITY_TYPE, Entity, EntityData
|
|
42
40
|
from ..entity.output_types import OutputType
|
|
43
41
|
from ..entity.tileables import Tileable
|
|
@@ -90,7 +88,7 @@ class SchedulingHint(Serializable):
|
|
|
90
88
|
priority = Int32Field("priority", default=None)
|
|
91
89
|
|
|
92
90
|
@classproperty
|
|
93
|
-
@
|
|
91
|
+
@lru_cache(1)
|
|
94
92
|
def all_hint_names(cls):
|
|
95
93
|
return list(cls._FIELDS)
|
|
96
94
|
|
|
@@ -341,7 +339,7 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
|
|
|
341
339
|
raise ValueError("Outputs' size exceeds limitation")
|
|
342
340
|
|
|
343
341
|
@property
|
|
344
|
-
def outputs(self) -> List[
|
|
342
|
+
def outputs(self) -> List[Tileable]:
|
|
345
343
|
outputs = self._outputs
|
|
346
344
|
if outputs:
|
|
347
345
|
return [ref() for ref in outputs]
|
|
@@ -17,7 +17,6 @@ from ..entity import OutputType, register_fetch_class
|
|
|
17
17
|
from .base import Operator
|
|
18
18
|
from .core import TileableOperatorMixin
|
|
19
19
|
from .fetch import Fetch, FetchMixin
|
|
20
|
-
from .fuse import Fuse, FuseChunkMixin
|
|
21
20
|
|
|
22
21
|
|
|
23
22
|
class ObjectOperator(Operator):
|
|
@@ -28,14 +27,6 @@ class ObjectOperatorMixin(TileableOperatorMixin):
|
|
|
28
27
|
_output_type_ = OutputType.object
|
|
29
28
|
|
|
30
29
|
|
|
31
|
-
class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperatorMixin):
|
|
32
|
-
__slots__ = ()
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse):
|
|
36
|
-
pass
|
|
37
|
-
|
|
38
|
-
|
|
39
30
|
class ObjectFetch(FetchMixin, ObjectOperatorMixin, Fetch):
|
|
40
31
|
_output_type_ = OutputType.object
|
|
41
32
|
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ...typing_ import EntityType, TileableType
|
|
16
|
+
from ..entity import TILEABLE_TYPE
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def build_fetch_tileable(tileable: TileableType) -> TileableType:
|
|
20
|
+
if tileable.is_coarse():
|
|
21
|
+
chunks = None
|
|
22
|
+
else:
|
|
23
|
+
chunks = []
|
|
24
|
+
for c in tileable.chunks:
|
|
25
|
+
fetch_chunk = build_fetch(c, index=c.index)
|
|
26
|
+
chunks.append(fetch_chunk)
|
|
27
|
+
|
|
28
|
+
tileable_op = tileable.op
|
|
29
|
+
params = tileable.params.copy()
|
|
30
|
+
|
|
31
|
+
new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id)
|
|
32
|
+
return new_op.new_tileables(
|
|
33
|
+
None,
|
|
34
|
+
chunks=chunks,
|
|
35
|
+
nsplits=tileable.nsplits,
|
|
36
|
+
_key=tileable.key,
|
|
37
|
+
_id=tileable.id,
|
|
38
|
+
**params,
|
|
39
|
+
)[0]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
_type_to_builder = [
|
|
43
|
+
(TILEABLE_TYPE, build_fetch_tileable),
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def build_fetch(entity: EntityType, **kw) -> EntityType:
|
|
48
|
+
for entity_types, func in _type_to_builder:
|
|
49
|
+
if isinstance(entity, entity_types):
|
|
50
|
+
return func(entity, **kw)
|
|
51
|
+
raise TypeError(f"Type {type(entity)} not supported")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def add_fetch_builder(entity_type, builder_func):
|
|
55
|
+
_type_to_builder.append((entity_type, builder_func))
|
|
@@ -185,7 +185,6 @@ e NaN
|
|
|
185
185
|
dtype: float64
|
|
186
186
|
"""
|
|
187
187
|
|
|
188
|
-
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
|
|
189
188
|
_flex_comp_doc_FRAME = """
|
|
190
189
|
Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
|
|
191
190
|
Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
|
|
@@ -291,7 +290,7 @@ C True False
|
|
|
291
290
|
|
|
292
291
|
Compare to a DataFrame of different shape.
|
|
293
292
|
|
|
294
|
-
>>> other =
|
|
293
|
+
>>> other = md.DataFrame({{'revenue': [300, 250, 100, 150]}},
|
|
295
294
|
... index=['A', 'B', 'C', 'D'])
|
|
296
295
|
>>> other.execute()
|
|
297
296
|
revenue
|
|
@@ -306,6 +305,31 @@ A False False
|
|
|
306
305
|
B False False
|
|
307
306
|
C False True
|
|
308
307
|
D False False
|
|
308
|
+
|
|
309
|
+
Compare to a MultiIndex by level.
|
|
310
|
+
|
|
311
|
+
>>> df_multindex = md.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
|
|
312
|
+
... 'revenue': [100, 250, 300, 200, 175, 225]}},
|
|
313
|
+
... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
|
|
314
|
+
... ['A', 'B', 'C', 'A', 'B', 'C']])
|
|
315
|
+
>>> df_multindex.execute()
|
|
316
|
+
cost revenue
|
|
317
|
+
Q1 A 250 100
|
|
318
|
+
B 150 250
|
|
319
|
+
C 100 300
|
|
320
|
+
Q2 A 150 200
|
|
321
|
+
B 300 175
|
|
322
|
+
C 220 225
|
|
323
|
+
|
|
324
|
+
>>> df.le(df_multindex, level=1).execute()
|
|
325
|
+
cost revenue
|
|
326
|
+
Q1 A True True
|
|
327
|
+
B True True
|
|
328
|
+
C True True
|
|
329
|
+
Q2 A False True
|
|
330
|
+
B True False
|
|
331
|
+
C True False
|
|
332
|
+
|
|
309
333
|
"""
|
|
310
334
|
|
|
311
335
|
|
|
@@ -51,6 +51,8 @@ dtype: bool
|
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@bin_compare_doc("Equal to", equiv="==", series_example=_eq_example)
|
|
54
|
-
def eq(df, other, axis="columns", level=None):
|
|
55
|
-
op = DataFrameEqual(
|
|
54
|
+
def eq(df, other, axis="columns", level=None, fill_value=None):
|
|
55
|
+
op = DataFrameEqual(
|
|
56
|
+
axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
|
|
57
|
+
)
|
|
56
58
|
return op(df, other)
|
|
@@ -52,6 +52,8 @@ dtype: bool
|
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
@bin_compare_doc("Greater than", equiv=">", series_example=_gt_example)
|
|
55
|
-
def gt(df, other, axis="columns", level=None):
|
|
56
|
-
op = DataFrameGreater(
|
|
55
|
+
def gt(df, other, axis="columns", level=None, fill_value=None):
|
|
56
|
+
op = DataFrameGreater(
|
|
57
|
+
axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
|
|
58
|
+
)
|
|
57
59
|
return op(df, other)
|
|
@@ -52,6 +52,8 @@ dtype: bool
|
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
@bin_compare_doc("Greater than or equal to", equiv=">=", series_example=_ge_example)
|
|
55
|
-
def ge(df, other, axis="columns", level=None):
|
|
56
|
-
op = DataFrameGreaterEqual(
|
|
55
|
+
def ge(df, other, axis="columns", level=None, fill_value=None):
|
|
56
|
+
op = DataFrameGreaterEqual(
|
|
57
|
+
axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
|
|
58
|
+
)
|
|
57
59
|
return op(df, other)
|
|
@@ -52,6 +52,6 @@ dtype: bool
|
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
@bin_compare_doc("Less than", equiv="<", series_example=_lt_example)
|
|
55
|
-
def lt(df, other, axis="columns", level=None):
|
|
56
|
-
op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other)
|
|
55
|
+
def lt(df, other, axis="columns", level=None, fill_value=None):
|
|
56
|
+
op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value)
|
|
57
57
|
return op(df, other)
|
|
@@ -52,6 +52,8 @@ dtype: bool
|
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
@bin_compare_doc("Less than or equal to", equiv="<=", series_example=_le_example)
|
|
55
|
-
def le(df, other, axis="columns", level=None):
|
|
56
|
-
op = DataFrameLessEqual(
|
|
55
|
+
def le(df, other, axis="columns", level=None, fill_value=None):
|
|
56
|
+
op = DataFrameLessEqual(
|
|
57
|
+
axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
|
|
58
|
+
)
|
|
57
59
|
return op(df, other)
|
|
@@ -51,6 +51,8 @@ dtype: bool
|
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@bin_compare_doc("Not equal to", equiv="!=", series_example=_ne_example)
|
|
54
|
-
def ne(df, other, axis="columns", level=None):
|
|
55
|
-
op = DataFrameNotEqual(
|
|
54
|
+
def ne(df, other, axis="columns", level=None, fill_value=None):
|
|
55
|
+
op = DataFrameNotEqual(
|
|
56
|
+
axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
|
|
57
|
+
)
|
|
56
58
|
return op(df, other)
|
maxframe/dataframe/core.py
CHANGED
|
@@ -1666,6 +1666,8 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
|
|
|
1666
1666
|
raise NotImplementedError
|
|
1667
1667
|
|
|
1668
1668
|
corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])
|
|
1669
|
+
if corner_data is None:
|
|
1670
|
+
return
|
|
1669
1671
|
|
|
1670
1672
|
buf = StringIO()
|
|
1671
1673
|
max_rows = pd.get_option("display.max_rows")
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import dataclasses
|
|
16
|
+
import logging
|
|
16
17
|
import re
|
|
17
18
|
from typing import Dict, List, Optional, Tuple, Union
|
|
18
19
|
|
|
@@ -22,12 +23,14 @@ from odps import ODPS
|
|
|
22
23
|
from odps.types import Column, OdpsSchema, validate_data_type
|
|
23
24
|
|
|
24
25
|
from ... import opcodes
|
|
26
|
+
from ...config import options
|
|
25
27
|
from ...core import OutputType
|
|
26
28
|
from ...core.graph import DAG
|
|
27
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
29
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
28
30
|
from ...serialization.serializables import (
|
|
29
31
|
AnyField,
|
|
30
32
|
BoolField,
|
|
33
|
+
DictField,
|
|
31
34
|
FieldTypes,
|
|
32
35
|
Int64Field,
|
|
33
36
|
ListField,
|
|
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
|
|
|
37
40
|
from ..utils import parse_index
|
|
38
41
|
from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
|
|
39
42
|
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
|
|
46
|
+
|
|
40
47
|
_EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
|
|
41
48
|
_EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
|
|
42
49
|
_EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
|
|
@@ -46,8 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
|
|
|
46
53
|
r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
|
|
47
54
|
re.MULTILINE,
|
|
48
55
|
)
|
|
49
|
-
_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([
|
|
50
|
-
_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d
|
|
56
|
+
_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
|
|
57
|
+
_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
|
|
58
|
+
|
|
59
|
+
_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
|
|
60
|
+
_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")
|
|
51
61
|
|
|
52
62
|
|
|
53
63
|
@dataclasses.dataclass
|
|
@@ -152,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
|
|
|
152
162
|
return TaskSector(job_name, task_name, out_target, schemas)
|
|
153
163
|
|
|
154
164
|
|
|
155
|
-
def
|
|
165
|
+
def _parse_full_explain(explain_string: str) -> OdpsSchema:
|
|
156
166
|
sectors = _split_explain_string(explain_string)
|
|
157
167
|
jobs_sector = tasks_sector = None
|
|
158
168
|
|
|
@@ -191,6 +201,25 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
|
|
|
191
201
|
return OdpsSchema(cols)
|
|
192
202
|
|
|
193
203
|
|
|
204
|
+
def _parse_simple_explain(explain_string: str) -> OdpsSchema:
|
|
205
|
+
fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
|
|
206
|
+
if not fields_match:
|
|
207
|
+
raise ValueError("Cannot detect output table schema")
|
|
208
|
+
|
|
209
|
+
fields_str = fields_match.group(1)
|
|
210
|
+
cols = []
|
|
211
|
+
for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
|
|
212
|
+
cols.append(Column(field, validate_data_type(type_name)))
|
|
213
|
+
return OdpsSchema(cols)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _parse_explained_schema(explain_string: str) -> OdpsSchema:
|
|
217
|
+
if explain_string.startswith("AdhocSink"):
|
|
218
|
+
return _parse_simple_explain(explain_string)
|
|
219
|
+
else:
|
|
220
|
+
return _parse_full_explain(explain_string)
|
|
221
|
+
|
|
222
|
+
|
|
194
223
|
class DataFrameReadODPSQuery(
|
|
195
224
|
IncrementalIndexDatasource,
|
|
196
225
|
ColumnPruneSupportedDataSourceMixin,
|
|
@@ -205,6 +234,7 @@ class DataFrameReadODPSQuery(
|
|
|
205
234
|
string_as_binary = BoolField("string_as_binary", default=None)
|
|
206
235
|
index_columns = ListField("index_columns", FieldTypes.string, default=None)
|
|
207
236
|
index_dtypes = SeriesField("index_dtypes", default=None)
|
|
237
|
+
column_renames = DictField("column_renames", default=None)
|
|
208
238
|
|
|
209
239
|
def get_columns(self):
|
|
210
240
|
return self.columns
|
|
@@ -246,6 +276,8 @@ def read_odps_query(
|
|
|
246
276
|
odps_entry: ODPS = None,
|
|
247
277
|
index_col: Union[None, str, List[str]] = None,
|
|
248
278
|
string_as_binary: bool = None,
|
|
279
|
+
sql_hints: Dict[str, str] = None,
|
|
280
|
+
anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
|
|
249
281
|
**kw,
|
|
250
282
|
):
|
|
251
283
|
"""
|
|
@@ -260,25 +292,51 @@ def read_odps_query(
|
|
|
260
292
|
MaxCompute SQL statement.
|
|
261
293
|
index_col: Union[None, str, List[str]]
|
|
262
294
|
Columns to be specified as indexes.
|
|
295
|
+
string_as_binary: bool, optional
|
|
296
|
+
Whether to convert string columns to binary.
|
|
297
|
+
sql_hints: Dict[str, str], optional
|
|
298
|
+
User specified SQL hints.
|
|
299
|
+
anonymous_col_prefix: str, optional
|
|
300
|
+
Prefix for anonymous columns, '_anon_col_' by default.
|
|
263
301
|
|
|
264
302
|
Returns
|
|
265
303
|
-------
|
|
266
304
|
result: DataFrame
|
|
267
305
|
DataFrame read from MaxCompute (ODPS) table
|
|
268
306
|
"""
|
|
307
|
+
hints = options.sql.settings.copy() or {}
|
|
308
|
+
if sql_hints:
|
|
309
|
+
hints.update(sql_hints)
|
|
310
|
+
|
|
269
311
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
312
|
+
|
|
313
|
+
if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
|
|
314
|
+
hints["odps.namespace.schema"] = "true"
|
|
315
|
+
hints["odps.sql.allow.namespace.schema"] = "true"
|
|
316
|
+
|
|
317
|
+
# fixme workaround for multi-stage split process
|
|
318
|
+
hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
|
|
319
|
+
|
|
270
320
|
if odps_entry is None:
|
|
271
321
|
raise ValueError("Missing odps_entry parameter")
|
|
272
|
-
inst = odps_entry.execute_sql(f"EXPLAIN {query}")
|
|
322
|
+
inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
|
|
323
|
+
logger.debug("Explain instance ID: %s", inst.id)
|
|
273
324
|
explain_str = list(inst.get_task_results().values())[0]
|
|
274
325
|
|
|
275
326
|
odps_schema = _parse_explained_schema(explain_str)
|
|
276
327
|
|
|
328
|
+
new_columns = []
|
|
329
|
+
col_renames = {}
|
|
277
330
|
for col in odps_schema.columns:
|
|
278
|
-
|
|
279
|
-
|
|
331
|
+
anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
|
|
332
|
+
if anon_match and col.name not in query:
|
|
333
|
+
new_name = anonymous_col_prefix + anon_match.group(1)
|
|
334
|
+
col_renames[col.name] = new_name
|
|
335
|
+
new_columns.append(Column(new_name, col.type))
|
|
336
|
+
else:
|
|
337
|
+
new_columns.append(col)
|
|
280
338
|
|
|
281
|
-
dtypes = odps_schema_to_pandas_dtypes(
|
|
339
|
+
dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
|
|
282
340
|
|
|
283
341
|
if not index_col:
|
|
284
342
|
index_dtypes = None
|
|
@@ -301,5 +359,6 @@ def read_odps_query(
|
|
|
301
359
|
string_as_binary=string_as_binary,
|
|
302
360
|
index_columns=index_col,
|
|
303
361
|
index_dtypes=index_dtypes,
|
|
362
|
+
column_renames=col_renames,
|
|
304
363
|
)
|
|
305
364
|
return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
|
|
@@ -22,8 +22,9 @@ from odps.models import Table
|
|
|
22
22
|
from odps.utils import to_timestamp
|
|
23
23
|
|
|
24
24
|
from ... import opcodes
|
|
25
|
+
from ...config import options
|
|
25
26
|
from ...core import OutputType
|
|
26
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
27
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
27
28
|
from ...serialization.serializables import (
|
|
28
29
|
AnyField,
|
|
29
30
|
BoolField,
|
|
@@ -167,12 +168,13 @@ def read_odps_table(
|
|
|
167
168
|
DataFrame read from MaxCompute (ODPS) table
|
|
168
169
|
"""
|
|
169
170
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
171
|
+
schema = options.session.default_schema or odps_entry.schema
|
|
170
172
|
if odps_entry is None:
|
|
171
173
|
raise ValueError("Missing odps_entry parameter")
|
|
172
174
|
if isinstance(table_name, Table):
|
|
173
175
|
table = table_name
|
|
174
176
|
else:
|
|
175
|
-
table = odps_entry.get_table(table_name)
|
|
177
|
+
table = odps_entry.get_table(table_name, schema=schema)
|
|
176
178
|
|
|
177
179
|
if not table.table_schema.partitions and (
|
|
178
180
|
partitions is not None or append_partitions
|