maxframe-1.0.0rc2-cp310-cp310-win32.whl → maxframe-1.0.0rc4-cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (134)
  1. maxframe/_utils.cp310-win32.pyd +0 -0
  2. maxframe/codegen.py +4 -2
  3. maxframe/config/config.py +28 -9
  4. maxframe/config/validators.py +42 -12
  5. maxframe/conftest.py +56 -14
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +45 -2
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cp310-win32.pyd +0 -0
  19. maxframe/core/graph/entity.py +9 -33
  20. maxframe/core/operator/__init__.py +2 -9
  21. maxframe/core/operator/base.py +3 -5
  22. maxframe/core/operator/objects.py +0 -9
  23. maxframe/core/operator/utils.py +55 -0
  24. maxframe/dataframe/arithmetic/docstring.py +26 -2
  25. maxframe/dataframe/arithmetic/equal.py +4 -2
  26. maxframe/dataframe/arithmetic/greater.py +4 -2
  27. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  28. maxframe/dataframe/arithmetic/less.py +2 -2
  29. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  30. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  31. maxframe/dataframe/core.py +2 -0
  32. maxframe/dataframe/datasource/read_odps_query.py +67 -8
  33. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  34. maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
  35. maxframe/dataframe/datastore/to_odps.py +8 -1
  36. maxframe/dataframe/extensions/__init__.py +3 -0
  37. maxframe/dataframe/extensions/flatmap.py +326 -0
  38. maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
  39. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  40. maxframe/dataframe/indexing/rename.py +11 -0
  41. maxframe/dataframe/initializer.py +11 -1
  42. maxframe/dataframe/misc/drop_duplicates.py +18 -1
  43. maxframe/dataframe/operators.py +1 -17
  44. maxframe/dataframe/reduction/core.py +2 -2
  45. maxframe/dataframe/tests/test_initializer.py +33 -2
  46. maxframe/io/objects/__init__.py +24 -0
  47. maxframe/io/objects/core.py +140 -0
  48. maxframe/io/objects/tensor.py +76 -0
  49. maxframe/io/objects/tests/__init__.py +13 -0
  50. maxframe/io/objects/tests/test_object_io.py +97 -0
  51. maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
  52. maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
  53. maxframe/{odpsio → io/odpsio}/schema.py +10 -8
  54. maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
  55. maxframe/io/odpsio/tests/__init__.py +13 -0
  56. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
  57. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
  58. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  59. maxframe/io/odpsio/volumeio.py +63 -0
  60. maxframe/learn/contrib/__init__.py +2 -1
  61. maxframe/learn/contrib/graph/__init__.py +15 -0
  62. maxframe/learn/contrib/graph/connected_components.py +215 -0
  63. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  64. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  65. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  66. maxframe/learn/contrib/xgboost/core.py +87 -2
  67. maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
  68. maxframe/learn/contrib/xgboost/predict.py +27 -44
  69. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  70. maxframe/learn/contrib/xgboost/train.py +27 -16
  71. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  72. maxframe/lib/mmh3.cp310-win32.pyd +0 -0
  73. maxframe/opcodes.py +3 -0
  74. maxframe/protocol.py +7 -16
  75. maxframe/remote/core.py +4 -8
  76. maxframe/serialization/__init__.py +1 -0
  77. maxframe/serialization/core.cp310-win32.pyd +0 -0
  78. maxframe/session.py +9 -2
  79. maxframe/tensor/__init__.py +10 -2
  80. maxframe/tensor/arithmetic/isclose.py +1 -0
  81. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  82. maxframe/tensor/core.py +5 -136
  83. maxframe/tensor/datasource/array.py +3 -0
  84. maxframe/tensor/datasource/full.py +1 -1
  85. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  86. maxframe/tensor/indexing/flatnonzero.py +1 -1
  87. maxframe/tensor/indexing/getitem.py +2 -0
  88. maxframe/tensor/merge/__init__.py +2 -0
  89. maxframe/tensor/merge/concatenate.py +101 -0
  90. maxframe/tensor/merge/tests/test_merge.py +30 -1
  91. maxframe/tensor/merge/vstack.py +74 -0
  92. maxframe/tensor/{base → misc}/__init__.py +2 -0
  93. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  94. maxframe/tensor/misc/atleast_2d.py +70 -0
  95. maxframe/tensor/misc/atleast_3d.py +85 -0
  96. maxframe/tensor/misc/tests/__init__.py +13 -0
  97. maxframe/tensor/{base → misc}/transpose.py +22 -18
  98. maxframe/tensor/operators.py +1 -7
  99. maxframe/tensor/random/core.py +1 -1
  100. maxframe/tensor/reduction/count_nonzero.py +1 -0
  101. maxframe/tensor/reduction/mean.py +1 -0
  102. maxframe/tensor/reduction/nanmean.py +1 -0
  103. maxframe/tensor/reduction/nanvar.py +2 -0
  104. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  105. maxframe/tensor/reduction/var.py +2 -0
  106. maxframe/tensor/utils.py +2 -22
  107. maxframe/typing_.py +4 -1
  108. maxframe/udf.py +8 -9
  109. maxframe/utils.py +49 -73
  110. maxframe-1.0.0rc4.dist-info/METADATA +104 -0
  111. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
  112. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
  113. maxframe_client/fetcher.py +33 -50
  114. maxframe_client/session/consts.py +3 -0
  115. maxframe_client/session/graph.py +8 -2
  116. maxframe_client/session/odps.py +134 -27
  117. maxframe_client/session/task.py +58 -20
  118. maxframe_client/tests/test_fetcher.py +1 -1
  119. maxframe_client/tests/test_session.py +27 -3
  120. maxframe/core/entity/chunks.py +0 -68
  121. maxframe/core/entity/fuse.py +0 -73
  122. maxframe/core/graph/builder/chunk.py +0 -430
  123. maxframe/odpsio/volumeio.py +0 -95
  124. maxframe-1.0.0rc2.dist-info/METADATA +0 -177
  125. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  126. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  127. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  128. /maxframe/tensor/{base → misc}/astype.py +0 -0
  129. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  130. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  131. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  132. /maxframe/tensor/{base → misc}/unique.py +0 -0
  133. /maxframe/tensor/{base → misc}/where.py +0 -0
  134. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
maxframe/core/graph/builder/base.py +5 -4
@@ -14,10 +14,10 @@
 
 
 from abc import ABC, abstractmethod
-from typing import Generator, List, Set, Union
+from typing import Generator, List, Set
 
 from ....typing_ import EntityType
-from ..entity import ChunkGraph, EntityGraph, TileableGraph
+from ..entity import EntityGraph
 
 
 def _default_inputs_selector(inputs: List[EntityType]) -> List[EntityType]:
@@ -43,7 +43,7 @@ class AbstractGraphBuilder(ABC):
 
     def _add_nodes(
         self,
-        graph: Union[ChunkGraph, TileableGraph],
+        graph: EntityGraph,
         nodes: List[EntityType],
         visited: Set,
     ):
@@ -75,7 +75,7 @@ class AbstractGraphBuilder(ABC):
             nodes.append(out)
 
     @abstractmethod
-    def build(self) -> Generator[Union[EntityGraph, ChunkGraph], None, None]:
+    def build(self) -> Generator[EntityGraph, None, None]:
         """
         Build a entity graph.
 
@@ -84,3 +84,4 @@ class AbstractGraphBuilder(ABC):
         graph : EntityGraph
             Entity graph.
         """
+        raise NotImplementedError
maxframe/core/graph/builder/tileable.py +4 -4
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Generator, Union
+from typing import Generator
 
 from ...mode import enter_mode
-from ..entity import ChunkGraph, TileableGraph
+from ..entity import TileableGraph
 from .base import AbstractGraphBuilder
 
 
@@ -26,9 +26,9 @@ class TileableGraphBuilder(AbstractGraphBuilder):
         super().__init__(graph=graph)
 
     @enter_mode(build=True, kernel=True)
-    def _build(self) -> Union[TileableGraph, ChunkGraph]:
+    def _build(self) -> TileableGraph:
         self._add_nodes(self._graph, list(self._graph.result_tileables), set())
         return self._graph
 
-    def build(self) -> Generator[Union[TileableGraph, ChunkGraph], None, None]:
+    def build(self) -> Generator[TileableGraph, None, None]:
        yield self._build()
maxframe/core/graph/builder/utils.py +4 -8
@@ -13,12 +13,11 @@
 # limitations under the License.
 
 import itertools
-from typing import List, Union
+from typing import List
 
 from ....typing_ import TileableType
 from ...mode import enter_mode
-from ..entity import ChunkGraph, TileableGraph
-from .chunk import ChunkGraphBuilder
+from ..entity import EntityGraph, TileableGraph
 from .tileable import TileableGraphBuilder
 
 
@@ -28,14 +27,11 @@ def build_graph(
     tile: bool = False,
     fuse_enabled: bool = True,
     **chunk_graph_build_kwargs
-) -> Union[TileableGraph, ChunkGraph]:
+) -> EntityGraph:
     tileables = list(itertools.chain(*(tileable.op.outputs for tileable in tileables)))
     tileable_graph = TileableGraph(tileables)
     tileable_graph_builder = TileableGraphBuilder(tileable_graph)
     tileable_graph = next(tileable_graph_builder.build())
     if not tile:
         return tileable_graph
-    chunk_graph_builder = ChunkGraphBuilder(
-        tileable_graph, fuse_enabled=fuse_enabled, **chunk_graph_build_kwargs
-    )
-    return next(chunk_graph_builder.build())
+    raise NotImplementedError
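
With ChunkGraphBuilder removed, build_graph only ever produces a tileable-level graph, and asking for a tiled build is now an explicit NotImplementedError. A minimal usage sketch (assuming build_graph remains importable from this module path):

    import maxframe.dataframe as md
    from maxframe.core.graph.builder.utils import build_graph

    df = md.DataFrame({"a": [1, 2, 3]})

    # Always a TileableGraph now; the Union[TileableGraph, ChunkGraph]
    # return type is gone along with the chunk graph machinery.
    graph = build_graph([df])

    # Client-side tiling is no longer supported:
    # build_graph([df], tile=True)  # raises NotImplementedError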
maxframe/core/graph/core.cp310-win32.pyd: binary file differs (diff not shown)
maxframe/core/graph/entity.py +9 -33
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 from abc import ABCMeta, abstractmethod
-from typing import Dict, Iterable, List, Union
+from typing import Dict, Iterable, List
 
-from ...core import Chunk, Tileable
+from ...core import Tileable
 from ...serialization.core import buffered
 from ...serialization.serializables import BoolField, DictField, ListField, Serializable
 from ...serialization.serializables.core import SerializableSerializer
@@ -97,26 +97,6 @@ class TileableGraph(EntityGraph, Iterable[Tileable]):
         return self._logic_key
 
 
-class ChunkGraph(EntityGraph, Iterable[Chunk]):
-    _result_chunks: List[Chunk]
-
-    def __init__(self, result_chunks: List[Chunk] = None):
-        super().__init__()
-        self._result_chunks = result_chunks
-
-    @property
-    def result_chunks(self):
-        return self._result_chunks
-
-    @property
-    def results(self):
-        return self._result_chunks
-
-    @results.setter
-    def results(self, new_results):
-        self._result_chunks = new_results
-
-
 class SerializableGraph(Serializable):
     _is_chunk = BoolField("is_chunk")
     # TODO(qinxuye): remove this logic when we handle fetch elegantly,
@@ -132,12 +112,11 @@ class SerializableGraph(Serializable):
     _results = ListField("results")
 
     @classmethod
-    def from_graph(cls, graph: Union[TileableGraph, ChunkGraph]) -> "SerializableGraph":
+    def from_graph(cls, graph: EntityGraph) -> "SerializableGraph":
         from ..operator import Fetch
 
-        is_chunk = isinstance(graph, ChunkGraph)
         return SerializableGraph(
-            _is_chunk=is_chunk,
+            _is_chunk=False,
             _fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)],
             _nodes=graph._nodes,
             _predecessors=graph._predecessors,
@@ -145,9 +124,8 @@ class SerializableGraph(Serializable):
             _results=graph.results,
         )
 
-    def to_graph(self) -> Union[TileableGraph, ChunkGraph]:
-        graph_cls = ChunkGraph if self._is_chunk else TileableGraph
-        graph = graph_cls(self._results)
+    def to_graph(self) -> EntityGraph:
+        graph = TileableGraph(self._results)
         graph._nodes.update(self._nodes)
         graph._predecessors.update(self._predecessors)
         graph._successors.update(self._successors)
@@ -156,14 +134,12 @@ class SerializableGraph(Serializable):
 
 class GraphSerializer(SerializableSerializer):
     @buffered
-    def serial(self, obj: Union[TileableGraph, ChunkGraph], context: Dict):
+    def serial(self, obj: EntityGraph, context: Dict):
         serializable_graph = SerializableGraph.from_graph(obj)
         return [], [serializable_graph], False
 
-    def deserial(
-        self, serialized: List, context: Dict, subs: List
-    ) -> Union[TileableGraph, ChunkGraph]:
-        serializable_graph: SerializableGraph = subs[0]
+    def deserial(self, serialized: List, context: Dict, subs: List) -> TileableGraph:
+        serializable_graph: EntityGraph = subs[0]
         return serializable_graph.to_graph()
 
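The serialization path is simplified to match: _is_chunk stays in the wire format for compatibility but is pinned to False, and deserialization always rebuilds a TileableGraph. A sketch of the round trip that GraphSerializer performs (assuming graph is a built TileableGraph):

    # Round trip performed under the hood by GraphSerializer.
    serializable = SerializableGraph.from_graph(graph)  # _is_chunk is always False
    restored = serializable.to_graph()                  # always a TileableGraph
    assert type(restored) is TileableGraph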
maxframe/core/operator/__init__.py +2 -9
@@ -22,13 +22,6 @@ from .base import (
 )
 from .core import TileableOperatorMixin, estimate_size, execute
 from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType
-from .fuse import Fuse, FuseChunkMixin
-from .objects import (
-    MergeDictOperator,
-    ObjectFetch,
-    ObjectFuseChunk,
-    ObjectFuseChunkMixin,
-    ObjectOperator,
-    ObjectOperatorMixin,
-)
+from .objects import MergeDictOperator, ObjectFetch, ObjectOperator, ObjectOperatorMixin
 from .shuffle import MapReduceOperator, ShuffleProxy
+from .utils import add_fetch_builder, build_fetch
maxframe/core/operator/base.py +3 -5
@@ -12,11 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import functools
 import weakref
 from copy import deepcopy
 from enum import Enum
-from functools import partial
+from functools import lru_cache, partial
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 from ...serialization.core import Placeholder
@@ -37,7 +36,6 @@ from ...serialization.serializables.core import SerializableSerializer
 from ...typing_ import OperatorType
 from ...utils import AttributeDict, classproperty, get_user_call_point, tokenize
 from ..base import Base
-from ..entity.chunks import Chunk
 from ..entity.core import ENTITY_TYPE, Entity, EntityData
 from ..entity.output_types import OutputType
 from ..entity.tileables import Tileable
@@ -90,7 +88,7 @@ class SchedulingHint(Serializable):
     priority = Int32Field("priority", default=None)
 
     @classproperty
-    @functools.lru_cache(1)
+    @lru_cache(1)
     def all_hint_names(cls):
         return list(cls._FIELDS)
 
@@ -341,7 +339,7 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
             raise ValueError("Outputs' size exceeds limitation")
 
     @property
-    def outputs(self) -> List[Union[Chunk, Tileable]]:
+    def outputs(self) -> List[Tileable]:
         outputs = self._outputs
         if outputs:
             return [ref() for ref in outputs]
maxframe/core/operator/objects.py +0 -9
@@ -17,7 +17,6 @@ from ..entity import OutputType, register_fetch_class
 from .base import Operator
 from .core import TileableOperatorMixin
 from .fetch import Fetch, FetchMixin
-from .fuse import Fuse, FuseChunkMixin
 
 
 class ObjectOperator(Operator):
@@ -28,14 +27,6 @@ class ObjectOperatorMixin(TileableOperatorMixin):
     _output_type_ = OutputType.object
 
 
-class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperatorMixin):
-    __slots__ = ()
-
-
-class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse):
-    pass
-
-
 class ObjectFetch(FetchMixin, ObjectOperatorMixin, Fetch):
     _output_type_ = OutputType.object
 
maxframe/core/operator/utils.py +55 -0 (new file)
@@ -0,0 +1,55 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...typing_ import EntityType, TileableType
+from ..entity import TILEABLE_TYPE
+
+
+def build_fetch_tileable(tileable: TileableType) -> TileableType:
+    if tileable.is_coarse():
+        chunks = None
+    else:
+        chunks = []
+        for c in tileable.chunks:
+            fetch_chunk = build_fetch(c, index=c.index)
+            chunks.append(fetch_chunk)
+
+    tileable_op = tileable.op
+    params = tileable.params.copy()
+
+    new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id)
+    return new_op.new_tileables(
+        None,
+        chunks=chunks,
+        nsplits=tileable.nsplits,
+        _key=tileable.key,
+        _id=tileable.id,
+        **params,
+    )[0]
+
+
+_type_to_builder = [
+    (TILEABLE_TYPE, build_fetch_tileable),
+]
+
+
+def build_fetch(entity: EntityType, **kw) -> EntityType:
+    for entity_types, func in _type_to_builder:
+        if isinstance(entity, entity_types):
+            return func(entity, **kw)
+    raise TypeError(f"Type {type(entity)} not supported")
+
+
+def add_fetch_builder(entity_type, builder_func):
+    _type_to_builder.append((entity_type, builder_func))
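
Here _type_to_builder is a small type-dispatch registry, and add_fetch_builder is the extension hook: other modules can register a fetch builder for their own entity types before calling build_fetch. A sketch with a hypothetical entity class and builder (illustration only):

    from maxframe.core.operator.utils import add_fetch_builder, build_fetch

    # Hypothetical entity type and builder, for illustration only.
    class MyModelType:
        pass

    def build_fetch_my_model(entity, **kw):
        # A real builder returns a fetch variant of `entity`,
        # much like build_fetch_tileable above.
        ...

    add_fetch_builder(MyModelType, build_fetch_my_model)
    # build_fetch() now dispatches MyModelType instances to the custom
    # builder; unregistered types still raise TypeError.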
maxframe/dataframe/arithmetic/docstring.py +26 -2
@@ -185,7 +185,6 @@ e NaN
 dtype: float64
 """
 
-# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
 _flex_comp_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
@@ -291,7 +290,7 @@ C True False
 
 Compare to a DataFrame of different shape.
 
->>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}},
+>>> other = md.DataFrame({{'revenue': [300, 250, 100, 150]}},
 ...                      index=['A', 'B', 'C', 'D'])
 >>> other.execute()
    revenue
@@ -306,6 +305,31 @@ A False False
 B False False
 C False True
 D False False
+
+Compare to a MultiIndex by level.
+
+>>> df_multindex = md.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
+...                              'revenue': [100, 250, 300, 200, 175, 225]}},
+...                             index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+...                                    ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex.execute()
+      cost  revenue
+Q1 A   250      100
+   B   150      250
+   C   100      300
+Q2 A   150      200
+   B   300      175
+   C   220      225
+
+>>> df.le(df_multindex, level=1).execute()
+       cost  revenue
+Q1 A   True     True
+   B   True     True
+   C   True     True
+Q2 A  False     True
+   B   True    False
+   C   True    False
+
 """
 
maxframe/dataframe/arithmetic/equal.py +4 -2
@@ -51,6 +51,8 @@ dtype: bool
 
 
 @bin_compare_doc("Equal to", equiv="==", series_example=_eq_example)
-def eq(df, other, axis="columns", level=None):
-    op = DataFrameEqual(axis=axis, level=level, lhs=df, rhs=other)
+def eq(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)
52
52
 
53
53
 
54
54
  @bin_compare_doc("Greater than", equiv=">", series_example=_gt_example)
55
- def gt(df, other, axis="columns", level=None):
56
- op = DataFrameGreater(axis=axis, level=level, lhs=df, rhs=other)
55
+ def gt(df, other, axis="columns", level=None, fill_value=None):
56
+ op = DataFrameGreater(
57
+ axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
58
+ )
57
59
  return op(df, other)
@@ -52,6 +52,8 @@ dtype: bool
52
52
 
53
53
 
54
54
  @bin_compare_doc("Greater than or equal to", equiv=">=", series_example=_ge_example)
55
- def ge(df, other, axis="columns", level=None):
56
- op = DataFrameGreaterEqual(axis=axis, level=level, lhs=df, rhs=other)
55
+ def ge(df, other, axis="columns", level=None, fill_value=None):
56
+ op = DataFrameGreaterEqual(
57
+ axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
58
+ )
57
59
  return op(df, other)
@@ -52,6 +52,6 @@ dtype: bool
52
52
 
53
53
 
54
54
  @bin_compare_doc("Less than", equiv="<", series_example=_lt_example)
55
- def lt(df, other, axis="columns", level=None):
56
- op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other)
55
+ def lt(df, other, axis="columns", level=None, fill_value=None):
56
+ op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value)
57
57
  return op(df, other)
@@ -52,6 +52,8 @@ dtype: bool
52
52
 
53
53
 
54
54
  @bin_compare_doc("Less than or equal to", equiv="<=", series_example=_le_example)
55
- def le(df, other, axis="columns", level=None):
56
- op = DataFrameLessEqual(axis=axis, level=level, lhs=df, rhs=other)
55
+ def le(df, other, axis="columns", level=None, fill_value=None):
56
+ op = DataFrameLessEqual(
57
+ axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
58
+ )
57
59
  return op(df, other)
@@ -51,6 +51,8 @@ dtype: bool
51
51
 
52
52
 
53
53
  @bin_compare_doc("Not equal to", equiv="!=", series_example=_ne_example)
54
- def ne(df, other, axis="columns", level=None):
55
- op = DataFrameNotEqual(axis=axis, level=level, lhs=df, rhs=other)
54
+ def ne(df, other, axis="columns", level=None, fill_value=None):
55
+ op = DataFrameNotEqual(
56
+ axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
57
+ )
56
58
  return op(df, other)
@@ -1666,6 +1666,8 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
1666
1666
  raise NotImplementedError
1667
1667
 
1668
1668
  corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])
1669
+ if corner_data is None:
1670
+ return
1669
1671
 
1670
1672
  buf = StringIO()
1671
1673
  max_rows = pd.get_option("display.max_rows")
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import dataclasses
16
+ import logging
16
17
  import re
17
18
  from typing import Dict, List, Optional, Tuple, Union
18
19
 
@@ -22,12 +23,14 @@ from odps import ODPS
22
23
  from odps.types import Column, OdpsSchema, validate_data_type
23
24
 
24
25
  from ... import opcodes
26
+ from ...config import options
25
27
  from ...core import OutputType
26
28
  from ...core.graph import DAG
27
- from ...odpsio import odps_schema_to_pandas_dtypes
29
+ from ...io.odpsio import odps_schema_to_pandas_dtypes
28
30
  from ...serialization.serializables import (
29
31
  AnyField,
30
32
  BoolField,
33
+ DictField,
31
34
  FieldTypes,
32
35
  Int64Field,
33
36
  ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
37
40
  from ..utils import parse_index
38
41
  from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
39
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
+ _DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
46
+
40
47
  _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
41
48
  _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
42
49
  _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,8 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
46
53
  r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
47
54
  re.MULTILINE,
48
55
  )
49
- _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
50
- _ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")
56
+ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
57
+ _ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
58
+
59
+ _SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
60
+ _SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")
51
61
 
52
62
 
53
63
  @dataclasses.dataclass
@@ -152,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
152
162
  return TaskSector(job_name, task_name, out_target, schemas)
153
163
 
154
164
 
155
- def _parse_explained_schema(explain_string: str) -> OdpsSchema:
165
+ def _parse_full_explain(explain_string: str) -> OdpsSchema:
156
166
  sectors = _split_explain_string(explain_string)
157
167
  jobs_sector = tasks_sector = None
158
168
 
@@ -191,6 +201,25 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
191
201
  return OdpsSchema(cols)
192
202
 
193
203
 
204
+ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
205
+ fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
206
+ if not fields_match:
207
+ raise ValueError("Cannot detect output table schema")
208
+
209
+ fields_str = fields_match.group(1)
210
+ cols = []
211
+ for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
212
+ cols.append(Column(field, validate_data_type(type_name)))
213
+ return OdpsSchema(cols)
214
+
215
+
216
+ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
217
+ if explain_string.startswith("AdhocSink"):
218
+ return _parse_simple_explain(explain_string)
219
+ else:
220
+ return _parse_full_explain(explain_string)
221
+
222
+
194
223
  class DataFrameReadODPSQuery(
195
224
  IncrementalIndexDatasource,
196
225
  ColumnPruneSupportedDataSourceMixin,
@@ -205,6 +234,7 @@ class DataFrameReadODPSQuery(
205
234
  string_as_binary = BoolField("string_as_binary", default=None)
206
235
  index_columns = ListField("index_columns", FieldTypes.string, default=None)
207
236
  index_dtypes = SeriesField("index_dtypes", default=None)
237
+ column_renames = DictField("column_renames", default=None)
208
238
 
209
239
  def get_columns(self):
210
240
  return self.columns
@@ -246,6 +276,8 @@ def read_odps_query(
246
276
  odps_entry: ODPS = None,
247
277
  index_col: Union[None, str, List[str]] = None,
248
278
  string_as_binary: bool = None,
279
+ sql_hints: Dict[str, str] = None,
280
+ anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
249
281
  **kw,
250
282
  ):
251
283
  """
@@ -260,25 +292,51 @@ def read_odps_query(
260
292
  MaxCompute SQL statement.
261
293
  index_col: Union[None, str, List[str]]
262
294
  Columns to be specified as indexes.
295
+ string_as_binary: bool, optional
296
+ Whether to convert string columns to binary.
297
+ sql_hints: Dict[str, str], optional
298
+ User specified SQL hints.
299
+ anonymous_col_prefix: str, optional
300
+ Prefix for anonymous columns, '_anon_col_' by default.
263
301
 
264
302
  Returns
265
303
  -------
266
304
  result: DataFrame
267
305
  DataFrame read from MaxCompute (ODPS) table
268
306
  """
307
+ hints = options.sql.settings.copy() or {}
308
+ if sql_hints:
309
+ hints.update(sql_hints)
310
+
269
311
  odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
312
+
313
+ if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
314
+ hints["odps.namespace.schema"] = "true"
315
+ hints["odps.sql.allow.namespace.schema"] = "true"
316
+
317
+ # fixme workaround for multi-stage split process
318
+ hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
319
+
270
320
  if odps_entry is None:
271
321
  raise ValueError("Missing odps_entry parameter")
272
- inst = odps_entry.execute_sql(f"EXPLAIN {query}")
322
+ inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
323
+ logger.debug("Explain instance ID: %s", inst.id)
273
324
  explain_str = list(inst.get_task_results().values())[0]
274
325
 
275
326
  odps_schema = _parse_explained_schema(explain_str)
276
327
 
328
+ new_columns = []
329
+ col_renames = {}
277
330
  for col in odps_schema.columns:
278
- if _ANONYMOUS_COL_REGEX.match(col.name) and col.name not in query:
279
- raise ValueError("Need to specify names for all columns in SELECT clause.")
331
+ anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
332
+ if anon_match and col.name not in query:
333
+ new_name = anonymous_col_prefix + anon_match.group(1)
334
+ col_renames[col.name] = new_name
335
+ new_columns.append(Column(new_name, col.type))
336
+ else:
337
+ new_columns.append(col)
280
338
 
281
- dtypes = odps_schema_to_pandas_dtypes(odps_schema)
339
+ dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
282
340
 
283
341
  if not index_col:
284
342
  index_dtypes = None
@@ -301,5 +359,6 @@ def read_odps_query(
301
359
  string_as_binary=string_as_binary,
302
360
  index_columns=index_col,
303
361
  index_dtypes=index_dtypes,
362
+ column_renames=col_renames,
304
363
  )
305
364
  return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
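
Taken together, read_odps_query now forwards user hints to the EXPLAIN call and renames anonymous result columns instead of rejecting the query outright. A hedged usage sketch (table and hint names are illustrative, and the usual top-level re-export is assumed):

    import maxframe.dataframe as md

    # COUNT(*) carries no alias, so EXPLAIN reports it as _c1; the column
    # now surfaces as _anon_col_1 instead of triggering a ValueError.
    df = md.read_odps_query(
        "SELECT category, COUNT(*) FROM my_table GROUP BY category",
        sql_hints={"odps.sql.type.system.odps2": "true"},  # illustrative hint
    )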
maxframe/dataframe/datasource/read_odps_table.py +4 -2
@@ -22,8 +22,9 @@ from odps.models import Table
 from odps.utils import to_timestamp
 
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
@@ -167,12 +168,13 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    schema = options.session.default_schema or odps_entry.schema
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
-        table = odps_entry.get_table(table_name)
+        table = odps_entry.get_table(table_name, schema=schema)
 
     if not table.table_schema.partitions and (
         partitions is not None or append_partitions
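
Correspondingly, read_odps_table now resolves tables against options.session.default_schema when one is configured. A sketch under the assumption that options is importable from the package root, with illustrative names:

    import maxframe.dataframe as md
    from maxframe import options

    options.session.default_schema = "my_schema"  # illustrative schema name
    # The lookup now passes schema= to odps_entry.get_table() under the hood.
    df = md.read_odps_table("my_table")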