maxframe-0.1.0b2-cp310-cp310-win_amd64.whl → maxframe-0.1.0b4-cp310-cp310-win_amd64.whl
- maxframe/_utils.cp310-win_amd64.pyd +0 -0
- maxframe/codegen.py +88 -19
- maxframe/config/config.py +9 -0
- maxframe/core/entity/executable.py +1 -0
- maxframe/core/entity/objects.py +3 -2
- maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
- maxframe/dataframe/__init__.py +7 -1
- maxframe/dataframe/core.py +4 -2
- maxframe/dataframe/datasource/read_odps_query.py +4 -2
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +22 -0
- maxframe/dataframe/datastore/core.py +19 -0
- maxframe/dataframe/datastore/to_csv.py +2 -2
- maxframe/dataframe/datastore/to_odps.py +2 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/core.py +5 -0
- maxframe/dataframe/indexing/reset_index.py +1 -17
- maxframe/lib/aio/isolation.py +6 -1
- maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
- maxframe/odpsio/arrow.py +8 -3
- maxframe/odpsio/schema.py +18 -5
- maxframe/odpsio/tests/test_schema.py +25 -0
- maxframe/opcodes.py +5 -0
- maxframe/protocol.py +7 -0
- maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
- maxframe/serialization/serializables/core.py +6 -1
- maxframe/serialization/serializables/field.py +2 -0
- maxframe/session.py +4 -2
- maxframe/tensor/core.py +3 -3
- maxframe/tests/test_codegen.py +69 -0
- maxframe/tests/test_protocol.py +16 -8
- maxframe/tests/utils.py +1 -0
- maxframe/utils.py +20 -1
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/METADATA +1 -1
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/RECORD +42 -40
- maxframe_client/clients/framedriver.py +7 -7
- maxframe_client/session/odps.py +11 -10
- maxframe_client/session/task.py +8 -1
- maxframe_client/session/tests/test_task.py +29 -11
- maxframe_client/tests/test_session.py +23 -0
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp310-win_amd64.pyd
CHANGED
Binary file
maxframe/codegen.py
CHANGED
@@ -17,7 +17,7 @@ import base64
 import dataclasses
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union

 from odps.types import OdpsSchema
 from odps.utils import camel_to_underline
@@ -30,6 +30,7 @@ from .odpsio import build_dataframe_table_meta
 from .odpsio.schema import pandas_to_odps_schema
 from .protocol import DataFrameTableMeta, ResultInfo
 from .serialization import PickleContainer
+from .serialization.serializables import Serializable, StringField
 from .typing_ import PandasObjectTypes
 from .udf import MarkedFunction

@@ -48,8 +49,11 @@ class CodeGenResult:
     constants: Dict[str, Any]


-class AbstractUDF(abc.ABC):
-    _session_id: str
+class AbstractUDF(Serializable):
+    _session_id: str = StringField("session_id")
+
+    def __init__(self, session_id: Optional[str] = None, **kw):
+        super().__init__(_session_id=session_id, **kw)

     @property
     def name(self) -> str:
@@ -74,7 +78,66 @@ class AbstractUDF(abc.ABC):

 class UserCodeMixin:
     @classmethod
-    def
+    def obj_to_python_expr(cls, obj: Any = None) -> str:
+        """
+        Parameters
+        ----------
+        obj
+            The object to convert to python expr.
+        Returns
+        -------
+        str :
+            The str type content equals to the object when use in the python code directly.
+        """
+        if obj is None:
+            return "None"
+
+        if isinstance(obj, (int, float)):
+            return repr(obj)
+
+        if isinstance(obj, bool):
+            return "True" if obj else "False"
+
+        if isinstance(obj, bytes):
+            base64_bytes = base64.b64encode(obj)
+            return f"base64.b64decode({base64_bytes})"
+
+        if isinstance(obj, str):
+            return repr(obj)
+
+        if isinstance(obj, list):
+            return (
+                f"[{', '.join([cls.obj_to_python_expr(element) for element in obj])}]"
+            )
+
+        if isinstance(obj, dict):
+            items = (
+                f"{repr(key)}: {cls.obj_to_python_expr(value)}"
+                for key, value in obj.items()
+            )
+            return f"{{{', '.join(items)}}}"
+
+        if isinstance(obj, tuple):
+            return f"({', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}{',' if len(obj) == 1 else ''})"
+
+        if isinstance(obj, set):
+            return (
+                f"{{{', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}}}"
+                if obj
+                else "set()"
+            )
+
+        if isinstance(obj, PickleContainer):
+            return UserCodeMixin.generate_pickled_codes(obj, None)
+
+        raise ValueError(f"not support arg type {type(obj)}")
+
+    @classmethod
+    def generate_pickled_codes(
+        cls,
+        code_to_pickle: Any,
+        unpicked_data_var_name: Union[str, None] = "pickled_data",
+    ) -> str:
         """
         Generate pickled codes. The final pickled variable is called 'pickled_data'.

@@ -82,20 +145,20 @@ class UserCodeMixin:
         ----------
         code_to_pickle: Any
             The code to be pickled.
+        unpicked_data_var_name: str
+            The variables in code used to hold the loads object from the cloudpickle

         Returns
         -------
-
-        The code snippets of pickling, the final variable is called 'pickled_data'.
+        str :
+            The code snippets of pickling, the final variable is called 'pickled_data' by default.
         """
         pickled, buffers = cls.dump_pickled_data(code_to_pickle)
-
-
-
-
-
-            f"pickled_data = cloudpickle.loads(base64_data, buffers=[{buffers_str}])",
-        ]
+        pickle_loads_expr = f"cloudpickle.loads({cls.obj_to_python_expr(pickled)}, buffers={cls.obj_to_python_expr(buffers)})"
+        if unpicked_data_var_name:
+            return f"{unpicked_data_var_name} = {pickle_loads_expr}"
+
+        return pickle_loads_expr

     @staticmethod
     def dump_pickled_data(
@@ -114,8 +177,9 @@ class UserCodeMixin:


 class BigDagCodeContext(metaclass=abc.ABCMeta):
-    def __init__(self, session_id: str = None):
+    def __init__(self, session_id: str = None, subdag_id: str = None):
         self._session_id = session_id
+        self._subdag_id = subdag_id
         self._tileable_key_to_variables = dict()
         self.constants = dict()
         self._data_table_meta_cache = dict()
@@ -142,10 +206,14 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         except KeyError:
             var_name = self._tileable_key_to_variables[
                 tileable.key
-            ] =
-            self._next_var_id += 1
+            ] = self.next_var_name()
         return var_name

+    def next_var_name(self) -> str:
+        var_name = f"var_{self._next_var_id}"
+        self._next_var_id += 1
+        return var_name
+
     def get_odps_schema(
         self, data: PandasObjectTypes, unknown_as_string: bool = False
     ) -> OdpsSchema:
@@ -275,9 +343,10 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     engine_priority: int = 0
     _extension_loaded = False

-    def __init__(self, session_id: str):
+    def __init__(self, session_id: str, subdag_id: str = None):
         self._session_id = session_id
-        self.
+        self._subdag_id = subdag_id
+        self._context = self._init_context(session_id, subdag_id)

     @classmethod
     def _load_engine_extensions(cls):
@@ -307,7 +376,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
         raise NotImplementedError

     @abc.abstractmethod
-    def _init_context(self, session_id: str) -> BigDagCodeContext:
+    def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
         raise NotImplementedError

     def _generate_comments(
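
Note: the new obj_to_python_expr classmethod renders plain Python values as source-code literals for embedding in generated scripts. One quirk worth flagging: because bool is a subclass of int, True and False are caught by the (int, float) branch first, so the dedicated bool branch is effectively unreachable; repr() happens to produce the same result either way. A sketch of the expected behavior, inferred from the added code above rather than tested against the released wheel:

    UserCodeMixin.obj_to_python_expr(None)        # -> "None"
    UserCodeMixin.obj_to_python_expr(42)          # -> "42"
    UserCodeMixin.obj_to_python_expr("text")      # -> "'text'" (repr-quoted)
    UserCodeMixin.obj_to_python_expr((1,))        # -> "(1,)"  (1-tuple keeps its comma)
    UserCodeMixin.obj_to_python_expr({"k": {1}})  # -> "{'k': {1}}"
    UserCodeMixin.obj_to_python_expr(set())       # -> "set()"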
maxframe/config/config.py
CHANGED
@@ -340,6 +340,12 @@ default_options.register_option(
     validator=is_integer,
     remote=True,
 )
+default_options.register_option(
+    "session.subinstance_priority",
+    None,
+    validator=any_validator(is_null, is_integer),
+    remote=True,
+)

 default_options.register_option("warn_duplicated_execution", False, validator=is_bool)
 default_options.register_option("dataframe.use_arrow_dtype", True, validator=is_bool)
@@ -352,6 +358,9 @@ default_options.register_option(
 default_options.register_option(
     "show_progress", "auto", validator=any_validator(is_bool, is_string)
 )
+default_options.register_option(
+    "dag.settings", value=dict(), validator=is_dict, remote=True
+)

 ################
 # SPE Settings #
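
Note: both new options are registered with remote=True, so their values are forwarded to the service side. A hedged usage sketch (the options accessor is assumed from the module's existing mars-style config pattern; the settings key is purely illustrative):

    from maxframe.config import options  # accessor path assumed

    options.session.subinstance_priority = 5       # None or an integer, per the validator
    options.dag.settings = {"some.engine.flag": "true"}  # free-form dict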
maxframe/core/entity/objects.py
CHANGED
@@ -15,6 +15,7 @@
 from typing import Any, Dict

 from ...serialization.serializables import FieldTypes, ListField
+from ...utils import skip_na_call
 from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
@@ -62,8 +63,8 @@ class ObjectData(TileableData, _ToObjectMixin):
     _chunks = ListField(
         "chunks",
         FieldTypes.reference(ObjectChunkData),
-        on_serialize=lambda x: [it.data for it in x],
-        on_deserialize=lambda x: [ObjectChunk(it) for it in x],
+        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
+        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
     )

     def __init__(self, op=None, nsplits=None, **kw):
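
Note: skip_na_call comes from the maxframe/utils.py change listed above (+20 -1) but not shown in this section. From its use here it plausibly guards the serialize/deserialize hooks against a None chunk list. A minimal sketch of such a wrapper, assuming that semantics:

    import functools

    def skip_na_call(func):
        # Hypothetical equivalent: pass None through instead of calling func on it.
        @functools.wraps(func)
        def wrapper(value):
            return None if value is None else func(value)

        return wrapper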
maxframe/core/graph/core.cp310-win_amd64.pyd
CHANGED
Binary file
maxframe/dataframe/__init__.py
CHANGED
@@ -39,6 +39,7 @@ from .datasource.read_odps_query import read_odps_query
 from .datasource.read_odps_table import read_odps_table
 from .datasource.read_parquet import read_parquet
 from .datastore.to_odps import to_odps_table
+from .groupby import NamedAgg
 from .initializer import DataFrame, Index, Series, read_pandas
 from .merge import concat, merge
 from .misc.cut import cut
@@ -52,7 +53,12 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime

 try:
-    from pandas import NA,
+    from pandas import NA, Timestamp
+except ImportError:  # pragma: no cover
+    pass
+
+try:
+    from . import _internal
 except ImportError:  # pragma: no cover
     pass

maxframe/dataframe/core.py
CHANGED
@@ -960,7 +960,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")
         corner_max_rows = (
-            max_rows
+            max_rows
+            if self.shape[0] <= max_rows or corner_data.shape[0] == 0
+            else corner_data.shape[0] - 1
         )  # make sure max_rows < corner_data

         with pd.option_context("display.max_rows", corner_max_rows):
@@ -1605,7 +1607,7 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")

-        if self.shape[0] <= max_rows:
+        if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
             buf.write(repr(corner_data) if representation else str(corner_data))
         else:
             # remember we cannot directly call repr(df),
maxframe/dataframe/datasource/read_odps_query.py
CHANGED
@@ -46,7 +46,7 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")


 @dataclasses.dataclass
@@ -263,7 +263,9 @@ def read_odps_query(
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
-    odps_entry = odps_entry or ODPS.from_environments()
+    odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     inst = odps_entry.execute_sql(f"EXPLAIN {query}")
     explain_str = list(inst.get_task_results().values())[0]

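
Note: the rebuilt _EXPLAIN_COLUMN_REGEX now captures an optional " AS alias" suffix in EXPLAIN output, matching the ColumnSchema(name, type, alias) triples asserted by the new test below. Illustrative matches (regex copied verbatim from the diff):

    import re

    _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
    _EXPLAIN_COLUMN_REGEX.match("id (bigint) AS id_alias").groups()
    # -> ('id', 'bigint', 'id_alias')
    _EXPLAIN_COLUMN_REGEX.match("listing_url (string)").groups()
    # -> ('listing_url', 'string', None)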
maxframe/dataframe/datasource/read_odps_table.py
CHANGED
@@ -69,7 +69,7 @@ class DataFrameReadODPSTable(
         return getattr(self, "partition_spec", None)

     def get_columns(self):
-        return self.columns
+        return self.columns or list(self.dtypes.index)

     def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
         self.columns = columns
@@ -164,6 +164,8 @@ def read_odps_table(
     DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
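
Note: get_columns previously returned self.columns as-is, which is None until column pruning runs; it now falls back to the full dtypes index. Illustrative effect (table and column names hypothetical):

    df = read_odps_table("test_table")   # no column pruning applied yet
    df.op.get_columns()                  # -> ["col1", "col2", "col3"] instead of None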
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 from collections import OrderedDict

 import numpy as np
@@ -33,6 +34,7 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
+from ..read_odps_query import ColumnSchema, _resolve_task_sector
 from ..series import from_pandas as from_pandas_series

 ray = lazy_import("ray")
@@ -228,6 +230,7 @@ def test_from_odps_table():
     assert df.op.table_name == test_table.full_table_name
     assert df.index_value.name is None
     assert isinstance(df.index_value.value, IndexValue.RangeIndex)
+    assert df.op.get_columns() == ["col1", "col2", "col3"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -247,6 +250,7 @@ def test_from_odps_table():
     assert df.op.table_name == test_table.full_table_name
     assert df.index_value.name is None
     assert isinstance(df.index_value.value, IndexValue.RangeIndex)
+    assert df.op.get_columns() == ["col1", "col2"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series([np.dtype("O"), np.dtype("int64")], index=["col1", "col2"]),
@@ -257,6 +261,7 @@ def test_from_odps_table():
     assert df.index_value.name == "col1"
     assert isinstance(df.index_value.value, IndexValue.Index)
     assert df.index.dtype == np.dtype("O")
+    assert df.op.get_columns() == ["col2", "col3"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series([np.dtype("int64"), np.dtype("float64")], index=["col2", "col3"]),
@@ -267,6 +272,7 @@ def test_from_odps_table():

     df = read_odps_table(test_parted_table, append_partitions=True)
     assert df.op.append_partitions is True
+    assert df.op.get_columns() == ["col1", "col2", "col3", "pt"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -280,6 +286,7 @@ def test_from_odps_table():
     )
     assert df.op.append_partitions is True
     assert df.op.partitions == ["pt=20240103"]
+    assert df.op.get_columns() == ["col1", "col2", "pt"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -377,3 +384,18 @@ def test_date_range():
     assert dr.index_value.is_unique == expected.is_unique
     assert dr.index_value.is_monotonic_increasing == expected.is_monotonic_increasing
     assert dr.name == expected.name
+
+
+def test_resolve_task_sector():
+    input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 78
+    assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
+    assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
+    assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
maxframe/dataframe/datastore/core.py
ADDED
@@ -0,0 +1,19 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+
+
+class DataFrameDataStore(DataFrameOperator, DataFrameOperatorMixin):
+    pass
maxframe/dataframe/datastore/to_csv.py
CHANGED
@@ -23,11 +23,11 @@ from ...serialization.serializables import (
     ListField,
     StringField,
 )
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore


-class DataFrameToCSV(
+class DataFrameToCSV(DataFrameDataStore):
     _op_type_ = opcodes.TO_CSV

     input = KeyField("input")
maxframe/dataframe/datastore/to_odps.py
CHANGED
@@ -32,13 +32,13 @@ from ...serialization.serializables import (
 )
 from ...typing_ import TileableType
 from ..core import DataFrame  # noqa: F401
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore

 logger = logging.getLogger(__name__)


-class DataFrameToODPSTable(
+class DataFrameToODPSTable(DataFrameDataStore):
     _op_type_ = opcodes.TO_ODPS_TABLE

     dtypes = SeriesField("dtypes")
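
Note: DataFrameToCSV and DataFrameToODPSTable now share the DataFrameDataStore base added above, so datastore operators can be recognized generically. A plausible use, not taken from this diff:

    from maxframe.dataframe.datastore.core import DataFrameDataStore

    def is_store_op(op) -> bool:
        # Hypothetical helper: True for to_csv / to_odps_table terminal operators.
        return isinstance(op, DataFrameDataStore)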
maxframe/dataframe/groupby/core.py
CHANGED
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from collections import namedtuple
+
 import pandas as pd

 from ... import opcodes
@@ -30,6 +32,9 @@ _GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
 _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True


+NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
+
+
 class DataFrameGroupByOperator(MapReduceOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.GROUPBY

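
Note: NamedAgg mirrors pandas.NamedAgg (likewise a column/aggfunc named tuple) and is re-exported from maxframe.dataframe above. A usage sketch following pandas named-aggregation semantics, assuming maxframe's agg accepts the same keyword form (data is illustrative):

    import maxframe.dataframe as md

    df = md.DataFrame({"key": ["a", "a", "b"], "value": [1, 2, 3]})
    df.groupby("key").agg(
        total=md.NamedAgg(column="value", aggfunc="sum"),
        largest=md.NamedAgg(column="value", aggfunc="max"),
    )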
maxframe/dataframe/indexing/reset_index.py
CHANGED
@@ -107,7 +107,6 @@ def df_reset_index(
     inplace=False,
     col_level=0,
     col_fill="",
-    incremental_index=False,
 ):
     """
     Reset the index, or a level of it.
@@ -133,12 +132,6 @@ def df_reset_index(
     col_fill : object, default ''
         If the columns have multiple levels, determines how the other
         levels are named. If None then the index name is repeated.
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output DataFrame has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True,reset_index will guarantee that
-        output DataFrame's index is from 0 to n - 1.

     Returns
     -------
@@ -264,7 +257,6 @@ def df_reset_index(
         drop=drop,
         col_level=col_level,
         col_fill=col_fill,
-        incremental_index=incremental_index,
         output_types=[OutputType.dataframe],
     )
     ret = op(df)
@@ -280,7 +272,6 @@ def series_reset_index(
     drop=False,
     name=no_default,
     inplace=False,
-    incremental_index=False,
 ):
     """
     Generate a new DataFrame or Series with the index reset.
@@ -303,12 +294,6 @@ def series_reset_index(
         when `drop` is True.
     inplace : bool, default False
         Modify the Series in place (do not create a new object).
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output Series has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True,reset_index will guarantee that
-        output Series's index is from 0 to n - 1.

     Returns
     -------
@@ -406,8 +391,7 @@ def series_reset_index(
         level=level,
         drop=drop,
         name=name,
-
-        output_types=[OutputType.series],
+        output_types=[OutputType.series if drop else OutputType.dataframe],
    )
     ret = op(series)
     if not inplace:
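
Note: besides dropping the unsupported incremental_index argument, series_reset_index now declares its true output type: Series.reset_index() materializes a DataFrame unless drop=True. In plain pandas terms:

    import pandas as pd

    s = pd.Series([1, 2], name="v")
    s.reset_index()            # DataFrame with columns ["index", "v"]
    s.reset_index(drop=True)   # still a Series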
maxframe/lib/aio/isolation.py
CHANGED
@@ -14,11 +14,14 @@

 import asyncio
 import atexit
+import itertools
 import threading
 from typing import Dict, Optional


 class Isolation:
+    _counter = itertools.count().__next__
+
     loop: asyncio.AbstractEventLoop
     _stopped: Optional[asyncio.Event]
     _thread: Optional[threading.Thread]
@@ -38,7 +41,9 @@ class Isolation:

     def start(self):
         if self._threaded:
-            self._thread = thread = threading.Thread(
+            self._thread = thread = threading.Thread(
+                name=f"IsolationThread-{self._counter()}", target=self._run
+            )
             thread.daemon = True
             thread.start()
             self._thread_ident = thread.ident
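
Note: giving each isolation thread a distinct, monotonically numbered name (IsolationThread-0, IsolationThread-1, ...) makes them identifiable in thread dumps and debuggers:

    import threading

    [t.name for t in threading.enumerate()]
    # e.g. ['MainThread', 'IsolationThread-0']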
maxframe/lib/mmh3.cp310-win_amd64.pyd
CHANGED
Binary file
maxframe/odpsio/arrow.py
CHANGED
@@ -65,14 +65,19 @@ def arrow_to_pandas(
         raise ValueError(f"Does not support meta type {table_meta.type!r}")


-def pandas_to_arrow(
-
+def pandas_to_arrow(
+    df: Any, nthreads=1, ignore_index=False
+) -> Tuple[ArrowTableType, DataFrameTableMeta]:
+    table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
         df.columns = pd.Index(table_meta.table_column_names)
-
+        if not ignore_index:
+            df = df.rename_axis(table_meta.table_index_column_names).reset_index()
+    elif ignore_index:
+        df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
maxframe/odpsio/schema.py
CHANGED
@@ -175,7 +175,9 @@ def _scalar_as_index(df_obj: Any) -> pd.Index:


 def pandas_to_odps_schema(
-    df_obj: Any,
+    df_obj: Any,
+    unknown_as_string: bool = False,
+    ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
     from .. import dataframe as md
     from .arrow import pandas_to_arrow
@@ -209,7 +211,7 @@ def pandas_to_odps_schema(
     else:
         empty_df_obj = df_obj

-    arrow_data, table_meta = pandas_to_arrow(empty_df_obj)
+    arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -268,7 +270,9 @@ def build_table_column_name(
     return col_name


-def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
+def build_dataframe_table_meta(
+    df_obj: Any, ignore_index: bool = False
+) -> DataFrameTableMeta:
     from .. import dataframe as md

     col_to_count = defaultdict(lambda: 0)
@@ -285,6 +289,8 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")

+    assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
+
     if obj_type == OutputType.scalar:
         pd_dtypes = pd.Series([])
         column_index_names = []
@@ -340,12 +346,19 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:
         index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)

+    if ignore_index:
+        table_index_column_names = []
+        pd_index_dtypes = pd.Series([], index=[])
+    else:
+        table_index_column_names = [f"_idx_{i}" for i in range(len(index_obj.names))]
+        pd_index_dtypes = index_dtypes
+
     return DataFrameTableMeta(
         table_name=table_name,
         type=obj_type,
         table_column_names=final_sql_columns,
-        table_index_column_names=
+        table_index_column_names=table_index_column_names,
         pd_column_dtypes=pd_dtypes,
         pd_column_level_names=column_index_names,
-        pd_index_dtypes=
+        pd_index_dtypes=pd_index_dtypes,
     )
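
Note: the ignore_index flag threads from pandas_to_odps_schema through pandas_to_arrow into build_dataframe_table_meta, and (per the new assert) is only legal for DataFrame and Series inputs. When set, the derived table schema omits the _idx_* index columns. A hedged sketch of the expected effect, not run against the released wheel:

    import pandas as pd
    from maxframe.odpsio.schema import pandas_to_odps_schema

    df = pd.DataFrame({"a": [1], "b": ["x"]})
    schema, meta = pandas_to_odps_schema(df, unknown_as_string=True, ignore_index=True)
    meta.table_index_column_names   # -> [] (no _idx_0 column in the ODPS schema)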