maxframe 0.1.0b3__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.1.0b4__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/config/config.py +3 -0
- maxframe/dataframe/__init__.py +5 -0
- maxframe/dataframe/core.py +4 -2
- maxframe/dataframe/datasource/read_odps_query.py +3 -1
- maxframe/dataframe/datasource/read_odps_table.py +2 -0
- maxframe/dataframe/datastore/core.py +19 -0
- maxframe/dataframe/datastore/to_csv.py +2 -2
- maxframe/dataframe/datastore/to_odps.py +2 -2
- maxframe/dataframe/indexing/reset_index.py +1 -17
- maxframe/odpsio/arrow.py +8 -3
- maxframe/odpsio/schema.py +18 -5
- maxframe/odpsio/tests/test_schema.py +25 -0
- maxframe/opcodes.py +5 -0
- maxframe/session.py +4 -2
- maxframe/utils.py +5 -0
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/METADATA +1 -1
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/RECORD +590 -589
- maxframe_client/session/odps.py +11 -10
- maxframe_client/tests/test_session.py +21 -0
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/top_level.txt +0 -0
maxframe/config/config.py
CHANGED
|
@@ -358,6 +358,9 @@ default_options.register_option(
|
|
|
358
358
|
default_options.register_option(
|
|
359
359
|
"show_progress", "auto", validator=any_validator(is_bool, is_string)
|
|
360
360
|
)
|
|
361
|
+
default_options.register_option(
|
|
362
|
+
"dag.settings", value=dict(), validator=is_dict, remote=True
|
|
363
|
+
)
|
|
361
364
|
|
|
362
365
|
################
|
|
363
366
|
# SPE Settings #
|
maxframe/dataframe/__init__.py
CHANGED
maxframe/dataframe/core.py
CHANGED
|
@@ -960,7 +960,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
|
|
|
960
960
|
buf = StringIO()
|
|
961
961
|
max_rows = pd.get_option("display.max_rows")
|
|
962
962
|
corner_max_rows = (
|
|
963
|
-
max_rows
|
|
963
|
+
max_rows
|
|
964
|
+
if self.shape[0] <= max_rows or corner_data.shape[0] == 0
|
|
965
|
+
else corner_data.shape[0] - 1
|
|
964
966
|
) # make sure max_rows < corner_data
|
|
965
967
|
|
|
966
968
|
with pd.option_context("display.max_rows", corner_max_rows):
|
|
@@ -1605,7 +1607,7 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
|
|
|
1605
1607
|
buf = StringIO()
|
|
1606
1608
|
max_rows = pd.get_option("display.max_rows")
|
|
1607
1609
|
|
|
1608
|
-
if self.shape[0] <= max_rows:
|
|
1610
|
+
if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
|
|
1609
1611
|
buf.write(repr(corner_data) if representation else str(corner_data))
|
|
1610
1612
|
else:
|
|
1611
1613
|
# remember we cannot directly call repr(df),
|
maxframe/dataframe/datasource/read_odps_query.py
CHANGED
|
@@ -263,7 +263,9 @@ def read_odps_query(
|
|
|
263
263
|
result: DataFrame
|
|
264
264
|
DataFrame read from MaxCompute (ODPS) table
|
|
265
265
|
"""
|
|
266
|
-
odps_entry = odps_entry or ODPS.from_environments()
|
|
266
|
+
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
267
|
+
if odps_entry is None:
|
|
268
|
+
raise ValueError("Missing odps_entry parameter")
|
|
267
269
|
inst = odps_entry.execute_sql(f"EXPLAIN {query}")
|
|
268
270
|
explain_str = list(inst.get_task_results().values())[0]
|
|
269
271
|
|
maxframe/dataframe/datasource/read_odps_table.py
CHANGED
|
@@ -164,6 +164,8 @@ def read_odps_table(
|
|
|
164
164
|
DataFrame read from MaxCompute (ODPS) table
|
|
165
165
|
"""
|
|
166
166
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
167
|
+
if odps_entry is None:
|
|
168
|
+
raise ValueError("Missing odps_entry parameter")
|
|
167
169
|
if isinstance(table_name, Table):
|
|
168
170
|
table = table_name
|
|
169
171
|
else:
|
maxframe/dataframe/datastore/core.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DataFrameDataStore(DataFrameOperator, DataFrameOperatorMixin):
|
|
19
|
+
pass
|
maxframe/dataframe/datastore/to_csv.py
CHANGED
|
@@ -23,11 +23,11 @@ from ...serialization.serializables import (
|
|
|
23
23
|
ListField,
|
|
24
24
|
StringField,
|
|
25
25
|
)
|
|
26
|
-
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
27
26
|
from ..utils import parse_index
|
|
27
|
+
from .core import DataFrameDataStore
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
class DataFrameToCSV(
|
|
30
|
+
class DataFrameToCSV(DataFrameDataStore):
|
|
31
31
|
_op_type_ = opcodes.TO_CSV
|
|
32
32
|
|
|
33
33
|
input = KeyField("input")
|
maxframe/dataframe/datastore/to_odps.py
CHANGED
|
@@ -32,13 +32,13 @@ from ...serialization.serializables import (
|
|
|
32
32
|
)
|
|
33
33
|
from ...typing_ import TileableType
|
|
34
34
|
from ..core import DataFrame # noqa: F401
|
|
35
|
-
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
36
35
|
from ..utils import parse_index
|
|
36
|
+
from .core import DataFrameDataStore
|
|
37
37
|
|
|
38
38
|
logger = logging.getLogger(__name__)
|
|
39
39
|
|
|
40
40
|
|
|
41
|
-
class DataFrameToODPSTable(
|
|
41
|
+
class DataFrameToODPSTable(DataFrameDataStore):
|
|
42
42
|
_op_type_ = opcodes.TO_ODPS_TABLE
|
|
43
43
|
|
|
44
44
|
dtypes = SeriesField("dtypes")
|
maxframe/dataframe/indexing/reset_index.py
CHANGED
|
@@ -107,7 +107,6 @@ def df_reset_index(
|
|
|
107
107
|
inplace=False,
|
|
108
108
|
col_level=0,
|
|
109
109
|
col_fill="",
|
|
110
|
-
incremental_index=False,
|
|
111
110
|
):
|
|
112
111
|
"""
|
|
113
112
|
Reset the index, or a level of it.
|
|
@@ -133,12 +132,6 @@ def df_reset_index(
|
|
|
133
132
|
col_fill : object, default ''
|
|
134
133
|
If the columns have multiple levels, determines how the other
|
|
135
134
|
levels are named. If None then the index name is repeated.
|
|
136
|
-
incremental_index: bool, default False
|
|
137
|
-
Ensure RangeIndex incremental, when output DataFrame has multiple chunks,
|
|
138
|
-
ensuring index incremental costs more computation,
|
|
139
|
-
so by default, each chunk will have index which starts from 0,
|
|
140
|
-
setting incremental_index=True,reset_index will guarantee that
|
|
141
|
-
output DataFrame's index is from 0 to n - 1.
|
|
142
135
|
|
|
143
136
|
Returns
|
|
144
137
|
-------
|
|
@@ -264,7 +257,6 @@ def df_reset_index(
|
|
|
264
257
|
drop=drop,
|
|
265
258
|
col_level=col_level,
|
|
266
259
|
col_fill=col_fill,
|
|
267
|
-
incremental_index=incremental_index,
|
|
268
260
|
output_types=[OutputType.dataframe],
|
|
269
261
|
)
|
|
270
262
|
ret = op(df)
|
|
@@ -280,7 +272,6 @@ def series_reset_index(
|
|
|
280
272
|
drop=False,
|
|
281
273
|
name=no_default,
|
|
282
274
|
inplace=False,
|
|
283
|
-
incremental_index=False,
|
|
284
275
|
):
|
|
285
276
|
"""
|
|
286
277
|
Generate a new DataFrame or Series with the index reset.
|
|
@@ -303,12 +294,6 @@ def series_reset_index(
|
|
|
303
294
|
when `drop` is True.
|
|
304
295
|
inplace : bool, default False
|
|
305
296
|
Modify the Series in place (do not create a new object).
|
|
306
|
-
incremental_index: bool, default False
|
|
307
|
-
Ensure RangeIndex incremental, when output Series has multiple chunks,
|
|
308
|
-
ensuring index incremental costs more computation,
|
|
309
|
-
so by default, each chunk will have index which starts from 0,
|
|
310
|
-
setting incremental_index=True,reset_index will guarantee that
|
|
311
|
-
output Series's index is from 0 to n - 1.
|
|
312
297
|
|
|
313
298
|
Returns
|
|
314
299
|
-------
|
|
@@ -406,8 +391,7 @@ def series_reset_index(
|
|
|
406
391
|
level=level,
|
|
407
392
|
drop=drop,
|
|
408
393
|
name=name,
|
|
409
|
-
|
|
410
|
-
output_types=[OutputType.series],
|
|
394
|
+
output_types=[OutputType.series if drop else OutputType.dataframe],
|
|
411
395
|
)
|
|
412
396
|
ret = op(series)
|
|
413
397
|
if not inplace:
|
maxframe/odpsio/arrow.py
CHANGED
|
@@ -65,14 +65,19 @@ def arrow_to_pandas(
|
|
|
65
65
|
raise ValueError(f"Does not support meta type {table_meta.type!r}")
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def pandas_to_arrow(
|
|
69
|
-
|
|
68
|
+
def pandas_to_arrow(
|
|
69
|
+
df: Any, nthreads=1, ignore_index=False
|
|
70
|
+
) -> Tuple[ArrowTableType, DataFrameTableMeta]:
|
|
71
|
+
table_meta = build_dataframe_table_meta(df, ignore_index)
|
|
70
72
|
df = df.copy() if callable(getattr(df, "copy", None)) else df
|
|
71
73
|
if table_meta.type in (OutputType.dataframe, OutputType.series):
|
|
72
74
|
if table_meta.type == OutputType.series:
|
|
73
75
|
df = df.to_frame("_data" if df.name is None else df.name)
|
|
74
76
|
df.columns = pd.Index(table_meta.table_column_names)
|
|
75
|
-
|
|
77
|
+
if not ignore_index:
|
|
78
|
+
df = df.rename_axis(table_meta.table_index_column_names).reset_index()
|
|
79
|
+
elif ignore_index:
|
|
80
|
+
df = pd.DataFrame([], columns=[])
|
|
76
81
|
elif table_meta.type == OutputType.index:
|
|
77
82
|
names = [f"_idx_{idx}" for idx in range(len(df.names))]
|
|
78
83
|
df = df.to_frame(name=names[0] if len(names) == 1 else names)
|
maxframe/odpsio/schema.py
CHANGED
|
@@ -175,7 +175,9 @@ def _scalar_as_index(df_obj: Any) -> pd.Index:
|
|
|
175
175
|
|
|
176
176
|
|
|
177
177
|
def pandas_to_odps_schema(
|
|
178
|
-
df_obj: Any,
|
|
178
|
+
df_obj: Any,
|
|
179
|
+
unknown_as_string: bool = False,
|
|
180
|
+
ignore_index=False,
|
|
179
181
|
) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
|
|
180
182
|
from .. import dataframe as md
|
|
181
183
|
from .arrow import pandas_to_arrow
|
|
@@ -209,7 +211,7 @@ def pandas_to_odps_schema(
|
|
|
209
211
|
else:
|
|
210
212
|
empty_df_obj = df_obj
|
|
211
213
|
|
|
212
|
-
arrow_data, table_meta = pandas_to_arrow(empty_df_obj)
|
|
214
|
+
arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
|
|
213
215
|
return (
|
|
214
216
|
arrow_schema_to_odps_schema(
|
|
215
217
|
arrow_data.schema, unknown_as_string=unknown_as_string
|
|
@@ -268,7 +270,9 @@ def build_table_column_name(
|
|
|
268
270
|
return col_name
|
|
269
271
|
|
|
270
272
|
|
|
271
|
-
def build_dataframe_table_meta(
|
|
273
|
+
def build_dataframe_table_meta(
|
|
274
|
+
df_obj: Any, ignore_index: bool = False
|
|
275
|
+
) -> DataFrameTableMeta:
|
|
272
276
|
from .. import dataframe as md
|
|
273
277
|
|
|
274
278
|
col_to_count = defaultdict(lambda: 0)
|
|
@@ -285,6 +289,8 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
|
|
|
285
289
|
else: # pragma: no cover
|
|
286
290
|
raise TypeError(f"Cannot accept type {type(df_obj)}")
|
|
287
291
|
|
|
292
|
+
assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
|
|
293
|
+
|
|
288
294
|
if obj_type == OutputType.scalar:
|
|
289
295
|
pd_dtypes = pd.Series([])
|
|
290
296
|
column_index_names = []
|
|
@@ -340,12 +346,19 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
|
|
|
340
346
|
else:
|
|
341
347
|
index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
|
|
342
348
|
|
|
349
|
+
if ignore_index:
|
|
350
|
+
table_index_column_names = []
|
|
351
|
+
pd_index_dtypes = pd.Series([], index=[])
|
|
352
|
+
else:
|
|
353
|
+
table_index_column_names = [f"_idx_{i}" for i in range(len(index_obj.names))]
|
|
354
|
+
pd_index_dtypes = index_dtypes
|
|
355
|
+
|
|
343
356
|
return DataFrameTableMeta(
|
|
344
357
|
table_name=table_name,
|
|
345
358
|
type=obj_type,
|
|
346
359
|
table_column_names=final_sql_columns,
|
|
347
|
-
table_index_column_names=
|
|
360
|
+
table_index_column_names=table_index_column_names,
|
|
348
361
|
pd_column_dtypes=pd_dtypes,
|
|
349
362
|
pd_column_level_names=column_index_names,
|
|
350
|
-
pd_index_dtypes=
|
|
363
|
+
pd_index_dtypes=pd_index_dtypes,
|
|
351
364
|
)
|
maxframe/odpsio/tests/test_schema.py
CHANGED
|
@@ -61,6 +61,16 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
|
|
|
61
61
|
assert meta.pd_column_level_names == [None]
|
|
62
62
|
assert meta.pd_index_level_names == [None]
|
|
63
63
|
|
|
64
|
+
test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
65
|
+
schema, meta = pandas_to_odps_schema(test_df, ignore_index=True)
|
|
66
|
+
assert [c.name for c in schema.columns] == list(test_df.dtypes.index.str.lower())
|
|
67
|
+
assert [c.type.name for c in schema.columns] == ["double"] * len(test_df.columns)
|
|
68
|
+
assert meta.type == OutputType.dataframe
|
|
69
|
+
assert meta.table_column_names == list(test_df.dtypes.index.str.lower())
|
|
70
|
+
assert meta.table_index_column_names == []
|
|
71
|
+
assert meta.pd_column_level_names == [None]
|
|
72
|
+
assert meta.pd_index_level_names == []
|
|
73
|
+
|
|
64
74
|
data.columns = pd.MultiIndex.from_tuples(
|
|
65
75
|
[("A", "A"), ("A", "B"), ("A", "C"), ("B", "A"), ("B", "B")], names=["c1", "c2"]
|
|
66
76
|
)
|
|
@@ -99,6 +109,15 @@ def test_pandas_to_odps_schema_series(wrap_obj):
|
|
|
99
109
|
assert meta.pd_column_level_names == [None]
|
|
100
110
|
assert meta.pd_index_level_names == [None]
|
|
101
111
|
|
|
112
|
+
schema, meta = pandas_to_odps_schema(test_s, ignore_index=True)
|
|
113
|
+
assert [c.name for c in schema.columns] == ["_data"]
|
|
114
|
+
assert [c.type.name for c in schema.columns] == ["double"]
|
|
115
|
+
assert meta.type == OutputType.series
|
|
116
|
+
assert meta.table_column_names == ["_data"]
|
|
117
|
+
assert meta.table_index_column_names == []
|
|
118
|
+
assert meta.pd_column_level_names == [None]
|
|
119
|
+
assert meta.pd_index_level_names == []
|
|
120
|
+
|
|
102
121
|
data.index = pd.MultiIndex.from_arrays(
|
|
103
122
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
104
123
|
names=["c1", "c2"],
|
|
@@ -130,6 +149,9 @@ def test_pandas_to_odps_schema_index(wrap_obj):
|
|
|
130
149
|
assert meta.pd_column_level_names == []
|
|
131
150
|
assert meta.pd_index_level_names == [None]
|
|
132
151
|
|
|
152
|
+
with pytest.raises(AssertionError):
|
|
153
|
+
pandas_to_odps_schema(test_idx, unknown_as_string=True, ignore_index=True)
|
|
154
|
+
|
|
133
155
|
data = pd.MultiIndex.from_arrays(
|
|
134
156
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
135
157
|
names=["c1", "c2"],
|
|
@@ -159,6 +181,9 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
159
181
|
assert meta.pd_column_level_names == []
|
|
160
182
|
assert meta.pd_index_level_names == [None]
|
|
161
183
|
|
|
184
|
+
with pytest.raises(AssertionError):
|
|
185
|
+
pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
|
|
186
|
+
|
|
162
187
|
|
|
163
188
|
def test_odps_arrow_schema_conversion():
|
|
164
189
|
odps_schema = odps_types.OdpsSchema(
|
maxframe/opcodes.py
CHANGED
|
@@ -564,6 +564,11 @@ CHOLESKY_FUSE = 999988
|
|
|
564
564
|
# MaxFrame-dedicated functions
|
|
565
565
|
DATAFRAME_RESHUFFLE = 10001
|
|
566
566
|
|
|
567
|
+
# MaxFrame internal operators
|
|
568
|
+
DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
|
|
569
|
+
GROUPBY_AGGR_SAME_INDEX_MERGE = 100002
|
|
570
|
+
DATAFRAME_ILOC_GET_AND_RENAME_ITEM = 100003
|
|
571
|
+
|
|
567
572
|
# fetches
|
|
568
573
|
FETCH_SHUFFLE = 999998
|
|
569
574
|
FETCH = 999999
|
maxframe/session.py
CHANGED
|
@@ -1211,7 +1211,7 @@ def new_session(
|
|
|
1211
1211
|
# load third party extensions.
|
|
1212
1212
|
ensure_isolation_created(kwargs)
|
|
1213
1213
|
|
|
1214
|
-
odps_entry = odps_entry or ODPS.from_environments()
|
|
1214
|
+
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
1215
1215
|
if address is None:
|
|
1216
1216
|
from maxframe_client.session.consts import ODPS_SESSION_INSECURE_SCHEME
|
|
1217
1217
|
|
|
@@ -1255,7 +1255,9 @@ def get_default_or_create(**kwargs):
|
|
|
1255
1255
|
if session is None:
|
|
1256
1256
|
# no session attached, try to create one
|
|
1257
1257
|
warnings.warn(warning_msg)
|
|
1258
|
-
session = new_session(
|
|
1258
|
+
session = new_session(
|
|
1259
|
+
ODPS.from_global() or ODPS.from_environments(), **kwargs
|
|
1260
|
+
)
|
|
1259
1261
|
session.as_default()
|
|
1260
1262
|
if isinstance(session, IsolatedAsyncSession):
|
|
1261
1263
|
session = SyncSession.from_isolated_session(session)
|
maxframe/utils.py
CHANGED
|
@@ -381,6 +381,11 @@ def build_temp_table_name(session_id: str, tileable_key: str) -> str:
|
|
|
381
381
|
return f"tmp_mf_{session_id}_{tileable_key}"
|
|
382
382
|
|
|
383
383
|
|
|
384
|
+
def build_temp_intermediate_table_name(session_id: str, tileable_key: str) -> str:
|
|
385
|
+
temp_table = build_temp_table_name(session_id, tileable_key)
|
|
386
|
+
return f"{temp_table}_intermediate"
|
|
387
|
+
|
|
388
|
+
|
|
384
389
|
def build_session_volume_name(session_id: str) -> str:
|
|
385
390
|
return f"mf_vol_{session_id}"
|
|
386
391
|
|