maxframe 0.1.0b3__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.1.0b4__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. More details are available on the original package page (the link was not preserved in this text capture).

maxframe/config/config.py CHANGED
@@ -358,6 +358,9 @@ default_options.register_option(
358
358
  default_options.register_option(
359
359
  "show_progress", "auto", validator=any_validator(is_bool, is_string)
360
360
  )
361
+ default_options.register_option(
362
+ "dag.settings", value=dict(), validator=is_dict, remote=True
363
+ )
361
364
 
362
365
  ################
363
366
  # SPE Settings #
@@ -57,6 +57,11 @@ try:
57
57
  except ImportError: # pragma: no cover
58
58
  pass
59
59
 
60
+ try:
61
+ from . import _internal
62
+ except ImportError: # pragma: no cover
63
+ pass
64
+
60
65
  del (
61
66
  arithmetic,
62
67
  datasource,
@@ -960,7 +960,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
960
960
  buf = StringIO()
961
961
  max_rows = pd.get_option("display.max_rows")
962
962
  corner_max_rows = (
963
- max_rows if self.shape[0] <= max_rows else corner_data.shape[0] - 1
963
+ max_rows
964
+ if self.shape[0] <= max_rows or corner_data.shape[0] == 0
965
+ else corner_data.shape[0] - 1
964
966
  ) # make sure max_rows < corner_data
965
967
 
966
968
  with pd.option_context("display.max_rows", corner_max_rows):
@@ -1605,7 +1607,7 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
1605
1607
  buf = StringIO()
1606
1608
  max_rows = pd.get_option("display.max_rows")
1607
1609
 
1608
- if self.shape[0] <= max_rows:
1610
+ if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
1609
1611
  buf.write(repr(corner_data) if representation else str(corner_data))
1610
1612
  else:
1611
1613
  # remember we cannot directly call repr(df),
@@ -263,7 +263,9 @@ def read_odps_query(
263
263
  result: DataFrame
264
264
  DataFrame read from MaxCompute (ODPS) table
265
265
  """
266
- odps_entry = odps_entry or ODPS.from_environments()
266
+ odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
267
+ if odps_entry is None:
268
+ raise ValueError("Missing odps_entry parameter")
267
269
  inst = odps_entry.execute_sql(f"EXPLAIN {query}")
268
270
  explain_str = list(inst.get_task_results().values())[0]
269
271
 
@@ -164,6 +164,8 @@ def read_odps_table(
164
164
  DataFrame read from MaxCompute (ODPS) table
165
165
  """
166
166
  odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
167
+ if odps_entry is None:
168
+ raise ValueError("Missing odps_entry parameter")
167
169
  if isinstance(table_name, Table):
168
170
  table = table_name
169
171
  else:
@@ -0,0 +1,19 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
16
+
17
+
18
+ class DataFrameDataStore(DataFrameOperator, DataFrameOperatorMixin):
19
+ pass
@@ -23,11 +23,11 @@ from ...serialization.serializables import (
23
23
  ListField,
24
24
  StringField,
25
25
  )
26
- from ..operators import DataFrameOperator, DataFrameOperatorMixin
27
26
  from ..utils import parse_index
27
+ from .core import DataFrameDataStore
28
28
 
29
29
 
30
- class DataFrameToCSV(DataFrameOperator, DataFrameOperatorMixin):
30
+ class DataFrameToCSV(DataFrameDataStore):
31
31
  _op_type_ = opcodes.TO_CSV
32
32
 
33
33
  input = KeyField("input")
@@ -32,13 +32,13 @@ from ...serialization.serializables import (
32
32
  )
33
33
  from ...typing_ import TileableType
34
34
  from ..core import DataFrame # noqa: F401
35
- from ..operators import DataFrameOperator, DataFrameOperatorMixin
36
35
  from ..utils import parse_index
36
+ from .core import DataFrameDataStore
37
37
 
38
38
  logger = logging.getLogger(__name__)
39
39
 
40
40
 
41
- class DataFrameToODPSTable(DataFrameOperator, DataFrameOperatorMixin):
41
+ class DataFrameToODPSTable(DataFrameDataStore):
42
42
  _op_type_ = opcodes.TO_ODPS_TABLE
43
43
 
44
44
  dtypes = SeriesField("dtypes")
@@ -107,7 +107,6 @@ def df_reset_index(
107
107
  inplace=False,
108
108
  col_level=0,
109
109
  col_fill="",
110
- incremental_index=False,
111
110
  ):
112
111
  """
113
112
  Reset the index, or a level of it.
@@ -133,12 +132,6 @@ def df_reset_index(
133
132
  col_fill : object, default ''
134
133
  If the columns have multiple levels, determines how the other
135
134
  levels are named. If None then the index name is repeated.
136
- incremental_index: bool, default False
137
- Ensure RangeIndex incremental, when output DataFrame has multiple chunks,
138
- ensuring index incremental costs more computation,
139
- so by default, each chunk will have index which starts from 0,
140
- setting incremental_index=True,reset_index will guarantee that
141
- output DataFrame's index is from 0 to n - 1.
142
135
 
143
136
  Returns
144
137
  -------
@@ -264,7 +257,6 @@ def df_reset_index(
264
257
  drop=drop,
265
258
  col_level=col_level,
266
259
  col_fill=col_fill,
267
- incremental_index=incremental_index,
268
260
  output_types=[OutputType.dataframe],
269
261
  )
270
262
  ret = op(df)
@@ -280,7 +272,6 @@ def series_reset_index(
280
272
  drop=False,
281
273
  name=no_default,
282
274
  inplace=False,
283
- incremental_index=False,
284
275
  ):
285
276
  """
286
277
  Generate a new DataFrame or Series with the index reset.
@@ -303,12 +294,6 @@ def series_reset_index(
303
294
  when `drop` is True.
304
295
  inplace : bool, default False
305
296
  Modify the Series in place (do not create a new object).
306
- incremental_index: bool, default False
307
- Ensure RangeIndex incremental, when output Series has multiple chunks,
308
- ensuring index incremental costs more computation,
309
- so by default, each chunk will have index which starts from 0,
310
- setting incremental_index=True,reset_index will guarantee that
311
- output Series's index is from 0 to n - 1.
312
297
 
313
298
  Returns
314
299
  -------
@@ -406,8 +391,7 @@ def series_reset_index(
406
391
  level=level,
407
392
  drop=drop,
408
393
  name=name,
409
- incremental_index=incremental_index,
410
- output_types=[OutputType.series],
394
+ output_types=[OutputType.series if drop else OutputType.dataframe],
411
395
  )
412
396
  ret = op(series)
413
397
  if not inplace:
maxframe/odpsio/arrow.py CHANGED
@@ -65,14 +65,19 @@ def arrow_to_pandas(
65
65
  raise ValueError(f"Does not support meta type {table_meta.type!r}")
66
66
 
67
67
 
68
- def pandas_to_arrow(df: Any, nthreads=1) -> Tuple[ArrowTableType, DataFrameTableMeta]:
69
- table_meta = build_dataframe_table_meta(df)
68
+ def pandas_to_arrow(
69
+ df: Any, nthreads=1, ignore_index=False
70
+ ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
71
+ table_meta = build_dataframe_table_meta(df, ignore_index)
70
72
  df = df.copy() if callable(getattr(df, "copy", None)) else df
71
73
  if table_meta.type in (OutputType.dataframe, OutputType.series):
72
74
  if table_meta.type == OutputType.series:
73
75
  df = df.to_frame("_data" if df.name is None else df.name)
74
76
  df.columns = pd.Index(table_meta.table_column_names)
75
- df = df.rename_axis(table_meta.table_index_column_names).reset_index()
77
+ if not ignore_index:
78
+ df = df.rename_axis(table_meta.table_index_column_names).reset_index()
79
+ elif ignore_index:
80
+ df = pd.DataFrame([], columns=[])
76
81
  elif table_meta.type == OutputType.index:
77
82
  names = [f"_idx_{idx}" for idx in range(len(df.names))]
78
83
  df = df.to_frame(name=names[0] if len(names) == 1 else names)
maxframe/odpsio/schema.py CHANGED
@@ -175,7 +175,9 @@ def _scalar_as_index(df_obj: Any) -> pd.Index:
175
175
 
176
176
 
177
177
  def pandas_to_odps_schema(
178
- df_obj: Any, unknown_as_string: bool = False
178
+ df_obj: Any,
179
+ unknown_as_string: bool = False,
180
+ ignore_index=False,
179
181
  ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
180
182
  from .. import dataframe as md
181
183
  from .arrow import pandas_to_arrow
@@ -209,7 +211,7 @@ def pandas_to_odps_schema(
209
211
  else:
210
212
  empty_df_obj = df_obj
211
213
 
212
- arrow_data, table_meta = pandas_to_arrow(empty_df_obj)
214
+ arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
213
215
  return (
214
216
  arrow_schema_to_odps_schema(
215
217
  arrow_data.schema, unknown_as_string=unknown_as_string
@@ -268,7 +270,9 @@ def build_table_column_name(
268
270
  return col_name
269
271
 
270
272
 
271
- def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
273
+ def build_dataframe_table_meta(
274
+ df_obj: Any, ignore_index: bool = False
275
+ ) -> DataFrameTableMeta:
272
276
  from .. import dataframe as md
273
277
 
274
278
  col_to_count = defaultdict(lambda: 0)
@@ -285,6 +289,8 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
285
289
  else: # pragma: no cover
286
290
  raise TypeError(f"Cannot accept type {type(df_obj)}")
287
291
 
292
+ assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
293
+
288
294
  if obj_type == OutputType.scalar:
289
295
  pd_dtypes = pd.Series([])
290
296
  column_index_names = []
@@ -340,12 +346,19 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
340
346
  else:
341
347
  index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
342
348
 
349
+ if ignore_index:
350
+ table_index_column_names = []
351
+ pd_index_dtypes = pd.Series([], index=[])
352
+ else:
353
+ table_index_column_names = [f"_idx_{i}" for i in range(len(index_obj.names))]
354
+ pd_index_dtypes = index_dtypes
355
+
343
356
  return DataFrameTableMeta(
344
357
  table_name=table_name,
345
358
  type=obj_type,
346
359
  table_column_names=final_sql_columns,
347
- table_index_column_names=[f"_idx_{i}" for i in range(len(index_obj.names))],
360
+ table_index_column_names=table_index_column_names,
348
361
  pd_column_dtypes=pd_dtypes,
349
362
  pd_column_level_names=column_index_names,
350
- pd_index_dtypes=index_dtypes,
363
+ pd_index_dtypes=pd_index_dtypes,
351
364
  )
@@ -61,6 +61,16 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
61
61
  assert meta.pd_column_level_names == [None]
62
62
  assert meta.pd_index_level_names == [None]
63
63
 
64
+ test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
65
+ schema, meta = pandas_to_odps_schema(test_df, ignore_index=True)
66
+ assert [c.name for c in schema.columns] == list(test_df.dtypes.index.str.lower())
67
+ assert [c.type.name for c in schema.columns] == ["double"] * len(test_df.columns)
68
+ assert meta.type == OutputType.dataframe
69
+ assert meta.table_column_names == list(test_df.dtypes.index.str.lower())
70
+ assert meta.table_index_column_names == []
71
+ assert meta.pd_column_level_names == [None]
72
+ assert meta.pd_index_level_names == []
73
+
64
74
  data.columns = pd.MultiIndex.from_tuples(
65
75
  [("A", "A"), ("A", "B"), ("A", "C"), ("B", "A"), ("B", "B")], names=["c1", "c2"]
66
76
  )
@@ -99,6 +109,15 @@ def test_pandas_to_odps_schema_series(wrap_obj):
99
109
  assert meta.pd_column_level_names == [None]
100
110
  assert meta.pd_index_level_names == [None]
101
111
 
112
+ schema, meta = pandas_to_odps_schema(test_s, ignore_index=True)
113
+ assert [c.name for c in schema.columns] == ["_data"]
114
+ assert [c.type.name for c in schema.columns] == ["double"]
115
+ assert meta.type == OutputType.series
116
+ assert meta.table_column_names == ["_data"]
117
+ assert meta.table_index_column_names == []
118
+ assert meta.pd_column_level_names == [None]
119
+ assert meta.pd_index_level_names == []
120
+
102
121
  data.index = pd.MultiIndex.from_arrays(
103
122
  [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
104
123
  names=["c1", "c2"],
@@ -130,6 +149,9 @@ def test_pandas_to_odps_schema_index(wrap_obj):
130
149
  assert meta.pd_column_level_names == []
131
150
  assert meta.pd_index_level_names == [None]
132
151
 
152
+ with pytest.raises(AssertionError):
153
+ pandas_to_odps_schema(test_idx, unknown_as_string=True, ignore_index=True)
154
+
133
155
  data = pd.MultiIndex.from_arrays(
134
156
  [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
135
157
  names=["c1", "c2"],
@@ -159,6 +181,9 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
159
181
  assert meta.pd_column_level_names == []
160
182
  assert meta.pd_index_level_names == [None]
161
183
 
184
+ with pytest.raises(AssertionError):
185
+ pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
186
+
162
187
 
163
188
  def test_odps_arrow_schema_conversion():
164
189
  odps_schema = odps_types.OdpsSchema(
maxframe/opcodes.py CHANGED
@@ -564,6 +564,11 @@ CHOLESKY_FUSE = 999988
564
564
  # MaxFrame-dedicated functions
565
565
  DATAFRAME_RESHUFFLE = 10001
566
566
 
567
+ # MaxFrame internal operators
568
+ DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
569
+ GROUPBY_AGGR_SAME_INDEX_MERGE = 100002
570
+ DATAFRAME_ILOC_GET_AND_RENAME_ITEM = 100003
571
+
567
572
  # fetches
568
573
  FETCH_SHUFFLE = 999998
569
574
  FETCH = 999999
maxframe/session.py CHANGED
@@ -1211,7 +1211,7 @@ def new_session(
1211
1211
  # load third party extensions.
1212
1212
  ensure_isolation_created(kwargs)
1213
1213
 
1214
- odps_entry = odps_entry or ODPS.from_environments()
1214
+ odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
1215
1215
  if address is None:
1216
1216
  from maxframe_client.session.consts import ODPS_SESSION_INSECURE_SCHEME
1217
1217
 
@@ -1255,7 +1255,9 @@ def get_default_or_create(**kwargs):
1255
1255
  if session is None:
1256
1256
  # no session attached, try to create one
1257
1257
  warnings.warn(warning_msg)
1258
- session = new_session(ODPS.from_environments(), **kwargs)
1258
+ session = new_session(
1259
+ ODPS.from_global() or ODPS.from_environments(), **kwargs
1260
+ )
1259
1261
  session.as_default()
1260
1262
  if isinstance(session, IsolatedAsyncSession):
1261
1263
  session = SyncSession.from_isolated_session(session)
maxframe/utils.py CHANGED
@@ -381,6 +381,11 @@ def build_temp_table_name(session_id: str, tileable_key: str) -> str:
381
381
  return f"tmp_mf_{session_id}_{tileable_key}"
382
382
 
383
383
 
384
+ def build_temp_intermediate_table_name(session_id: str, tileable_key: str) -> str:
385
+ temp_table = build_temp_table_name(session_id, tileable_key)
386
+ return f"{temp_table}_intermediate"
387
+
388
+
384
389
  def build_session_volume_name(session_id: str) -> str:
385
390
  return f"mf_vol_{session_id}"
386
391
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: maxframe
3
- Version: 0.1.0b3
3
+ Version: 0.1.0b4
4
4
  Summary: MaxFrame operator-based data analyze framework
5
5
  Requires-Dist: numpy >=1.19.0
6
6
  Requires-Dist: pandas >=1.0.0