maxframe 0.1.0b2__cp310-cp310-win32.whl → 0.1.0b4__cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of maxframe might be problematic.

Files changed (42)
  1. maxframe/_utils.cp310-win32.pyd +0 -0
  2. maxframe/codegen.py +88 -19
  3. maxframe/config/config.py +9 -0
  4. maxframe/core/entity/executable.py +1 -0
  5. maxframe/core/entity/objects.py +3 -2
  6. maxframe/core/graph/core.cp310-win32.pyd +0 -0
  7. maxframe/dataframe/__init__.py +7 -1
  8. maxframe/dataframe/core.py +4 -2
  9. maxframe/dataframe/datasource/read_odps_query.py +4 -2
  10. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  11. maxframe/dataframe/datasource/tests/test_datasource.py +22 -0
  12. maxframe/dataframe/datastore/core.py +19 -0
  13. maxframe/dataframe/datastore/to_csv.py +2 -2
  14. maxframe/dataframe/datastore/to_odps.py +2 -2
  15. maxframe/dataframe/groupby/__init__.py +1 -0
  16. maxframe/dataframe/groupby/core.py +5 -0
  17. maxframe/dataframe/indexing/reset_index.py +1 -17
  18. maxframe/lib/aio/isolation.py +6 -1
  19. maxframe/lib/mmh3.cp310-win32.pyd +0 -0
  20. maxframe/odpsio/arrow.py +8 -3
  21. maxframe/odpsio/schema.py +18 -5
  22. maxframe/odpsio/tests/test_schema.py +25 -0
  23. maxframe/opcodes.py +5 -0
  24. maxframe/protocol.py +7 -0
  25. maxframe/serialization/core.cp310-win32.pyd +0 -0
  26. maxframe/serialization/serializables/core.py +6 -1
  27. maxframe/serialization/serializables/field.py +2 -0
  28. maxframe/session.py +4 -2
  29. maxframe/tensor/core.py +3 -3
  30. maxframe/tests/test_codegen.py +69 -0
  31. maxframe/tests/test_protocol.py +16 -8
  32. maxframe/tests/utils.py +1 -0
  33. maxframe/utils.py +20 -1
  34. {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/METADATA +1 -1
  35. {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/RECORD +42 -40
  36. maxframe_client/clients/framedriver.py +7 -7
  37. maxframe_client/session/odps.py +11 -10
  38. maxframe_client/session/task.py +8 -1
  39. maxframe_client/session/tests/test_task.py +29 -11
  40. maxframe_client/tests/test_session.py +23 -0
  41. {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/WHEEL +0 -0
  42. {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp310-win32.pyd CHANGED
Binary file
maxframe/codegen.py CHANGED
@@ -17,7 +17,7 @@ import base64
 import dataclasses
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
 from odps.types import OdpsSchema
 from odps.utils import camel_to_underline
@@ -30,6 +30,7 @@ from .odpsio import build_dataframe_table_meta
 from .odpsio.schema import pandas_to_odps_schema
 from .protocol import DataFrameTableMeta, ResultInfo
 from .serialization import PickleContainer
+from .serialization.serializables import Serializable, StringField
 from .typing_ import PandasObjectTypes
 from .udf import MarkedFunction
 
@@ -48,8 +49,11 @@ class CodeGenResult:
     constants: Dict[str, Any]
 
 
-class AbstractUDF(abc.ABC):
-    _session_id: str
+class AbstractUDF(Serializable):
+    _session_id: str = StringField("session_id")
+
+    def __init__(self, session_id: Optional[str] = None, **kw):
+        super().__init__(_session_id=session_id, **kw)
 
     @property
     def name(self) -> str:
@@ -74,7 +78,66 @@ class AbstractUDF(abc.ABC):
 
 class UserCodeMixin:
     @classmethod
-    def generate_pickled_codes(cls, code_to_pickle: Any) -> List[str]:
+    def obj_to_python_expr(cls, obj: Any = None) -> str:
+        """
+        Parameters
+        ----------
+        obj
+            The object to convert to python expr.
+        Returns
+        -------
+        str :
+            The str type content equals to the object when use in the python code directly.
+        """
+        if obj is None:
+            return "None"
+
+        if isinstance(obj, (int, float)):
+            return repr(obj)
+
+        if isinstance(obj, bool):
+            return "True" if obj else "False"
+
+        if isinstance(obj, bytes):
+            base64_bytes = base64.b64encode(obj)
+            return f"base64.b64decode({base64_bytes})"
+
+        if isinstance(obj, str):
+            return repr(obj)
+
+        if isinstance(obj, list):
+            return (
+                f"[{', '.join([cls.obj_to_python_expr(element) for element in obj])}]"
+            )
+
+        if isinstance(obj, dict):
+            items = (
+                f"{repr(key)}: {cls.obj_to_python_expr(value)}"
+                for key, value in obj.items()
+            )
+            return f"{{{', '.join(items)}}}"
+
+        if isinstance(obj, tuple):
+            return f"({', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}{',' if len(obj) == 1 else ''})"
+
+        if isinstance(obj, set):
+            return (
+                f"{{{', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}}}"
+                if obj
+                else "set()"
+            )
+
+        if isinstance(obj, PickleContainer):
+            return UserCodeMixin.generate_pickled_codes(obj, None)
+
+        raise ValueError(f"not support arg type {type(obj)}")
+
+    @classmethod
+    def generate_pickled_codes(
+        cls,
+        code_to_pickle: Any,
+        unpicked_data_var_name: Union[str, None] = "pickled_data",
+    ) -> str:
         """
         Generate pickled codes. The final pickled variable is called 'pickled_data'.
 
@@ -82,20 +145,20 @@ class UserCodeMixin:
         ----------
         code_to_pickle: Any
             The code to be pickled.
+        unpicked_data_var_name: str
+            The variables in code used to hold the loads object from the cloudpickle
 
         Returns
         -------
-        List[str] :
-            The code snippets of pickling, the final variable is called 'pickled_data'.
+        str :
+            The code snippets of pickling, the final variable is called 'pickled_data' by default.
         """
         pickled, buffers = cls.dump_pickled_data(code_to_pickle)
-        pickled = base64.b64encode(pickled)
-        buffers = [base64.b64encode(b) for b in buffers]
-        buffers_str = ", ".join(f"base64.b64decode(b'{b.decode()}')" for b in buffers)
-        return [
-            f"base64_data = base64.b64decode(b'{pickled.decode()}')",
-            f"pickled_data = cloudpickle.loads(base64_data, buffers=[{buffers_str}])",
-        ]
+        pickle_loads_expr = f"cloudpickle.loads({cls.obj_to_python_expr(pickled)}, buffers={cls.obj_to_python_expr(buffers)})"
+        if unpicked_data_var_name:
+            return f"{unpicked_data_var_name} = {pickle_loads_expr}"
+
+        return pickle_loads_expr
 
     @staticmethod
     def dump_pickled_data(
@@ -114,8 +177,9 @@ class UserCodeMixin:
 
 
 class BigDagCodeContext(metaclass=abc.ABCMeta):
-    def __init__(self, session_id: str = None):
+    def __init__(self, session_id: str = None, subdag_id: str = None):
         self._session_id = session_id
+        self._subdag_id = subdag_id
         self._tileable_key_to_variables = dict()
         self.constants = dict()
         self._data_table_meta_cache = dict()
@@ -142,10 +206,14 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         except KeyError:
             var_name = self._tileable_key_to_variables[
                 tileable.key
-            ] = f"var_{self._next_var_id}"
-            self._next_var_id += 1
+            ] = self.next_var_name()
         return var_name
 
+    def next_var_name(self) -> str:
+        var_name = f"var_{self._next_var_id}"
+        self._next_var_id += 1
+        return var_name
+
     def get_odps_schema(
         self, data: PandasObjectTypes, unknown_as_string: bool = False
     ) -> OdpsSchema:
@@ -275,9 +343,10 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     engine_priority: int = 0
     _extension_loaded = False
 
-    def __init__(self, session_id: str):
+    def __init__(self, session_id: str, subdag_id: str = None):
         self._session_id = session_id
-        self._context = self._init_context(session_id)
+        self._subdag_id = subdag_id
+        self._context = self._init_context(session_id, subdag_id)
 
     @classmethod
     def _load_engine_extensions(cls):
@@ -307,7 +376,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
        raise NotImplementedError
 
     @abc.abstractmethod
-    def _init_context(self, session_id: str) -> BigDagCodeContext:
+    def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
        raise NotImplementedError
 
     def _generate_comments(
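For reference, the new obj_to_python_expr helper renders plain Python values as source text that the code generator can embed directly, and generate_pickled_codes now builds on it to emit a single cloudpickle.loads(...) expression. A condensed standalone sketch of the same idea (the function name below is illustrative, not part of the maxframe API):

import base64


def to_python_expr(obj):
    # Condensed sketch: render a plain Python value as a source-code expression,
    # in the spirit of UserCodeMixin.obj_to_python_expr above (bool is checked
    # before int here because bool is a subclass of int).
    if obj is None:
        return "None"
    if isinstance(obj, bool):
        return "True" if obj else "False"
    if isinstance(obj, (int, float, str)):
        return repr(obj)
    if isinstance(obj, bytes):
        return f"base64.b64decode({base64.b64encode(obj)!r})"
    if isinstance(obj, list):
        return "[" + ", ".join(to_python_expr(v) for v in obj) + "]"
    if isinstance(obj, dict):
        items = (f"{k!r}: {to_python_expr(v)}" for k, v in obj.items())
        return "{" + ", ".join(items) + "}"
    raise ValueError(f"unsupported type {type(obj)}")


print(to_python_expr({"a": [1, 2.5], "flag": b"\x00"}))
# {'a': [1, 2.5], 'flag': base64.b64decode(b'AA==')}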
maxframe/config/config.py CHANGED
@@ -340,6 +340,12 @@ default_options.register_option(
     validator=is_integer,
     remote=True,
 )
+default_options.register_option(
+    "session.subinstance_priority",
+    None,
+    validator=any_validator(is_null, is_integer),
+    remote=True,
+)
 
 default_options.register_option("warn_duplicated_execution", False, validator=is_bool)
 default_options.register_option("dataframe.use_arrow_dtype", True, validator=is_bool)
@@ -352,6 +358,9 @@ default_options.register_option(
 default_options.register_option(
     "show_progress", "auto", validator=any_validator(is_bool, is_string)
 )
+default_options.register_option(
+    "dag.settings", value=dict(), validator=is_dict, remote=True
+)
 
 ################
 # SPE Settings #
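Both options registered above are marked remote=True, so values set on the client travel with the submitted DAG. A hedged usage sketch, assuming the settings object is exposed as maxframe.options like the rest of the configuration, and using an arbitrary example key for dag.settings:

from maxframe import options  # assumption: public alias of the default options object

options.session.subinstance_priority = 3  # accepts None or an integer
options.dag.settings = {"odps.stage.mapper.split.size": 64}  # free-form dict, passed through as-is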
maxframe/core/entity/executable.py CHANGED
@@ -66,6 +66,7 @@ class DecrefRunner:
         if self._decref_thread:  # pragma: no branch
             self._queue.put_nowait((None, None, None))
             self._decref_thread.join(1)
+            self._decref_thread = None
 
     def put(self, key: str, session_ref: ref):
         if self._decref_thread is None:
maxframe/core/entity/objects.py CHANGED
@@ -15,6 +15,7 @@
 from typing import Any, Dict
 
 from ...serialization.serializables import FieldTypes, ListField
+from ...utils import skip_na_call
 from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
@@ -62,8 +63,8 @@ class ObjectData(TileableData, _ToObjectMixin):
     _chunks = ListField(
         "chunks",
         FieldTypes.reference(ObjectChunkData),
-        on_serialize=lambda x: [it.data for it in x] if x is not None else x,
-        on_deserialize=lambda x: [ObjectChunk(it) for it in x] if x is not None else x,
+        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
+        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
     )
 
     def __init__(self, op=None, nsplits=None, **kw):
maxframe/core/graph/core.cp310-win32.pyd CHANGED
Binary file
maxframe/dataframe/__init__.py CHANGED
@@ -39,6 +39,7 @@ from .datasource.read_odps_query import read_odps_query
 from .datasource.read_odps_table import read_odps_table
 from .datasource.read_parquet import read_parquet
 from .datastore.to_odps import to_odps_table
+from .groupby import NamedAgg
 from .initializer import DataFrame, Index, Series, read_pandas
 from .merge import concat, merge
 from .misc.cut import cut
@@ -52,7 +53,12 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime
 
 try:
-    from pandas import NA, NamedAgg, Timestamp
+    from pandas import NA, Timestamp
+except ImportError:  # pragma: no cover
+    pass
+
+try:
+    from . import _internal
 except ImportError:  # pragma: no cover
     pass
 
maxframe/dataframe/core.py CHANGED
@@ -960,7 +960,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")
         corner_max_rows = (
-            max_rows if self.shape[0] <= max_rows else corner_data.shape[0] - 1
+            max_rows
+            if self.shape[0] <= max_rows or corner_data.shape[0] == 0
+            else corner_data.shape[0] - 1
         )  # make sure max_rows < corner_data
 
         with pd.option_context("display.max_rows", corner_max_rows):
@@ -1605,7 +1607,7 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")
 
-        if self.shape[0] <= max_rows:
+        if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
             buf.write(repr(corner_data) if representation else str(corner_data))
         else:
             # remember we cannot directly call repr(df),
maxframe/dataframe/datasource/read_odps_query.py CHANGED
@@ -46,7 +46,7 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([^ ]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
 
 
 @dataclasses.dataclass
@@ -263,7 +263,9 @@ def read_odps_query(
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
-    odps_entry = odps_entry or ODPS.from_environments()
+    odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     inst = odps_entry.execute_sql(f"EXPLAIN {query}")
     explain_str = list(inst.get_task_results().values())[0]
 
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -69,7 +69,7 @@ class DataFrameReadODPSTable(
         return getattr(self, "partition_spec", None)
 
     def get_columns(self):
-        return self.columns
+        return self.columns or list(self.dtypes.index)
 
     def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
         self.columns = columns
@@ -164,6 +164,8 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
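Both readers now resolve the MaxCompute entry in the same order: the explicit odps_entry argument, then the global entry, then environment variables, raising only when all three are absent. A hedged sketch of the global-entry path, relying on PyODPS's to_global()/from_global() (credentials and endpoint below are placeholders):

from odps import ODPS
import maxframe.dataframe as md

o = ODPS("ACCESS_ID", "SECRET_ACCESS_KEY", project="my_project",
         endpoint="https://service.example.com/api")  # placeholder values only
o.to_global()  # ODPS.from_global() will now return this entry

df = md.read_odps_table("my_table")  # no odps_entry argument needed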
maxframe/dataframe/datasource/tests/test_datasource.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from collections import OrderedDict
 
 import numpy as np
@@ -33,6 +34,7 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
+from ..read_odps_query import ColumnSchema, _resolve_task_sector
 from ..series import from_pandas as from_pandas_series
 
 ray = lazy_import("ray")
@@ -228,6 +230,7 @@ def test_from_odps_table():
     assert df.op.table_name == test_table.full_table_name
     assert df.index_value.name is None
     assert isinstance(df.index_value.value, IndexValue.RangeIndex)
+    assert df.op.get_columns() == ["col1", "col2", "col3"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -247,6 +250,7 @@ def test_from_odps_table():
     assert df.op.table_name == test_table.full_table_name
     assert df.index_value.name is None
     assert isinstance(df.index_value.value, IndexValue.RangeIndex)
+    assert df.op.get_columns() == ["col1", "col2"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series([np.dtype("O"), np.dtype("int64")], index=["col1", "col2"]),
@@ -257,6 +261,7 @@ def test_from_odps_table():
     assert df.index_value.name == "col1"
     assert isinstance(df.index_value.value, IndexValue.Index)
     assert df.index.dtype == np.dtype("O")
+    assert df.op.get_columns() == ["col2", "col3"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series([np.dtype("int64"), np.dtype("float64")], index=["col2", "col3"]),
@@ -267,6 +272,7 @@ def test_from_odps_table():
 
     df = read_odps_table(test_parted_table, append_partitions=True)
     assert df.op.append_partitions is True
+    assert df.op.get_columns() == ["col1", "col2", "col3", "pt"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -280,6 +286,7 @@ def test_from_odps_table():
     )
     assert df.op.append_partitions is True
     assert df.op.partitions == ["pt=20240103"]
+    assert df.op.get_columns() == ["col1", "col2", "pt"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -377,3 +384,18 @@ def test_date_range():
     assert dr.index_value.is_unique == expected.is_unique
     assert dr.index_value.is_monotonic_increasing == expected.is_monotonic_increasing
     assert dr.name == expected.name
+
+
+def test_resolve_task_sector():
+    input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 78
+    assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
+    assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
+    assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
maxframe/dataframe/datastore/core.py ADDED
@@ -0,0 +1,19 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+
+
+class DataFrameDataStore(DataFrameOperator, DataFrameOperatorMixin):
+    pass
maxframe/dataframe/datastore/to_csv.py CHANGED
@@ -23,11 +23,11 @@ from ...serialization.serializables import (
     ListField,
     StringField,
 )
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore
 
 
-class DataFrameToCSV(DataFrameOperator, DataFrameOperatorMixin):
+class DataFrameToCSV(DataFrameDataStore):
     _op_type_ = opcodes.TO_CSV
 
     input = KeyField("input")
maxframe/dataframe/datastore/to_odps.py CHANGED
@@ -32,13 +32,13 @@ from ...serialization.serializables import (
 )
 from ...typing_ import TileableType
 from ..core import DataFrame  # noqa: F401
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore
 
 logger = logging.getLogger(__name__)
 
 
-class DataFrameToODPSTable(DataFrameOperator, DataFrameOperatorMixin):
+class DataFrameToODPSTable(DataFrameDataStore):
     _op_type_ = opcodes.TO_ODPS_TABLE
 
     dtypes = SeriesField("dtypes")
maxframe/dataframe/groupby/__init__.py CHANGED
@@ -14,6 +14,7 @@
 
 # noinspection PyUnresolvedReferences
 from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy
+from .core import NamedAgg
 
 
 def _install():
maxframe/dataframe/groupby/core.py CHANGED
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import namedtuple
+
 import pandas as pd
 
 from ... import opcodes
@@ -30,6 +32,9 @@ _GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
 _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True
 
 
+NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
+
+
 class DataFrameGroupByOperator(MapReduceOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.GROUPBY
 
maxframe/dataframe/indexing/reset_index.py CHANGED
@@ -107,7 +107,6 @@ def df_reset_index(
     inplace=False,
     col_level=0,
     col_fill="",
-    incremental_index=False,
 ):
     """
     Reset the index, or a level of it.
@@ -133,12 +132,6 @@ def df_reset_index(
     col_fill : object, default ''
         If the columns have multiple levels, determines how the other
         levels are named. If None then the index name is repeated.
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output DataFrame has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True,reset_index will guarantee that
-        output DataFrame's index is from 0 to n - 1.
 
     Returns
     -------
@@ -264,7 +257,6 @@ def df_reset_index(
         drop=drop,
         col_level=col_level,
         col_fill=col_fill,
-        incremental_index=incremental_index,
         output_types=[OutputType.dataframe],
     )
     ret = op(df)
@@ -280,7 +272,6 @@ def series_reset_index(
     drop=False,
     name=no_default,
     inplace=False,
-    incremental_index=False,
 ):
     """
     Generate a new DataFrame or Series with the index reset.
@@ -303,12 +294,6 @@ def series_reset_index(
         when `drop` is True.
     inplace : bool, default False
         Modify the Series in place (do not create a new object).
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output Series has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True,reset_index will guarantee that
-        output Series's index is from 0 to n - 1.
 
     Returns
     -------
@@ -406,8 +391,7 @@ def series_reset_index(
         level=level,
         drop=drop,
         name=name,
-        incremental_index=incremental_index,
-        output_types=[OutputType.series],
+        output_types=[OutputType.series if drop else OutputType.dataframe],
     )
     ret = op(series)
     if not inplace:
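Besides removing the incremental_index parameter, series_reset_index now derives the output type from drop, matching pandas semantics: drop=False promotes the Series to a DataFrame, drop=True keeps a Series. A minimal illustration, assuming the usual maxframe.dataframe entry points:

import maxframe.dataframe as md

s = md.Series([10, 20, 30], name="value")
as_frame = s.reset_index()             # index becomes a column, result is a DataFrame
as_series = s.reset_index(drop=True)   # index is discarded, result stays a Series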
maxframe/lib/aio/isolation.py CHANGED
@@ -14,11 +14,14 @@
 
 import asyncio
 import atexit
+import itertools
 import threading
 from typing import Dict, Optional
 
 
 class Isolation:
+    _counter = itertools.count().__next__
+
     loop: asyncio.AbstractEventLoop
     _stopped: Optional[asyncio.Event]
     _thread: Optional[threading.Thread]
@@ -38,7 +41,9 @@ class Isolation:
 
     def start(self):
         if self._threaded:
-            self._thread = thread = threading.Thread(target=self._run)
+            self._thread = thread = threading.Thread(
+                name=f"IsolationThread-{self._counter()}", target=self._run
+            )
             thread.daemon = True
             thread.start()
             self._thread_ident = thread.ident
maxframe/lib/mmh3.cp310-win32.pyd CHANGED
Binary file
maxframe/odpsio/arrow.py CHANGED
@@ -65,14 +65,19 @@ def arrow_to_pandas(
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
 
 
-def pandas_to_arrow(df: Any, nthreads=1) -> Tuple[ArrowTableType, DataFrameTableMeta]:
-    table_meta = build_dataframe_table_meta(df)
+def pandas_to_arrow(
+    df: Any, nthreads=1, ignore_index=False
+) -> Tuple[ArrowTableType, DataFrameTableMeta]:
+    table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
         df.columns = pd.Index(table_meta.table_column_names)
-        df = df.rename_axis(table_meta.table_index_column_names).reset_index()
+        if not ignore_index:
+            df = df.rename_axis(table_meta.table_index_column_names).reset_index()
+    elif ignore_index:
+        df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
maxframe/odpsio/schema.py CHANGED
@@ -175,7 +175,9 @@ def _scalar_as_index(df_obj: Any) -> pd.Index:
 
 
 def pandas_to_odps_schema(
-    df_obj: Any, unknown_as_string: bool = False
+    df_obj: Any,
+    unknown_as_string: bool = False,
+    ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
     from .. import dataframe as md
     from .arrow import pandas_to_arrow
@@ -209,7 +211,7 @@ def pandas_to_odps_schema(
     else:
         empty_df_obj = df_obj
 
-    arrow_data, table_meta = pandas_to_arrow(empty_df_obj)
+    arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -268,7 +270,9 @@ def build_table_column_name(
     return col_name
 
 
-def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
+def build_dataframe_table_meta(
+    df_obj: Any, ignore_index: bool = False
+) -> DataFrameTableMeta:
     from .. import dataframe as md
 
     col_to_count = defaultdict(lambda: 0)
@@ -285,6 +289,8 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
 
+    assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
+
     if obj_type == OutputType.scalar:
         pd_dtypes = pd.Series([])
         column_index_names = []
@@ -340,12 +346,19 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:
         index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
 
+    if ignore_index:
+        table_index_column_names = []
+        pd_index_dtypes = pd.Series([], index=[])
+    else:
+        table_index_column_names = [f"_idx_{i}" for i in range(len(index_obj.names))]
+        pd_index_dtypes = index_dtypes
+
     return DataFrameTableMeta(
         table_name=table_name,
         type=obj_type,
         table_column_names=final_sql_columns,
-        table_index_column_names=[f"_idx_{i}" for i in range(len(index_obj.names))],
+        table_index_column_names=table_index_column_names,
         pd_column_dtypes=pd_dtypes,
         pd_column_level_names=column_index_names,
-        pd_index_dtypes=index_dtypes,
+        pd_index_dtypes=pd_index_dtypes,
     )
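Taken together, the new ignore_index flag flows from pandas_to_odps_schema through pandas_to_arrow into build_dataframe_table_meta, so the generated table schema and metadata can omit the _idx_* index columns entirely. A hedged sketch of the observable effect, assuming the signatures shown in the hunks above:

import pandas as pd
from maxframe.odpsio.schema import pandas_to_odps_schema

df = pd.DataFrame({"a": [1], "b": ["x"]})

_, meta = pandas_to_odps_schema(df, unknown_as_string=True)
_, meta_no_idx = pandas_to_odps_schema(df, unknown_as_string=True, ignore_index=True)

print(meta.table_index_column_names)         # ['_idx_0']
print(meta_no_idx.table_index_column_names)  # []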