maxframe-1.0.0rc3-cp39-cp39-win32.whl → maxframe-1.1.0-cp39-cp39-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (112)
  1. maxframe/_utils.cp39-win32.pyd +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +16 -1
  4. maxframe/conftest.py +52 -14
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cp39-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/docstring.py +26 -2
  9. maxframe/dataframe/arithmetic/equal.py +4 -2
  10. maxframe/dataframe/arithmetic/greater.py +4 -2
  11. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  12. maxframe/dataframe/arithmetic/less.py +2 -2
  13. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  14. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  16. maxframe/dataframe/core.py +26 -2
  17. maxframe/dataframe/datasource/read_odps_query.py +116 -28
  18. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  19. maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
  20. maxframe/dataframe/datastore/to_odps.py +7 -0
  21. maxframe/dataframe/extensions/__init__.py +8 -0
  22. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  23. maxframe/dataframe/extensions/flatjson.py +131 -0
  24. maxframe/dataframe/extensions/flatmap.py +314 -0
  25. maxframe/dataframe/extensions/reshuffle.py +1 -1
  26. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  27. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  28. maxframe/dataframe/groupby/__init__.py +1 -0
  29. maxframe/dataframe/groupby/aggregation.py +1 -0
  30. maxframe/dataframe/groupby/apply.py +9 -1
  31. maxframe/dataframe/groupby/core.py +1 -1
  32. maxframe/dataframe/groupby/fill.py +4 -1
  33. maxframe/dataframe/groupby/getitem.py +6 -0
  34. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  35. maxframe/dataframe/groupby/transform.py +8 -2
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/loc.py +6 -4
  38. maxframe/dataframe/indexing/rename.py +11 -0
  39. maxframe/dataframe/initializer.py +11 -1
  40. maxframe/dataframe/merge/__init__.py +9 -1
  41. maxframe/dataframe/merge/concat.py +41 -31
  42. maxframe/dataframe/merge/merge.py +1 -1
  43. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  44. maxframe/dataframe/misc/apply.py +3 -0
  45. maxframe/dataframe/misc/drop_duplicates.py +23 -2
  46. maxframe/dataframe/misc/map.py +3 -1
  47. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  48. maxframe/dataframe/misc/transform.py +22 -13
  49. maxframe/dataframe/reduction/__init__.py +3 -0
  50. maxframe/dataframe/reduction/aggregation.py +1 -0
  51. maxframe/dataframe/reduction/median.py +56 -0
  52. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  53. maxframe/dataframe/statistics/quantile.py +8 -2
  54. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  55. maxframe/dataframe/tests/test_initializer.py +33 -2
  56. maxframe/dataframe/tests/test_utils.py +60 -0
  57. maxframe/dataframe/utils.py +110 -7
  58. maxframe/dataframe/window/expanding.py +5 -3
  59. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  60. maxframe/io/objects/tests/test_object_io.py +39 -12
  61. maxframe/io/odpsio/arrow.py +30 -2
  62. maxframe/io/odpsio/schema.py +28 -8
  63. maxframe/io/odpsio/tableio.py +55 -133
  64. maxframe/io/odpsio/tests/test_schema.py +40 -4
  65. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  66. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  67. maxframe/io/odpsio/volumeio.py +36 -6
  68. maxframe/learn/contrib/__init__.py +3 -1
  69. maxframe/learn/contrib/graph/__init__.py +15 -0
  70. maxframe/learn/contrib/graph/connected_components.py +215 -0
  71. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  73. maxframe/learn/contrib/llm/__init__.py +16 -0
  74. maxframe/learn/contrib/llm/core.py +54 -0
  75. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  76. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  77. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  78. maxframe/learn/contrib/llm/text.py +42 -0
  79. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  80. maxframe/learn/contrib/xgboost/predict.py +8 -39
  81. maxframe/learn/contrib/xgboost/train.py +4 -3
  82. maxframe/lib/mmh3.cp39-win32.pyd +0 -0
  83. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  84. maxframe/opcodes.py +10 -1
  85. maxframe/protocol.py +6 -1
  86. maxframe/serialization/core.cp39-win32.pyd +0 -0
  87. maxframe/serialization/core.pyx +13 -1
  88. maxframe/serialization/pandas.py +50 -20
  89. maxframe/serialization/serializables/core.py +24 -5
  90. maxframe/serialization/serializables/field_type.py +4 -1
  91. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  92. maxframe/serialization/tests/test_serial.py +2 -1
  93. maxframe/session.py +9 -2
  94. maxframe/tensor/__init__.py +19 -7
  95. maxframe/tensor/indexing/getitem.py +2 -0
  96. maxframe/tensor/merge/concatenate.py +23 -20
  97. maxframe/tensor/merge/vstack.py +5 -1
  98. maxframe/tensor/misc/transpose.py +1 -1
  99. maxframe/tests/utils.py +16 -0
  100. maxframe/udf.py +27 -0
  101. maxframe/utils.py +64 -14
  102. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  103. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
  104. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
  105. maxframe_client/clients/framedriver.py +4 -1
  106. maxframe_client/fetcher.py +28 -10
  107. maxframe_client/session/consts.py +3 -0
  108. maxframe_client/session/odps.py +104 -20
  109. maxframe_client/session/task.py +42 -26
  110. maxframe_client/session/tests/test_task.py +0 -4
  111. maxframe_client/tests/test_session.py +44 -12
  112. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0

maxframe/dataframe/window/expanding.py
@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
 from .core import Window

 _window_has_method = pd_release_version >= (1, 3, 0)
+_window_has_center = pd_release_version < (2, 0, 0)


 class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
     def params(self):
         p = OrderedDict()

+        args = ["min_periods", "center", "axis", "method"]
         if not _window_has_method:  # pragma: no cover
-            args = ["min_periods", "center", "axis"]
-        else:
-            args = ["min_periods", "center", "axis", "method"]
+            args = [a for a in args if a != "method"]
+        if not _window_has_center:
+            args = [a for a in args if a != "center"]

         for k in args:
             p[k] = getattr(self, k)
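
The new params() logic keys off pd_release_version: Expanding.method was added in pandas 1.3, and the center argument of expanding() was removed in pandas 2.0, so each is dropped from the operator's parameters when unavailable. A minimal sketch of how such a version tuple can be derived (hypothetical reconstruction; the real helper ships inside maxframe and may differ):

    import pandas as pd

    # hypothetical stand-in for maxframe's own pd_release_version
    pd_release_version = tuple(
        int(part) for part in pd.__version__.split(".")[:3] if part.isdigit()
    )

    _window_has_method = pd_release_version >= (1, 3, 0)  # Expanding.method since pandas 1.3
    _window_has_center = pd_release_version < (2, 0, 0)   # expanding(center=...) removed in 2.0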

maxframe/dataframe/window/tests/test_expanding.py
@@ -29,8 +29,8 @@ def test_expanding():
     with pytest.raises(NotImplementedError):
         _ = df2.expanding(3, axis=1)

-    r = df2.expanding(3, center=False)
-    expected = df.expanding(3, center=False)
+    r = df2.expanding(3)
+    expected = df.expanding(3)
     assert repr(r) == repr(expected)

     assert "b" in dir(r)

maxframe/io/objects/tests/test_object_io.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pytest
 from odps import ODPS
@@ -48,15 +49,33 @@ def create_volume(request, oss_config):
         oss_bucket_name,
         oss_endpoint,
     ) = oss_config.oss_config
-    test_location = "oss://%s:%s@%s/%s/%s" % (
-        oss_access_id,
-        oss_secret_access_key,
-        oss_endpoint,
-        oss_bucket_name,
-        oss_test_dir_name,
-    )
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn
+
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(test_vol_name, location=test_location)
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )

     try:
         yield test_vol_name
@@ -75,8 +94,12 @@ def test_simple_object_io(create_volume):

     odps_entry = ODPS.from_environments()

-    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
-    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )

     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
@@ -89,8 +112,12 @@ def test_tensor_object_io(create_volume):

     odps_entry = ODPS.from_environments()

-    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
-    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )

     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
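
The create_volume fixture now switches between a credential-embedded OSS location for offline runs and an intranet ("-internal") endpoint plus a RAM role ARN for online runs. A standalone sketch of the endpoint rewrite, using a hypothetical helper name (the real logic lives inline in the fixture):

    def to_internal_oss_endpoint(endpoint: str) -> str:
        # add the "-internal" suffix to the first host label so traffic stays on
        # the intranet, e.g. oss-cn-hangzhou.aliyuncs.com -> oss-cn-hangzhou-internal.aliyuncs.com
        host, _, rest = endpoint.partition(".")
        if "-internal" not in host:
            host += "-internal"
        return f"{host}.{rest}" if rest else host

    assert (
        to_internal_oss_endpoint("oss-cn-hangzhou.aliyuncs.com")
        == "oss-cn-hangzhou-internal.aliyuncs.com"
    )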

maxframe/io/odpsio/arrow.py
@@ -69,13 +69,24 @@ def arrow_to_pandas(


 def pandas_to_arrow(
-    df: Any, nthreads=1, ignore_index=False
+    df: Any, nthreads=1, ignore_index=False, ms_cols=None
 ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
     table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
+    table_datetime_cols = None
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
+            if ms_cols:
+                table_datetime_cols = {"_data"}
+        elif ms_cols:
+            ms_col_set = set(ms_cols)
+            table_datetime_cols = set()
+            for pd_col, table_col in zip(
+                table_meta.pd_column_dtypes.keys(), table_meta.table_column_names
+            ):
+                if pd_col in ms_col_set:
+                    table_datetime_cols.add(table_col)
         df.columns = pd.Index(table_meta.table_column_names)
         if not ignore_index:
             df = df.rename_axis(table_meta.table_index_column_names).reset_index()
@@ -83,6 +94,12 @@ pandas_to_arrow(
         df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
+        table_datetime_cols = set()
+        if ms_cols:
+            if isinstance(df, pd.MultiIndex):
+                table_datetime_cols = {f"_idx_{idx}" for idx in ms_cols}
+            else:
+                table_datetime_cols = {"_idx_0"}
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
     elif table_meta.type == OutputType.scalar:
         names = ["_idx_0"]
@@ -92,4 +109,15 @@
         df = pd.DataFrame([[df]], columns=names)
     else:  # this could never happen # pragma: no cover
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
-    return pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False), table_meta
+    pa_table = pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False)
+    if table_datetime_cols:
+        col_names = pa_table.schema.names
+        col_datas = []
+        for idx, col_name in enumerate(pa_table.schema.names):
+            if col_name not in table_datetime_cols:
+                col_datas.append(pa_table.column(idx))
+                continue
+            col_data = pa_table.column(idx).cast(pa.timestamp("ms"))
+            col_datas.append(col_data)
+        pa_table = pa.Table.from_arrays(col_datas, names=col_names)
+    return pa_table, table_meta
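
pandas_to_arrow now accepts ms_cols and rebuilds the Arrow table with those columns cast to millisecond timestamps, so they later map to MaxCompute DATETIME rather than TIMESTAMP. A minimal standalone pyarrow sketch of the same cast-and-rebuild step:

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"ts": pd.to_datetime(["2024-10-01 11:23:12", "2024-10-02 22:55:13"])})
    table = pa.Table.from_pandas(df, preserve_index=False)

    # cast the designated column to timestamp("ms") and rebuild the table column by column
    cols = [
        table.column(i).cast(pa.timestamp("ms")) if name == "ts" else table.column(i)
        for i, name in enumerate(table.schema.names)
    ]
    table = pa.Table.from_arrays(cols, names=table.schema.names)
    assert table.schema.field("ts").type == pa.timestamp("ms")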

maxframe/io/odpsio/schema.py
@@ -16,6 +16,7 @@ import string
 from collections import defaultdict
 from typing import Any, Dict, Tuple

+import numpy as np
 import pandas as pd
 import pyarrow as pa
 from odps import types as odps_types
@@ -39,6 +40,7 @@ _arrow_to_odps_types = {
     pa.float64(): odps_types.double,
     pa.date32(): odps_types.date,
     pa.timestamp("ms"): odps_types.datetime,
+    pa.timestamp("us"): odps_types.timestamp,
     pa.timestamp("ns"): odps_types.timestamp,
 }

@@ -54,7 +56,9 @@ _odps_type_to_arrow = {
     odps_types.double: pa.float64(),
     odps_types.date: pa.date32(),
     odps_types.datetime: pa.timestamp("ms"),
+    odps_types.json: pa.string(),
     odps_types.timestamp: pa.timestamp("ns"),
+    odps_types.timestamp_ntz: pa.timestamp("ns"),
 }


@@ -166,7 +170,7 @@ odps_schema_to_pandas_dtypes(
     return arrow_schema.empty_table().to_pandas().dtypes


-def _is_scalar_object(df_obj: Any) -> bool:
+def is_scalar_object(df_obj: Any) -> bool:
     return (
         isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
     ) or pd_types.is_scalar(df_obj)
@@ -187,7 +191,7 @@ pandas_to_odps_schema(
     from ... import dataframe as md
     from .arrow import pandas_to_arrow

-    if _is_scalar_object(df_obj):
+    if is_scalar_object(df_obj):
         empty_index = None
     elif hasattr(df_obj, "index_value"):
         empty_index = df_obj.index_value.to_pandas()[:0]
@@ -203,20 +207,35 @@
     else:
         empty_columns = None

+    ms_cols = None
     if isinstance(df_obj, (md.DataFrame, pd.DataFrame)):
         empty_df_obj = pd.DataFrame(
             [], columns=empty_columns, index=empty_index
         ).astype(df_obj.dtypes)
+        ms_cols = [
+            col for col, dt in df_obj.dtypes.items() if dt == np.dtype("datetime64[ms]")
+        ]
     elif isinstance(df_obj, (md.Series, pd.Series)):
         empty_df_obj = pd.Series([], name=df_obj.name, index=empty_index).astype(
             df_obj.dtype
         )
+        ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     elif isinstance(df_obj, (md.Index, pd.Index)):
         empty_df_obj = empty_index
+        if isinstance(empty_index, pd.MultiIndex):
+            ms_cols = [
+                idx
+                for idx, dt in enumerate(empty_index.dtypes.values)
+                if dt == np.dtype("datetime64[ms]")
+            ]
+        else:
+            ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     else:
         empty_df_obj = df_obj

-    arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
+    arrow_data, table_meta = pandas_to_arrow(
+        empty_df_obj, ignore_index=ignore_index, ms_cols=ms_cols
+    )
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -289,7 +308,7 @@ build_dataframe_table_meta(
         obj_type = OutputType.series
     elif isinstance(df_obj, (md.Index, pd.Index)):
         obj_type = OutputType.index
-    elif _is_scalar_object(df_obj):
+    elif is_scalar_object(df_obj):
         obj_type = OutputType.scalar
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
@@ -344,10 +363,11 @@ build_dataframe_table_meta(
     else:
         pd_index_val = index_obj

-    if hasattr(pd_index_val, "dtypes"):
-        index_dtypes = pd.Series(pd_index_val.dtypes.values, index=pd_index_val.names)
-    else:
-        index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
+    level_dtypes = [
+        pd_index_val.get_level_values(level).dtype
+        for level in range(pd_index_val.nlevels)
+    ]
+    index_dtypes = pd.Series(level_dtypes, index=pd_index_val.names)

     if ignore_index and obj_type != OutputType.index:
         table_index_column_names = []
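
pandas_to_odps_schema now records which input columns carry the datetime64[ms] dtype and forwards them as ms_cols, so millisecond datetimes survive the empty-frame round trip as DATETIME columns. A minimal sketch of the dtype check, assuming pandas 2.0 or later where non-nanosecond datetime dtypes are supported (the test_table_meta_with_datetime test later in this diff is gated the same way):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"a": [1, 2], "b": pd.to_datetime(["2024-10-01", "2024-10-02"])}
    ).astype({"b": "datetime64[ms]"})

    # columns whose dtype is millisecond-precision datetime, as in the ms_cols logic above
    ms_cols = [col for col, dt in df.dtypes.items() if dt == np.dtype("datetime64[ms]")]
    assert ms_cols == ["b"]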

maxframe/io/odpsio/tableio.py
@@ -20,15 +20,14 @@ from typing import Dict, List, Optional, Union

 import pyarrow as pa
 from odps import ODPS
-from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.config import option_context as pyodps_option_context
 from odps.tunnel import TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
+from odps.utils import call_with_retry

 try:
     import pyarrow.compute as pac
@@ -37,26 +36,18 @@ except ImportError:

 from ...config import options
 from ...env import ODPS_STORAGE_API_ENDPOINT
-from ...lib.version import Version
+from ...utils import sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema

 PartitionsType = Union[List[str], str, None]

 _DEFAULT_ROW_BATCH_SIZE = 4096
-_need_convert_timezone = Version(pyodps_version) < Version("0.11.7")
-
-
-@contextmanager
-def _sync_pyodps_timezone():
-    with pyodps_option_context() as cfg:
-        cfg.local_timezone = options.local_timezone
-        yield


 class ODPSTableIO(ABC):
     def __new__(cls, odps: ODPS):
         if cls is ODPSTableIO:
-            if options.use_common_table:
+            if options.use_common_table or ODPS_STORAGE_API_ENDPOINT in os.environ:
                 return HaloTableIO(odps)
             else:
                 return TunnelTableIO(odps)
@@ -138,7 +129,12 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id = -1
         self._reader_start_pos = 0

-        if partitions is None or isinstance(partitions, str):
+        if partitions is None:
+            if not self._table.table_schema.partitions:
+                self._partitions = [None]
+            else:
+                self._partitions = [str(pt) for pt in self._table.partitions]
+        elif isinstance(partitions, str):
             self._partitions = [partitions]
         else:
             self._partitions = partitions
@@ -166,12 +162,14 @@
            self._cur_partition_id += 1

            part_str = self._partitions[self._cur_partition_id]
-            with _sync_pyodps_timezone():
+            req_columns = self._schema.names
+            with sync_pyodps_options():
                self._cur_reader = self._table.open_reader(
                    part_str,
-                    columns=self._columns,
+                    columns=req_columns,
                    arrow=True,
                    download_id=self._partition_to_download_ids.get(part_str),
+                    append_partitions=True,
                )
                if self._cur_reader.count + self._reader_start_pos > self._start:
                    start = self._start - self._reader_start_pos
@@ -180,43 +178,15 @@
                    else:
                        count = min(self._count, self._cur_reader.count - start)

-                    with _sync_pyodps_timezone():
+                    with sync_pyodps_options():
                        self._reader_iter = self._cur_reader.read(start, count)
                    break
            self._reader_start_pos += self._cur_reader.count
        else:
            self._cur_reader = None

-    def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
-        pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
-
-        names = list(batch.schema.names)
-        arrays = []
-        for idx in range(batch.num_columns):
-            col = batch.column(idx)
-            if _need_convert_timezone and isinstance(col.type, pa.TimestampType):
-                if col.type.tz is not None:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, col.type.tz
-                    )
-                    arrays.append(col.cast(target_type))
-                else:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, options.local_timezone
-                    )
-                    pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
-                    arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
-            else:
-                arrays.append(batch.column(idx))
-
-        for part_col in self._partition_cols or []:
-            names.append(part_col)
-            col_type = self._schema.field_by_name(part_col).type
-            arrays.append(pa.array([pt_spec[part_col]] * batch.num_rows).cast(col_type))
-        return pa.RecordBatch.from_arrays(arrays, names)
-
     def read(self):
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
            if self._cur_reader is None:
                self._open_next_reader()
                if self._cur_reader is None:
@@ -227,7 +197,7 @@
                if batch is not None:
                    if self._row_left is not None:
                        self._row_left -= batch.num_rows
-                    return self._fill_batch_partition(batch)
+                    return batch
            except StopIteration:
                self._open_next_reader()
            return None
@@ -244,34 +214,6 @@
        return pa.Table.from_batches(batches)


-class TunnelWrappedWriter:
-    def __init__(self, nested_writer):
-        self._writer = nested_writer
-
-    def write(self, data: Union[pa.RecordBatch, pa.Table]):
-        if not any(isinstance(tp, pa.TimestampType) for tp in data.schema.types):
-            self._writer.write(data)
-            return
-        pa_type = type(data)
-        arrays = []
-        for idx in range(data.num_columns):
-            name = data.schema.names[idx]
-            col = data.column(idx)
-            if not isinstance(col.type, pa.TimestampType):
-                arrays.append(col)
-                continue
-            if self._writer.schema[name].type == timestamp_ntz:
-                col = HaloTableArrowWriter._localize_timezone(col, "UTC")
-            else:
-                col = HaloTableArrowWriter._localize_timezone(col)
-            arrays.append(col)
-        data = pa_type.from_arrays(arrays, names=data.schema.names)
-        self._writer.write(data)
-
-    def __getattr__(self, item):
-        return getattr(self._writer, item)
-
-
 class TunnelTableIO(ODPSTableIO):
     @contextmanager
     def open_reader(
@@ -285,7 +227,9 @@ class TunnelTableIO(ODPSTableIO):
         reverse_range: bool = False,
         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
     ):
-        table = self._odps.get_table(full_table_name)
+        with sync_pyodps_options():
+            table = self._odps.get_table(full_table_name)
+
         if partition_columns is True:
             partition_columns = [c.name for c in table.table_schema.partitions]
@@ -296,21 +240,22 @@
            or (stop is not None and stop < 0)
            or (reverse_range and start is None)
        ):
-            table = self._odps.get_table(full_table_name)
-            tunnel = TableTunnel(self._odps)
-            parts = (
-                [partitions]
-                if partitions is None or isinstance(partitions, str)
-                else partitions
-            )
-            part_to_down_id = dict()
-            total_records = 0
-            for part in parts:
-                down_session = tunnel.create_download_session(
-                    table, async_mode=True, partition_spec=part
+            with sync_pyodps_options():
+                table = self._odps.get_table(full_table_name)
+                tunnel = TableTunnel(self._odps)
+                parts = (
+                    [partitions]
+                    if partitions is None or isinstance(partitions, str)
+                    else partitions
                )
-                part_to_down_id[part] = down_session.id
-                total_records += down_session.count
+                part_to_down_id = dict()
+                total_records = 0
+                for part in parts:
+                    down_session = tunnel.create_download_session(
+                        table, async_mode=True, partition_spec=part
+                    )
+                    part_to_down_id[part] = down_session.id
+                    total_records += down_session.count

        count = None
        if start is not None or stop is not None:
@@ -347,20 +292,14 @@
         overwrite: bool = True,
     ):
         table = self._odps.get_table(full_table_name)
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             with table.open_writer(
                 partition=partition,
                 arrow=True,
                 create_partition=partition is not None,
                 overwrite=overwrite,
             ) as writer:
-                # fixme should yield writer directly once pyodps fixes
-                # related arrow timestamp bug when provided schema and
-                # table schema is identical.
-                if _need_convert_timezone:
-                    yield TunnelWrappedWriter(writer)
-                else:
-                    yield writer
+                yield writer


 class HaloTableArrowReader:
@@ -416,7 +355,7 @@
             split_index=self._cur_split_id + 1,
             **read_rows_kw,
         )
-        self._cur_reader = self._client.read_rows_arrow(req)
+        self._cur_reader = call_with_retry(self._client.read_rows_arrow, req)
         self._cur_split_id += 1

     def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
@@ -488,8 +427,9 @@ class HaloTableArrowWriter:
     def open(self):
         from odps.apis.storage_api import WriteRowsRequest

-        self._writer = self._client.write_rows_arrow(
-            WriteRowsRequest(self._write_info.session_id)
+        self._writer = call_with_retry(
+            self._client.write_rows_arrow,
+            WriteRowsRequest(self._write_info.session_id),
         )

     @classmethod
@@ -560,28 +500,6 @@ class HaloTableIO(ODPSTableIO):
             for pt in partitions
         ]

-    def get_table_record_count(
-        self, full_table_name: str, partitions: PartitionsType = None
-    ):
-        from odps.apis.storage_api import SplitOptions, TableBatchScanRequest
-
-        table = self._odps.get_table(full_table_name)
-        client = StorageApiArrowClient(
-            self._odps, table, rest_endpoint=self._storage_api_endpoint
-        )
-
-        split_option = SplitOptions.SplitMode.SIZE
-
-        scan_kw = {
-            "required_partitions": self._convert_partitions(partitions),
-            "split_options": SplitOptions.get_default_options(split_option),
-        }
-
-        # todo add more options for partition column handling
-        req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session(req)
-        return resp.record_count
-
     @contextmanager
     def open_reader(
         self,
@@ -596,8 +514,8 @@
     ):
         from odps.apis.storage_api import (
             SessionRequest,
+            SessionStatus,
             SplitOptions,
-            Status,
             TableBatchScanRequest,
         )

@@ -625,16 +543,16 @@

         # todo add more options for partition column handling
         req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session(req)
+        resp = call_with_retry(client.create_read_session, req)

         session_id = resp.session_id
-        status = resp.status
-        while status == Status.WAIT:
-            resp = client.get_read_session(SessionRequest(session_id))
-            status = resp.status
+        status = resp.session_status
+        while status == SessionStatus.INIT:
+            resp = call_with_retry(client.get_read_session, SessionRequest(session_id))
+            status = resp.session_status
             time.sleep(1.0)

-        assert status == Status.OK
+        assert status == SessionStatus.NORMAL

         count = None
         if start is not None or stop is not None:
@@ -685,7 +603,7 @@
         part_strs = self._convert_partitions(partition)
         part_str = part_strs[0] if part_strs else None
         req = TableBatchWriteRequest(partition_spec=part_str, overwrite=overwrite)
-        resp = client.create_write_session(req)
+        resp = call_with_retry(client.create_write_session, req)

         session_id = resp.session_id
         writer = HaloTableArrowWriter(client, resp, table.table_schema)
@@ -694,9 +612,13 @@
         yield writer

         commit_msg = writer.close()
-        resp = client.commit_write_session(
-            SessionRequest(session_id=session_id), [commit_msg]
+        resp = call_with_retry(
+            client.commit_write_session,
+            SessionRequest(session_id=session_id),
+            [commit_msg],
         )
         while resp.session_status == SessionStatus.COMMITTING:
-            resp = client.get_write_session(SessionRequest(session_id=session_id))
+            resp = call_with_retry(
+                client.get_write_session, SessionRequest(session_id=session_id)
+            )
         assert resp.session_status == SessionStatus.COMMITTED
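
Storage API and tunnel calls above are now wrapped in call_with_retry from odps.utils, and the old timezone-only context manager was replaced by the broader sync_pyodps_options helper. Judging only from the call sites in this diff, call_with_retry takes the callable followed by its positional arguments; a hedged usage sketch (the retry policy is whatever pyodps applies internally, not shown here):

    from odps.utils import call_with_retry

    def flaky_request(x):
        # stand-in for a Storage API call that may fail transiently
        return x * 2

    # equivalent to flaky_request(21), but retried on transient failures
    result = call_with_retry(flaky_request, 21)
    assert result == 42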

maxframe/io/odpsio/tests/test_schema.py
@@ -21,6 +21,7 @@ from odps import types as odps_types
 from .... import dataframe as md
 from .... import tensor as mt
 from ....core import OutputType
+from ....utils import pd_release_version
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -270,10 +271,6 @@ def test_odps_arrow_schema_conversion():

     with pytest.raises(TypeError):
         arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
-    with pytest.raises(TypeError):
-        odps_schema_to_arrow_schema(
-            odps_types.OdpsSchema([odps_types.Column("col1", "json")])
-        )


 def test_build_column_name():
@@ -296,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime