maxframe 0.1.0b5__cp310-cp310-macosx_10_9_universal2.whl → 1.0.0rc2__cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (92)
  1. maxframe/_utils.cpython-310-darwin.so +0 -0
  2. maxframe/codegen.py +6 -2
  3. maxframe/config/config.py +38 -2
  4. maxframe/config/validators.py +1 -0
  5. maxframe/conftest.py +2 -0
  6. maxframe/core/__init__.py +0 -3
  7. maxframe/core/entity/__init__.py +1 -8
  8. maxframe/core/entity/objects.py +3 -45
  9. maxframe/core/graph/core.cpython-310-darwin.so +0 -0
  10. maxframe/core/graph/core.pyx +4 -4
  11. maxframe/dataframe/__init__.py +1 -1
  12. maxframe/dataframe/arithmetic/around.py +5 -17
  13. maxframe/dataframe/arithmetic/core.py +15 -7
  14. maxframe/dataframe/arithmetic/docstring.py +5 -55
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  16. maxframe/dataframe/core.py +5 -5
  17. maxframe/dataframe/datasource/date_range.py +2 -2
  18. maxframe/dataframe/datasource/read_odps_query.py +6 -0
  19. maxframe/dataframe/datasource/read_odps_table.py +2 -1
  20. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  21. maxframe/dataframe/datastore/tests/__init__.py +13 -0
  22. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  23. maxframe/dataframe/datastore/to_odps.py +21 -0
  24. maxframe/dataframe/groupby/cum.py +0 -1
  25. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  26. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  27. maxframe/dataframe/indexing/align.py +1 -1
  28. maxframe/dataframe/indexing/rename.py +3 -37
  29. maxframe/dataframe/indexing/sample.py +0 -1
  30. maxframe/dataframe/indexing/set_index.py +68 -1
  31. maxframe/dataframe/merge/merge.py +236 -2
  32. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  33. maxframe/dataframe/misc/apply.py +5 -10
  34. maxframe/dataframe/misc/case_when.py +1 -1
  35. maxframe/dataframe/misc/describe.py +2 -2
  36. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  37. maxframe/dataframe/misc/eval.py +4 -0
  38. maxframe/dataframe/misc/memory_usage.py +2 -2
  39. maxframe/dataframe/misc/pct_change.py +1 -83
  40. maxframe/dataframe/misc/tests/test_misc.py +23 -0
  41. maxframe/dataframe/misc/transform.py +1 -30
  42. maxframe/dataframe/misc/value_counts.py +4 -17
  43. maxframe/dataframe/missing/dropna.py +1 -1
  44. maxframe/dataframe/missing/fillna.py +5 -5
  45. maxframe/dataframe/sort/sort_values.py +1 -11
  46. maxframe/dataframe/statistics/corr.py +3 -3
  47. maxframe/dataframe/statistics/quantile.py +5 -17
  48. maxframe/dataframe/utils.py +4 -7
  49. maxframe/errors.py +13 -0
  50. maxframe/extension.py +12 -0
  51. maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
  52. maxframe/learn/contrib/xgboost/predict.py +2 -2
  53. maxframe/learn/contrib/xgboost/train.py +2 -2
  54. maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
  55. maxframe/lib/mmh3.pyi +43 -0
  56. maxframe/lib/wrapped_pickle.py +2 -1
  57. maxframe/odpsio/__init__.py +1 -1
  58. maxframe/odpsio/arrow.py +8 -4
  59. maxframe/odpsio/schema.py +10 -7
  60. maxframe/odpsio/tableio.py +388 -14
  61. maxframe/odpsio/tests/test_schema.py +16 -15
  62. maxframe/odpsio/tests/test_tableio.py +48 -21
  63. maxframe/protocol.py +148 -12
  64. maxframe/serialization/core.cpython-310-darwin.so +0 -0
  65. maxframe/serialization/core.pxd +3 -0
  66. maxframe/serialization/core.pyi +3 -0
  67. maxframe/serialization/core.pyx +54 -25
  68. maxframe/serialization/exception.py +1 -1
  69. maxframe/serialization/pandas.py +7 -2
  70. maxframe/serialization/serializables/core.py +158 -12
  71. maxframe/serialization/serializables/tests/test_serializable.py +46 -4
  72. maxframe/tensor/__init__.py +59 -0
  73. maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
  74. maxframe/tensor/base/atleast_1d.py +1 -1
  75. maxframe/tensor/base/unique.py +3 -3
  76. maxframe/tensor/reduction/count_nonzero.py +1 -1
  77. maxframe/tensor/statistics/quantile.py +2 -2
  78. maxframe/tests/test_protocol.py +34 -0
  79. maxframe/tests/test_utils.py +0 -12
  80. maxframe/tests/utils.py +11 -2
  81. maxframe/utils.py +24 -13
  82. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
  83. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
  84. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
  85. maxframe_client/__init__.py +0 -1
  86. maxframe_client/fetcher.py +38 -27
  87. maxframe_client/session/odps.py +50 -10
  88. maxframe_client/session/task.py +41 -20
  89. maxframe_client/tests/test_fetcher.py +21 -3
  90. maxframe_client/tests/test_session.py +49 -2
  91. maxframe_client/clients/spe.py +0 -104
  92. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/odpsio/tableio.py

@@ -16,7 +16,7 @@ import os
  import time
  from abc import ABC, abstractmethod
  from contextlib import contextmanager
- from typing import List, Optional, Union
+ from typing import Dict, List, Optional, Union

  import pyarrow as pa
  from odps import ODPS
@@ -25,8 +25,16 @@ from odps.apis.storage_api import (
      TableBatchScanResponse,
      TableBatchWriteResponse,
  )
- from odps.types import PartitionSpec
+ from odps.config import option_context as pyodps_option_context
+ from odps.tunnel import TableTunnel
+ from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz

+ try:
+     import pyarrow.compute as pac
+ except ImportError:
+     pac = None
+
+ from ..config import options
  from ..env import ODPS_STORAGE_API_ENDPOINT
  from .schema import odps_schema_to_arrow_schema

@@ -35,18 +43,55 @@ PartitionsType = Union[List[str], str, None]
  _DEFAULT_ROW_BATCH_SIZE = 4096


- class MCTableIO(ABC):
+ @contextmanager
+ def _sync_pyodps_timezone():
+     with pyodps_option_context() as cfg:
+         cfg.local_timezone = options.local_timezone
+         yield
+
+
+ class ODPSTableIO(ABC):
+     def __new__(cls, odps: ODPS):
+         if cls is ODPSTableIO:
+             if options.use_common_table:
+                 return HaloTableIO(odps)
+             else:
+                 return TunnelTableIO(odps)
+         return super().__new__(cls)
+
      def __init__(self, odps: ODPS):
          self._odps = odps

+     @classmethod
+     def _get_reader_schema(
+         cls,
+         table_schema: OdpsSchema,
+         columns: Optional[List[str]] = None,
+         partition_columns: Union[None, bool, List[str]] = None,
+     ) -> OdpsSchema:
+         final_cols = []
+
+         columns = columns or [col.name for col in table_schema.simple_columns]
+         if partition_columns is True:
+             partition_columns = [c.name for c in table_schema.partitions]
+         else:
+             partition_columns = partition_columns or []
+
+         for col_name in columns + partition_columns:
+             final_cols.append(table_schema[col_name])
+         return OdpsSchema(final_cols)
+
      @abstractmethod
      def open_reader(
          self,
          full_table_name: str,
          partitions: PartitionsType = None,
          columns: Optional[List[str]] = None,
+         partition_columns: Union[None, bool, List[str]] = None,
          start: Optional[int] = None,
-         count: Optional[int] = None,
+         stop: Optional[int] = None,
+         reverse_range: bool = False,
+         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
      ):
          raise NotImplementedError

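Note on the hunk above: `ODPSTableIO.__new__` now acts as a factory that dispatches on `options.use_common_table`, returning either a `HaloTableIO` (MaxCompute Storage API) or a `TunnelTableIO` (table tunnel) instance, both defined later in this file. A minimal sketch of how a caller would see that dispatch, assuming `options.use_common_table` is a plain writable option; the `ODPS(...)` credential placeholders are illustrative and not taken from the diff:

    from odps import ODPS

    from maxframe.config import options
    from maxframe.odpsio.tableio import HaloTableIO, ODPSTableIO, TunnelTableIO

    # Placeholder credentials; a real session needs valid account values.
    o = ODPS("<access_id>", "<secret_key>", "<project>", endpoint="<endpoint>")

    options.use_common_table = False
    assert isinstance(ODPSTableIO(o), TunnelTableIO)   # tunnel-based reader/writer

    options.use_common_table = True
    assert isinstance(ODPSTableIO(o), HaloTableIO)     # Storage API reader/writer
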
@@ -60,12 +105,264 @@ class MCTableIO(ABC):
          raise NotImplementedError


+ class TunnelMultiPartitionReader:
+     def __init__(
+         self,
+         odps_entry: ODPS,
+         table_name: str,
+         partitions: PartitionsType,
+         columns: Optional[List[str]] = None,
+         partition_columns: Optional[List[str]] = None,
+         start: Optional[int] = None,
+         count: Optional[int] = None,
+         partition_to_download_ids: Dict[str, str] = None,
+     ):
+         self._odps_entry = odps_entry
+         self._table = odps_entry.get_table(table_name)
+         self._columns = columns
+
+         odps_schema = ODPSTableIO._get_reader_schema(
+             self._table.table_schema, columns, partition_columns
+         )
+         self._schema = odps_schema_to_arrow_schema(odps_schema)
+
+         self._start = start or 0
+         self._count = count
+         self._row_left = count
+
+         self._cur_reader = None
+         self._reader_iter = None
+         self._cur_partition_id = -1
+         self._reader_start_pos = 0
+
+         if partitions is None or isinstance(partitions, str):
+             self._partitions = [partitions]
+         else:
+             self._partitions = partitions
+
+         self._partition_cols = partition_columns
+         self._partition_to_download_ids = partition_to_download_ids or dict()
+
+     @property
+     def count(self) -> Optional[int]:
+         if len(self._partitions) > 1:
+             return None
+         return self._count
+
+     def _open_next_reader(self):
+         if self._cur_reader is not None:
+             self._reader_start_pos += self._cur_reader.count
+
+         if (
+             self._row_left is not None and self._row_left <= 0
+         ) or 1 + self._cur_partition_id >= len(self._partitions):
+             self._cur_reader = None
+             return
+
+         while 1 + self._cur_partition_id < len(self._partitions):
+             self._cur_partition_id += 1
+
+             part_str = self._partitions[self._cur_partition_id]
+             with _sync_pyodps_timezone():
+                 self._cur_reader = self._table.open_reader(
+                     part_str,
+                     columns=self._columns,
+                     arrow=True,
+                     download_id=self._partition_to_download_ids.get(part_str),
+                 )
+             if self._cur_reader.count + self._reader_start_pos > self._start:
+                 start = self._start - self._reader_start_pos
+                 if self._count is None:
+                     count = None
+                 else:
+                     count = min(self._count, self._cur_reader.count - start)
+
+                 with _sync_pyodps_timezone():
+                     self._reader_iter = self._cur_reader.read(start, count)
+                 break
+             self._reader_start_pos += self._cur_reader.count
+         else:
+             self._cur_reader = None
+
+     def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
+         pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
+
+         names = list(batch.schema.names)
+         arrays = []
+         for idx in range(batch.num_columns):
+             col = batch.column(idx)
+             if isinstance(col.type, pa.TimestampType):
+                 if col.type.tz is not None:
+                     target_type = pa.timestamp(
+                         self._schema.types[idx].unit, col.type.tz
+                     )
+                     arrays.append(col.cast(target_type))
+                 else:
+                     target_type = pa.timestamp(
+                         self._schema.types[idx].unit, options.local_timezone
+                     )
+                     pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
+                     arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
+             else:
+                 arrays.append(batch.column(idx))
+
+         for part_col in self._partition_cols or []:
+             names.append(part_col)
+             col_type = self._schema.field_by_name(part_col).type
+             arrays.append(pa.array([pt_spec[part_col]] * batch.num_rows).cast(col_type))
+         return pa.RecordBatch.from_arrays(arrays, names)
+
+     def read(self):
+         with _sync_pyodps_timezone():
+             if self._cur_reader is None:
+                 self._open_next_reader()
+                 if self._cur_reader is None:
+                     return None
+             while self._cur_reader is not None:
+                 try:
+                     batch = next(self._reader_iter)
+                     if batch is not None:
+                         if self._row_left is not None:
+                             self._row_left -= batch.num_rows
+                         return self._fill_batch_partition(batch)
+                 except StopIteration:
+                     self._open_next_reader()
+             return None
+
+     def read_all(self) -> pa.Table:
+         batches = []
+         while True:
+             batch = self.read()
+             if batch is None:
+                 break
+             batches.append(batch)
+         if not batches:
+             return self._schema.empty_table()
+         return pa.Table.from_batches(batches)
+
+
+ class TunnelWrappedWriter:
+     def __init__(self, nested_writer):
+         self._writer = nested_writer
+
+     def write(self, data: Union[pa.RecordBatch, pa.Table]):
+         if not any(isinstance(tp, pa.TimestampType) for tp in data.schema.types):
+             self._writer.write(data)
+             return
+         pa_type = type(data)
+         arrays = []
+         for idx in range(data.num_columns):
+             name = data.schema.names[idx]
+             col = data.column(idx)
+             if not isinstance(col.type, pa.TimestampType):
+                 arrays.append(col)
+                 continue
+             if self._writer.schema[name].type == timestamp_ntz:
+                 col = HaloTableArrowWriter._localize_timezone(col, "UTC")
+             else:
+                 col = HaloTableArrowWriter._localize_timezone(col)
+             arrays.append(col)
+         data = pa_type.from_arrays(arrays, names=data.schema.names)
+         self._writer.write(data)
+
+     def __getattr__(self, item):
+         return getattr(self._writer, item)
+
+
+ class TunnelTableIO(ODPSTableIO):
+     @contextmanager
+     def open_reader(
+         self,
+         full_table_name: str,
+         partitions: PartitionsType = None,
+         columns: Optional[List[str]] = None,
+         partition_columns: Union[None, bool, List[str]] = None,
+         start: Optional[int] = None,
+         stop: Optional[int] = None,
+         reverse_range: bool = False,
+         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
+     ):
+         table = self._odps.get_table(full_table_name)
+         if partition_columns is True:
+             partition_columns = [c.name for c in table.table_schema.partitions]
+
+         total_records = None
+         part_to_down_id = None
+         if (
+             (start is not None and start < 0)
+             or (stop is not None and stop < 0)
+             or (reverse_range and start is None)
+         ):
+             table = self._odps.get_table(full_table_name)
+             tunnel = TableTunnel(self._odps)
+             parts = (
+                 [partitions]
+                 if partitions is None or isinstance(partitions, str)
+                 else partitions
+             )
+             part_to_down_id = dict()
+             total_records = 0
+             for part in parts:
+                 down_session = tunnel.create_download_session(
+                     table, async_mode=True, partition_spec=part
+                 )
+                 part_to_down_id[part] = down_session.id
+                 total_records += down_session.count
+
+         count = None
+         if start is not None or stop is not None:
+             if reverse_range:
+                 start = start if start is not None else total_records - 1
+                 stop = stop if stop is not None else -1
+             else:
+                 start = start if start is not None else 0
+                 stop = stop if stop is not None else None
+             start = start if start >= 0 else total_records + start
+             stop = stop if stop is None or stop >= 0 else total_records + stop
+             if reverse_range:
+                 count = start - stop
+                 start = stop + 1
+             else:
+                 count = stop - start if stop is not None and start is not None else None
+
+         yield TunnelMultiPartitionReader(
+             self._odps,
+             full_table_name,
+             partitions=partitions,
+             columns=columns,
+             partition_columns=partition_columns,
+             start=start,
+             count=count,
+             partition_to_download_ids=part_to_down_id,
+         )
+
+     @contextmanager
+     def open_writer(
+         self,
+         full_table_name: str,
+         partition: Optional[str] = None,
+         overwrite: bool = True,
+     ):
+         table = self._odps.get_table(full_table_name)
+         with _sync_pyodps_timezone():
+             with table.open_writer(
+                 partition=partition,
+                 arrow=True,
+                 create_partition=partition is not None,
+                 overwrite=overwrite,
+             ) as writer:
+                 # fixme should yield writer directly once pyodps fixes
+                 # related arrow timestamp bug when provided schema and
+                 # table schema is identical.
+                 yield TunnelWrappedWriter(writer)
+
+
  class HaloTableArrowReader:
      def __init__(
          self,
          client: StorageApiArrowClient,
          scan_info: TableBatchScanResponse,
-         schema: pa.Schema,
+         odps_schema: OdpsSchema,
          start: Optional[int] = None,
          count: Optional[int] = None,
          row_batch_size: Optional[int] = None,
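The `TunnelTableIO.open_reader` logic added above only creates tunnel download sessions to learn the total record count when a negative `start`/`stop` (or `reverse_range` without `start`) makes that unavoidable, then normalizes the bounds into a `(start, count)` pair for `TunnelMultiPartitionReader`. A simplified restatement of the forward-range arithmetic; the `reverse_range` branch is omitted and `normalize_range` is a hypothetical helper, not a function in the diff:

    from typing import Optional, Tuple


    def normalize_range(
        start: Optional[int], stop: Optional[int], total_records: Optional[int]
    ) -> Tuple[Optional[int], Optional[int]]:
        # Mirrors the non-reverse branch: resolve negative bounds against the
        # table size, then express the range as (start, count).
        if start is None and stop is None:
            return None, None
        start = start if start is not None else 0
        start = start if start >= 0 else total_records + start
        stop = stop if stop is None or stop >= 0 else total_records + stop
        count = stop - start if stop is not None else None
        return start, count


    # On a 100-row table: rows[-10:] reads from row 90 to the end,
    # rows[:-10] reads the first 90 rows.
    assert normalize_range(-10, None, 100) == (90, None)
    assert normalize_range(None, -10, 100) == (0, 90)
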
@@ -76,7 +373,8 @@ class HaloTableArrowReader:
          self._cur_split_id = -1
          self._cur_reader = None

-         self._schema = schema
+         self._odps_schema = odps_schema
+         self._arrow_schema = odps_schema_to_arrow_schema(odps_schema)

          self._start = start
          self._count = count
@@ -115,6 +413,34 @@ class HaloTableArrowReader:
          self._cur_reader = self._client.read_rows_arrow(req)
          self._cur_split_id += 1

+     def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
+         timezone = options.local_timezone
+         if not any(isinstance(tp, pa.TimestampType) for tp in batch.schema.types):
+             return batch
+
+         cols = []
+         for idx in range(batch.num_columns):
+             col = batch.column(idx)
+             name = batch.schema.names[idx]
+             if not isinstance(col.type, pa.TimestampType):
+                 cols.append(col)
+                 continue
+             if self._odps_schema[name].type == timestamp_ntz:
+                 col = col.cast(pa.timestamp(col.type.unit))
+                 cols.append(col)
+                 continue
+
+             if hasattr(pac, "local_timestamp"):
+                 col = col.cast(pa.timestamp(col.type.unit, timezone))
+             else:
+                 pd_col = col.to_pandas().dt.tz_convert(timezone)
+                 col = pa.Array.from_pandas(pd_col).cast(
+                     pa.timestamp(col.type.unit, timezone)
+                 )
+             cols.append(col)
+
+         return pa.RecordBatch.from_arrays(cols, names=batch.schema.names)
+
      def read(self):
          if self._cur_reader is None:
              self._open_next_reader()
@@ -123,7 +449,7 @@ class HaloTableArrowReader:
          while self._cur_reader is not None:
              batch = self._cur_reader.read()
              if batch is not None:
-                 return batch
+                 return self._convert_timezone(batch)
              self._open_next_reader()
          return None

@@ -135,16 +461,21 @@ class HaloTableArrowReader:
                  break
              batches.append(batch)
          if not batches:
-             return self._schema.empty_table()
+             return self._arrow_schema.empty_table()
          return pa.Table.from_batches(batches)


  class HaloTableArrowWriter:
      def __init__(
-         self, client: StorageApiArrowClient, write_info: TableBatchWriteResponse
+         self,
+         client: StorageApiArrowClient,
+         write_info: TableBatchWriteResponse,
+         odps_schema: OdpsSchema,
      ):
          self._client = client
          self._write_info = write_info
+         self._odps_schema = odps_schema
+         self._arrow_schema = odps_schema_to_arrow_schema(odps_schema)

          self._writer = None

@@ -155,12 +486,52 @@ class HaloTableArrowWriter:
              WriteRowsRequest(self._write_info.session_id)
          )

+     @classmethod
+     def _localize_timezone(cls, col, tz=None):
+         from odps.lib import tzlocal
+
+         if tz is None:
+             if options.local_timezone is None:
+                 tz = str(tzlocal.get_localzone())
+             else:
+                 tz = str(options.local_timezone)
+
+         if col.type.tz is not None:
+             return col
+         if hasattr(pac, "assume_timezone"):
+             col = pac.assume_timezone(col, tz)
+             return col
+         else:
+             col = col.to_pandas()
+             return pa.Array.from_pandas(col.dt.tz_localize(tz))
+
+     def _convert_schema(self, batch: pa.RecordBatch):
+         if batch.schema == self._arrow_schema and not any(
+             isinstance(tp, pa.TimestampType) for tp in self._arrow_schema.types
+         ):
+             return batch
+         cols = []
+         for idx in range(batch.num_columns):
+             col = batch.column(idx)
+             name = batch.schema.names[idx]
+
+             if isinstance(col.type, pa.TimestampType):
+                 if self._odps_schema[name].type == timestamp_ntz:
+                     col = self._localize_timezone(col, "UTC")
+                 else:
+                     col = self._localize_timezone(col)
+
+             if col.type != self._arrow_schema.types[idx]:
+                 col = col.cast(self._arrow_schema.types[idx])
+             cols.append(col)
+         return pa.RecordBatch.from_arrays(cols, names=batch.schema.names)
+
      def write(self, batch):
          if isinstance(batch, pa.Table):
              for b in batch.to_batches():
-                 self._writer.write(b)
+                 self._writer.write(self._convert_schema(b))
          else:
-             self._writer.write(batch)
+             self._writer.write(self._convert_schema(batch))

      def close(self):
          commit_msg, is_success = self._writer.finish()
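The writer-side conversion above localizes tz-naive timestamp columns before handing batches to the Storage API client: `timestamp_ntz` columns are pinned to UTC, everything else to `options.local_timezone` (or the machine's local zone), preferring `pyarrow.compute.assume_timezone` and falling back to pandas on older pyarrow builds. A self-contained sketch of that localize-or-fallback step on a standalone array; the `tz` value is an assumption standing in for `options.local_timezone`:

    import pandas as pd
    import pyarrow as pa

    try:
        import pyarrow.compute as pac
    except ImportError:
        pac = None

    col = pa.array([pd.Timestamp("2024-01-01 12:00:00")])  # tz-naive timestamps
    tz = "Asia/Shanghai"  # stand-in for options.local_timezone

    if col.type.tz is None:
        if pac is not None and hasattr(pac, "assume_timezone"):
            col = pac.assume_timezone(col, tz)            # arrow-native fast path
        else:
            pd_col = col.to_pandas().dt.tz_localize(tz)   # pandas fallback
            col = pa.Array.from_pandas(pd_col)

    print(col.type)  # timestamp[..., tz=Asia/Shanghai]
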
@@ -169,7 +540,7 @@ class HaloTableArrowWriter:
          return commit_msg


- class HaloTableIO(MCTableIO):
+ class HaloTableIO(ODPSTableIO):
      _storage_api_endpoint = os.getenv(ODPS_STORAGE_API_ENDPOINT)

      @staticmethod
@@ -275,10 +646,13 @@ class HaloTableIO(MCTableIO):
          else:
              count = stop - start

+         reader_schema = self._get_reader_schema(
+             table.table_schema, columns, partition_columns
+         )
          yield HaloTableArrowReader(
              client,
              resp,
-             schema=odps_schema_to_arrow_schema(table.table_schema),
+             odps_schema=reader_schema,
              start=start,
              count=count,
              row_batch_size=row_batch_size,
@@ -308,7 +682,7 @@ class HaloTableIO(MCTableIO):
          resp = client.create_write_session(req)

          session_id = resp.session_id
-         writer = HaloTableArrowWriter(client, resp)
+         writer = HaloTableArrowWriter(client, resp, table.table_schema)
          writer.open()

          yield writer
maxframe/odpsio/tests/test_schema.py

@@ -143,17 +143,17 @@ def test_pandas_to_odps_schema_index(wrap_obj):
      data = pd.Index(np.random.randint(0, 100, 100))

      test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
-     schema, meta = pandas_to_odps_schema(test_idx, unknown_as_string=True)
-     assert [c.name for c in schema.columns] == ["_idx_0"]
-     assert [c.type.name for c in schema.columns] == ["bigint"]
-     assert meta.type == OutputType.index
-     assert meta.table_column_names == []
-     assert meta.table_index_column_names == ["_idx_0"]
-     assert meta.pd_column_level_names == []
-     assert meta.pd_index_level_names == [None]
-
-     with pytest.raises(AssertionError):
-         pandas_to_odps_schema(test_idx, unknown_as_string=True, ignore_index=True)
+     for ignore_idx in (False, True):
+         schema, meta = pandas_to_odps_schema(
+             test_idx, unknown_as_string=True, ignore_index=ignore_idx
+         )
+         assert [c.name for c in schema.columns] == ["_idx_0"]
+         assert [c.type.name for c in schema.columns] == ["bigint"]
+         assert meta.type == OutputType.index
+         assert meta.table_column_names == []
+         assert meta.table_index_column_names == ["_idx_0"]
+         assert meta.pd_column_level_names == []
+         assert meta.pd_index_level_names == [None]

      data = pd.MultiIndex.from_arrays(
          [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
@@ -177,6 +177,7 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
      test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
      if wrap_obj != "no":
          test_scalar.op.data = None
+
      schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
      assert schema.columns[0].name == "_idx_0"
      assert schema.columns[0].type.name == "double"
@@ -186,9 +187,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
      assert meta.pd_column_level_names == []
      assert meta.pd_index_level_names == [None]

-     with pytest.raises(AssertionError):
-         pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
-

  def test_odps_arrow_schema_conversion():
      odps_schema = odps_types.OdpsSchema(
@@ -211,10 +209,11 @@ def test_odps_arrow_schema_conversion():
              odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
              odps_types.Column("col17", "CHAR(15)"),
              odps_types.Column("col18", "VARCHAR(15)"),
+             odps_types.Column("col19", "decimal"),
          ]
      )
      arrow_schema = odps_schema_to_arrow_schema(odps_schema)
-     assert arrow_schema.names == [f"col{i}" for i in range(1, 19)]
+     assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
      assert arrow_schema.types == [
          pa.string(),
          pa.binary(),
@@ -234,6 +233,7 @@ def test_odps_arrow_schema_conversion():
          pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
          pa.string(),
          pa.string(),
+         pa.decimal128(38, 18),
      ]

      expected_odps_schema = odps_types.OdpsSchema(
@@ -256,6 +256,7 @@ def test_odps_arrow_schema_conversion():
              odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
              odps_types.Column("col17", "string"),
              odps_types.Column("col18", "string"),
+             odps_types.Column("col19", "decimal(38, 18)"),
          ]
      )