maxframe 0.1.0b5__cp39-cp39-macosx_10_9_universal2.whl → 1.0.0rc2__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-39-darwin.so +0 -0
- maxframe/codegen.py +6 -2
- maxframe/config/config.py +38 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/__init__.py +0 -3
- maxframe/core/entity/__init__.py +1 -8
- maxframe/core/entity/objects.py +3 -45
- maxframe/core/graph/core.cpython-39-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +6 -0
- maxframe/dataframe/datasource/read_odps_table.py +2 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/tests/__init__.py +13 -0
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +21 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +23 -0
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
- maxframe/learn/contrib/xgboost/predict.py +2 -2
- maxframe/learn/contrib/xgboost/train.py +2 -2
- maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +8 -4
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +388 -14
- maxframe/odpsio/tests/test_schema.py +16 -15
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/protocol.py +148 -12
- maxframe/serialization/core.cpython-39-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +54 -25
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +7 -2
- maxframe/serialization/serializables/core.py +158 -12
- maxframe/serialization/serializables/tests/test_serializable.py +46 -4
- maxframe/tensor/__init__.py +59 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
- maxframe/tensor/base/atleast_1d.py +1 -1
- maxframe/tensor/base/unique.py +3 -3
- maxframe/tensor/reduction/count_nonzero.py +1 -1
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +11 -2
- maxframe/utils.py +24 -13
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/fetcher.py +38 -27
- maxframe_client/session/odps.py +50 -10
- maxframe_client/session/task.py +41 -20
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +49 -2
- maxframe_client/clients/spe.py +0 -104
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/odpsio/tableio.py
CHANGED
|
@@ -16,7 +16,7 @@ import os
|
|
|
16
16
|
import time
|
|
17
17
|
from abc import ABC, abstractmethod
|
|
18
18
|
from contextlib import contextmanager
|
|
19
|
-
from typing import List, Optional, Union
|
|
19
|
+
from typing import Dict, List, Optional, Union
|
|
20
20
|
|
|
21
21
|
import pyarrow as pa
|
|
22
22
|
from odps import ODPS
|
|
@@ -25,8 +25,16 @@ from odps.apis.storage_api import (
|
|
|
25
25
|
TableBatchScanResponse,
|
|
26
26
|
TableBatchWriteResponse,
|
|
27
27
|
)
|
|
28
|
-
from odps.
|
|
28
|
+
from odps.config import option_context as pyodps_option_context
|
|
29
|
+
from odps.tunnel import TableTunnel
|
|
30
|
+
from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
|
|
29
31
|
|
|
32
|
+
try:
|
|
33
|
+
import pyarrow.compute as pac
|
|
34
|
+
except ImportError:
|
|
35
|
+
pac = None
|
|
36
|
+
|
|
37
|
+
from ..config import options
|
|
30
38
|
from ..env import ODPS_STORAGE_API_ENDPOINT
|
|
31
39
|
from .schema import odps_schema_to_arrow_schema
|
|
32
40
|
|
|
@@ -35,18 +43,55 @@ PartitionsType = Union[List[str], str, None]
|
|
|
35
43
|
_DEFAULT_ROW_BATCH_SIZE = 4096
|
|
36
44
|
|
|
37
45
|
|
|
38
|
-
|
|
46
|
+
@contextmanager
|
|
47
|
+
def _sync_pyodps_timezone():
|
|
48
|
+
with pyodps_option_context() as cfg:
|
|
49
|
+
cfg.local_timezone = options.local_timezone
|
|
50
|
+
yield
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ODPSTableIO(ABC):
|
|
54
|
+
def __new__(cls, odps: ODPS):
|
|
55
|
+
if cls is ODPSTableIO:
|
|
56
|
+
if options.use_common_table:
|
|
57
|
+
return HaloTableIO(odps)
|
|
58
|
+
else:
|
|
59
|
+
return TunnelTableIO(odps)
|
|
60
|
+
return super().__new__(cls)
|
|
61
|
+
|
|
39
62
|
def __init__(self, odps: ODPS):
|
|
40
63
|
self._odps = odps
|
|
41
64
|
|
|
65
|
+
@classmethod
|
|
66
|
+
def _get_reader_schema(
|
|
67
|
+
cls,
|
|
68
|
+
table_schema: OdpsSchema,
|
|
69
|
+
columns: Optional[List[str]] = None,
|
|
70
|
+
partition_columns: Union[None, bool, List[str]] = None,
|
|
71
|
+
) -> OdpsSchema:
|
|
72
|
+
final_cols = []
|
|
73
|
+
|
|
74
|
+
columns = columns or [col.name for col in table_schema.simple_columns]
|
|
75
|
+
if partition_columns is True:
|
|
76
|
+
partition_columns = [c.name for c in table_schema.partitions]
|
|
77
|
+
else:
|
|
78
|
+
partition_columns = partition_columns or []
|
|
79
|
+
|
|
80
|
+
for col_name in columns + partition_columns:
|
|
81
|
+
final_cols.append(table_schema[col_name])
|
|
82
|
+
return OdpsSchema(final_cols)
|
|
83
|
+
|
|
42
84
|
@abstractmethod
|
|
43
85
|
def open_reader(
|
|
44
86
|
self,
|
|
45
87
|
full_table_name: str,
|
|
46
88
|
partitions: PartitionsType = None,
|
|
47
89
|
columns: Optional[List[str]] = None,
|
|
90
|
+
partition_columns: Union[None, bool, List[str]] = None,
|
|
48
91
|
start: Optional[int] = None,
|
|
49
|
-
|
|
92
|
+
stop: Optional[int] = None,
|
|
93
|
+
reverse_range: bool = False,
|
|
94
|
+
row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
|
|
50
95
|
):
|
|
51
96
|
raise NotImplementedError
|
|
52
97
|
|
|
@@ -60,12 +105,264 @@ class MCTableIO(ABC):
|
|
|
60
105
|
raise NotImplementedError
|
|
61
106
|
|
|
62
107
|
|
|
108
|
+
class TunnelMultiPartitionReader:
|
|
109
|
+
def __init__(
|
|
110
|
+
self,
|
|
111
|
+
odps_entry: ODPS,
|
|
112
|
+
table_name: str,
|
|
113
|
+
partitions: PartitionsType,
|
|
114
|
+
columns: Optional[List[str]] = None,
|
|
115
|
+
partition_columns: Optional[List[str]] = None,
|
|
116
|
+
start: Optional[int] = None,
|
|
117
|
+
count: Optional[int] = None,
|
|
118
|
+
partition_to_download_ids: Dict[str, str] = None,
|
|
119
|
+
):
|
|
120
|
+
self._odps_entry = odps_entry
|
|
121
|
+
self._table = odps_entry.get_table(table_name)
|
|
122
|
+
self._columns = columns
|
|
123
|
+
|
|
124
|
+
odps_schema = ODPSTableIO._get_reader_schema(
|
|
125
|
+
self._table.table_schema, columns, partition_columns
|
|
126
|
+
)
|
|
127
|
+
self._schema = odps_schema_to_arrow_schema(odps_schema)
|
|
128
|
+
|
|
129
|
+
self._start = start or 0
|
|
130
|
+
self._count = count
|
|
131
|
+
self._row_left = count
|
|
132
|
+
|
|
133
|
+
self._cur_reader = None
|
|
134
|
+
self._reader_iter = None
|
|
135
|
+
self._cur_partition_id = -1
|
|
136
|
+
self._reader_start_pos = 0
|
|
137
|
+
|
|
138
|
+
if partitions is None or isinstance(partitions, str):
|
|
139
|
+
self._partitions = [partitions]
|
|
140
|
+
else:
|
|
141
|
+
self._partitions = partitions
|
|
142
|
+
|
|
143
|
+
self._partition_cols = partition_columns
|
|
144
|
+
self._partition_to_download_ids = partition_to_download_ids or dict()
|
|
145
|
+
|
|
146
|
+
@property
|
|
147
|
+
def count(self) -> Optional[int]:
|
|
148
|
+
if len(self._partitions) > 1:
|
|
149
|
+
return None
|
|
150
|
+
return self._count
|
|
151
|
+
|
|
152
|
+
def _open_next_reader(self):
|
|
153
|
+
if self._cur_reader is not None:
|
|
154
|
+
self._reader_start_pos += self._cur_reader.count
|
|
155
|
+
|
|
156
|
+
if (
|
|
157
|
+
self._row_left is not None and self._row_left <= 0
|
|
158
|
+
) or 1 + self._cur_partition_id >= len(self._partitions):
|
|
159
|
+
self._cur_reader = None
|
|
160
|
+
return
|
|
161
|
+
|
|
162
|
+
while 1 + self._cur_partition_id < len(self._partitions):
|
|
163
|
+
self._cur_partition_id += 1
|
|
164
|
+
|
|
165
|
+
part_str = self._partitions[self._cur_partition_id]
|
|
166
|
+
with _sync_pyodps_timezone():
|
|
167
|
+
self._cur_reader = self._table.open_reader(
|
|
168
|
+
part_str,
|
|
169
|
+
columns=self._columns,
|
|
170
|
+
arrow=True,
|
|
171
|
+
download_id=self._partition_to_download_ids.get(part_str),
|
|
172
|
+
)
|
|
173
|
+
if self._cur_reader.count + self._reader_start_pos > self._start:
|
|
174
|
+
start = self._start - self._reader_start_pos
|
|
175
|
+
if self._count is None:
|
|
176
|
+
count = None
|
|
177
|
+
else:
|
|
178
|
+
count = min(self._count, self._cur_reader.count - start)
|
|
179
|
+
|
|
180
|
+
with _sync_pyodps_timezone():
|
|
181
|
+
self._reader_iter = self._cur_reader.read(start, count)
|
|
182
|
+
break
|
|
183
|
+
self._reader_start_pos += self._cur_reader.count
|
|
184
|
+
else:
|
|
185
|
+
self._cur_reader = None
|
|
186
|
+
|
|
187
|
+
def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
|
|
188
|
+
pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
|
|
189
|
+
|
|
190
|
+
names = list(batch.schema.names)
|
|
191
|
+
arrays = []
|
|
192
|
+
for idx in range(batch.num_columns):
|
|
193
|
+
col = batch.column(idx)
|
|
194
|
+
if isinstance(col.type, pa.TimestampType):
|
|
195
|
+
if col.type.tz is not None:
|
|
196
|
+
target_type = pa.timestamp(
|
|
197
|
+
self._schema.types[idx].unit, col.type.tz
|
|
198
|
+
)
|
|
199
|
+
arrays.append(col.cast(target_type))
|
|
200
|
+
else:
|
|
201
|
+
target_type = pa.timestamp(
|
|
202
|
+
self._schema.types[idx].unit, options.local_timezone
|
|
203
|
+
)
|
|
204
|
+
pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
|
|
205
|
+
arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
|
|
206
|
+
else:
|
|
207
|
+
arrays.append(batch.column(idx))
|
|
208
|
+
|
|
209
|
+
for part_col in self._partition_cols or []:
|
|
210
|
+
names.append(part_col)
|
|
211
|
+
col_type = self._schema.field_by_name(part_col).type
|
|
212
|
+
arrays.append(pa.array([pt_spec[part_col]] * batch.num_rows).cast(col_type))
|
|
213
|
+
return pa.RecordBatch.from_arrays(arrays, names)
|
|
214
|
+
|
|
215
|
+
def read(self):
|
|
216
|
+
with _sync_pyodps_timezone():
|
|
217
|
+
if self._cur_reader is None:
|
|
218
|
+
self._open_next_reader()
|
|
219
|
+
if self._cur_reader is None:
|
|
220
|
+
return None
|
|
221
|
+
while self._cur_reader is not None:
|
|
222
|
+
try:
|
|
223
|
+
batch = next(self._reader_iter)
|
|
224
|
+
if batch is not None:
|
|
225
|
+
if self._row_left is not None:
|
|
226
|
+
self._row_left -= batch.num_rows
|
|
227
|
+
return self._fill_batch_partition(batch)
|
|
228
|
+
except StopIteration:
|
|
229
|
+
self._open_next_reader()
|
|
230
|
+
return None
|
|
231
|
+
|
|
232
|
+
def read_all(self) -> pa.Table:
|
|
233
|
+
batches = []
|
|
234
|
+
while True:
|
|
235
|
+
batch = self.read()
|
|
236
|
+
if batch is None:
|
|
237
|
+
break
|
|
238
|
+
batches.append(batch)
|
|
239
|
+
if not batches:
|
|
240
|
+
return self._schema.empty_table()
|
|
241
|
+
return pa.Table.from_batches(batches)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class TunnelWrappedWriter:
|
|
245
|
+
def __init__(self, nested_writer):
|
|
246
|
+
self._writer = nested_writer
|
|
247
|
+
|
|
248
|
+
def write(self, data: Union[pa.RecordBatch, pa.Table]):
|
|
249
|
+
if not any(isinstance(tp, pa.TimestampType) for tp in data.schema.types):
|
|
250
|
+
self._writer.write(data)
|
|
251
|
+
return
|
|
252
|
+
pa_type = type(data)
|
|
253
|
+
arrays = []
|
|
254
|
+
for idx in range(data.num_columns):
|
|
255
|
+
name = data.schema.names[idx]
|
|
256
|
+
col = data.column(idx)
|
|
257
|
+
if not isinstance(col.type, pa.TimestampType):
|
|
258
|
+
arrays.append(col)
|
|
259
|
+
continue
|
|
260
|
+
if self._writer.schema[name].type == timestamp_ntz:
|
|
261
|
+
col = HaloTableArrowWriter._localize_timezone(col, "UTC")
|
|
262
|
+
else:
|
|
263
|
+
col = HaloTableArrowWriter._localize_timezone(col)
|
|
264
|
+
arrays.append(col)
|
|
265
|
+
data = pa_type.from_arrays(arrays, names=data.schema.names)
|
|
266
|
+
self._writer.write(data)
|
|
267
|
+
|
|
268
|
+
def __getattr__(self, item):
|
|
269
|
+
return getattr(self._writer, item)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class TunnelTableIO(ODPSTableIO):
|
|
273
|
+
@contextmanager
|
|
274
|
+
def open_reader(
|
|
275
|
+
self,
|
|
276
|
+
full_table_name: str,
|
|
277
|
+
partitions: PartitionsType = None,
|
|
278
|
+
columns: Optional[List[str]] = None,
|
|
279
|
+
partition_columns: Union[None, bool, List[str]] = None,
|
|
280
|
+
start: Optional[int] = None,
|
|
281
|
+
stop: Optional[int] = None,
|
|
282
|
+
reverse_range: bool = False,
|
|
283
|
+
row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
|
|
284
|
+
):
|
|
285
|
+
table = self._odps.get_table(full_table_name)
|
|
286
|
+
if partition_columns is True:
|
|
287
|
+
partition_columns = [c.name for c in table.table_schema.partitions]
|
|
288
|
+
|
|
289
|
+
total_records = None
|
|
290
|
+
part_to_down_id = None
|
|
291
|
+
if (
|
|
292
|
+
(start is not None and start < 0)
|
|
293
|
+
or (stop is not None and stop < 0)
|
|
294
|
+
or (reverse_range and start is None)
|
|
295
|
+
):
|
|
296
|
+
table = self._odps.get_table(full_table_name)
|
|
297
|
+
tunnel = TableTunnel(self._odps)
|
|
298
|
+
parts = (
|
|
299
|
+
[partitions]
|
|
300
|
+
if partitions is None or isinstance(partitions, str)
|
|
301
|
+
else partitions
|
|
302
|
+
)
|
|
303
|
+
part_to_down_id = dict()
|
|
304
|
+
total_records = 0
|
|
305
|
+
for part in parts:
|
|
306
|
+
down_session = tunnel.create_download_session(
|
|
307
|
+
table, async_mode=True, partition_spec=part
|
|
308
|
+
)
|
|
309
|
+
part_to_down_id[part] = down_session.id
|
|
310
|
+
total_records += down_session.count
|
|
311
|
+
|
|
312
|
+
count = None
|
|
313
|
+
if start is not None or stop is not None:
|
|
314
|
+
if reverse_range:
|
|
315
|
+
start = start if start is not None else total_records - 1
|
|
316
|
+
stop = stop if stop is not None else -1
|
|
317
|
+
else:
|
|
318
|
+
start = start if start is not None else 0
|
|
319
|
+
stop = stop if stop is not None else None
|
|
320
|
+
start = start if start >= 0 else total_records + start
|
|
321
|
+
stop = stop if stop is None or stop >= 0 else total_records + stop
|
|
322
|
+
if reverse_range:
|
|
323
|
+
count = start - stop
|
|
324
|
+
start = stop + 1
|
|
325
|
+
else:
|
|
326
|
+
count = stop - start if stop is not None and start is not None else None
|
|
327
|
+
|
|
328
|
+
yield TunnelMultiPartitionReader(
|
|
329
|
+
self._odps,
|
|
330
|
+
full_table_name,
|
|
331
|
+
partitions=partitions,
|
|
332
|
+
columns=columns,
|
|
333
|
+
partition_columns=partition_columns,
|
|
334
|
+
start=start,
|
|
335
|
+
count=count,
|
|
336
|
+
partition_to_download_ids=part_to_down_id,
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
@contextmanager
|
|
340
|
+
def open_writer(
|
|
341
|
+
self,
|
|
342
|
+
full_table_name: str,
|
|
343
|
+
partition: Optional[str] = None,
|
|
344
|
+
overwrite: bool = True,
|
|
345
|
+
):
|
|
346
|
+
table = self._odps.get_table(full_table_name)
|
|
347
|
+
with _sync_pyodps_timezone():
|
|
348
|
+
with table.open_writer(
|
|
349
|
+
partition=partition,
|
|
350
|
+
arrow=True,
|
|
351
|
+
create_partition=partition is not None,
|
|
352
|
+
overwrite=overwrite,
|
|
353
|
+
) as writer:
|
|
354
|
+
# fixme should yield writer directly once pyodps fixes
|
|
355
|
+
# related arrow timestamp bug when provided schema and
|
|
356
|
+
# table schema is identical.
|
|
357
|
+
yield TunnelWrappedWriter(writer)
|
|
358
|
+
|
|
359
|
+
|
|
63
360
|
class HaloTableArrowReader:
|
|
64
361
|
def __init__(
|
|
65
362
|
self,
|
|
66
363
|
client: StorageApiArrowClient,
|
|
67
364
|
scan_info: TableBatchScanResponse,
|
|
68
|
-
|
|
365
|
+
odps_schema: OdpsSchema,
|
|
69
366
|
start: Optional[int] = None,
|
|
70
367
|
count: Optional[int] = None,
|
|
71
368
|
row_batch_size: Optional[int] = None,
|
|
@@ -76,7 +373,8 @@ class HaloTableArrowReader:
|
|
|
76
373
|
self._cur_split_id = -1
|
|
77
374
|
self._cur_reader = None
|
|
78
375
|
|
|
79
|
-
self.
|
|
376
|
+
self._odps_schema = odps_schema
|
|
377
|
+
self._arrow_schema = odps_schema_to_arrow_schema(odps_schema)
|
|
80
378
|
|
|
81
379
|
self._start = start
|
|
82
380
|
self._count = count
|
|
@@ -115,6 +413,34 @@ class HaloTableArrowReader:
|
|
|
115
413
|
self._cur_reader = self._client.read_rows_arrow(req)
|
|
116
414
|
self._cur_split_id += 1
|
|
117
415
|
|
|
416
|
+
def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
|
|
417
|
+
timezone = options.local_timezone
|
|
418
|
+
if not any(isinstance(tp, pa.TimestampType) for tp in batch.schema.types):
|
|
419
|
+
return batch
|
|
420
|
+
|
|
421
|
+
cols = []
|
|
422
|
+
for idx in range(batch.num_columns):
|
|
423
|
+
col = batch.column(idx)
|
|
424
|
+
name = batch.schema.names[idx]
|
|
425
|
+
if not isinstance(col.type, pa.TimestampType):
|
|
426
|
+
cols.append(col)
|
|
427
|
+
continue
|
|
428
|
+
if self._odps_schema[name].type == timestamp_ntz:
|
|
429
|
+
col = col.cast(pa.timestamp(col.type.unit))
|
|
430
|
+
cols.append(col)
|
|
431
|
+
continue
|
|
432
|
+
|
|
433
|
+
if hasattr(pac, "local_timestamp"):
|
|
434
|
+
col = col.cast(pa.timestamp(col.type.unit, timezone))
|
|
435
|
+
else:
|
|
436
|
+
pd_col = col.to_pandas().dt.tz_convert(timezone)
|
|
437
|
+
col = pa.Array.from_pandas(pd_col).cast(
|
|
438
|
+
pa.timestamp(col.type.unit, timezone)
|
|
439
|
+
)
|
|
440
|
+
cols.append(col)
|
|
441
|
+
|
|
442
|
+
return pa.RecordBatch.from_arrays(cols, names=batch.schema.names)
|
|
443
|
+
|
|
118
444
|
def read(self):
|
|
119
445
|
if self._cur_reader is None:
|
|
120
446
|
self._open_next_reader()
|
|
@@ -123,7 +449,7 @@ class HaloTableArrowReader:
|
|
|
123
449
|
while self._cur_reader is not None:
|
|
124
450
|
batch = self._cur_reader.read()
|
|
125
451
|
if batch is not None:
|
|
126
|
-
return batch
|
|
452
|
+
return self._convert_timezone(batch)
|
|
127
453
|
self._open_next_reader()
|
|
128
454
|
return None
|
|
129
455
|
|
|
@@ -135,16 +461,21 @@ class HaloTableArrowReader:
|
|
|
135
461
|
break
|
|
136
462
|
batches.append(batch)
|
|
137
463
|
if not batches:
|
|
138
|
-
return self.
|
|
464
|
+
return self._arrow_schema.empty_table()
|
|
139
465
|
return pa.Table.from_batches(batches)
|
|
140
466
|
|
|
141
467
|
|
|
142
468
|
class HaloTableArrowWriter:
|
|
143
469
|
def __init__(
|
|
144
|
-
self,
|
|
470
|
+
self,
|
|
471
|
+
client: StorageApiArrowClient,
|
|
472
|
+
write_info: TableBatchWriteResponse,
|
|
473
|
+
odps_schema: OdpsSchema,
|
|
145
474
|
):
|
|
146
475
|
self._client = client
|
|
147
476
|
self._write_info = write_info
|
|
477
|
+
self._odps_schema = odps_schema
|
|
478
|
+
self._arrow_schema = odps_schema_to_arrow_schema(odps_schema)
|
|
148
479
|
|
|
149
480
|
self._writer = None
|
|
150
481
|
|
|
@@ -155,12 +486,52 @@ class HaloTableArrowWriter:
|
|
|
155
486
|
WriteRowsRequest(self._write_info.session_id)
|
|
156
487
|
)
|
|
157
488
|
|
|
489
|
+
@classmethod
|
|
490
|
+
def _localize_timezone(cls, col, tz=None):
|
|
491
|
+
from odps.lib import tzlocal
|
|
492
|
+
|
|
493
|
+
if tz is None:
|
|
494
|
+
if options.local_timezone is None:
|
|
495
|
+
tz = str(tzlocal.get_localzone())
|
|
496
|
+
else:
|
|
497
|
+
tz = str(options.local_timezone)
|
|
498
|
+
|
|
499
|
+
if col.type.tz is not None:
|
|
500
|
+
return col
|
|
501
|
+
if hasattr(pac, "assume_timezone"):
|
|
502
|
+
col = pac.assume_timezone(col, tz)
|
|
503
|
+
return col
|
|
504
|
+
else:
|
|
505
|
+
col = col.to_pandas()
|
|
506
|
+
return pa.Array.from_pandas(col.dt.tz_localize(tz))
|
|
507
|
+
|
|
508
|
+
def _convert_schema(self, batch: pa.RecordBatch):
|
|
509
|
+
if batch.schema == self._arrow_schema and not any(
|
|
510
|
+
isinstance(tp, pa.TimestampType) for tp in self._arrow_schema.types
|
|
511
|
+
):
|
|
512
|
+
return batch
|
|
513
|
+
cols = []
|
|
514
|
+
for idx in range(batch.num_columns):
|
|
515
|
+
col = batch.column(idx)
|
|
516
|
+
name = batch.schema.names[idx]
|
|
517
|
+
|
|
518
|
+
if isinstance(col.type, pa.TimestampType):
|
|
519
|
+
if self._odps_schema[name].type == timestamp_ntz:
|
|
520
|
+
col = self._localize_timezone(col, "UTC")
|
|
521
|
+
else:
|
|
522
|
+
col = self._localize_timezone(col)
|
|
523
|
+
|
|
524
|
+
if col.type != self._arrow_schema.types[idx]:
|
|
525
|
+
col = col.cast(self._arrow_schema.types[idx])
|
|
526
|
+
cols.append(col)
|
|
527
|
+
return pa.RecordBatch.from_arrays(cols, names=batch.schema.names)
|
|
528
|
+
|
|
158
529
|
def write(self, batch):
|
|
159
530
|
if isinstance(batch, pa.Table):
|
|
160
531
|
for b in batch.to_batches():
|
|
161
|
-
self._writer.write(b)
|
|
532
|
+
self._writer.write(self._convert_schema(b))
|
|
162
533
|
else:
|
|
163
|
-
self._writer.write(batch)
|
|
534
|
+
self._writer.write(self._convert_schema(batch))
|
|
164
535
|
|
|
165
536
|
def close(self):
|
|
166
537
|
commit_msg, is_success = self._writer.finish()
|
|
@@ -169,7 +540,7 @@ class HaloTableArrowWriter:
|
|
|
169
540
|
return commit_msg
|
|
170
541
|
|
|
171
542
|
|
|
172
|
-
class HaloTableIO(MCTableIO):
|
|
543
|
+
class HaloTableIO(ODPSTableIO):
|
|
173
544
|
_storage_api_endpoint = os.getenv(ODPS_STORAGE_API_ENDPOINT)
|
|
174
545
|
|
|
175
546
|
@staticmethod
|
|
@@ -275,10 +646,13 @@ class HaloTableIO(MCTableIO):
|
|
|
275
646
|
else:
|
|
276
647
|
count = stop - start
|
|
277
648
|
|
|
649
|
+
reader_schema = self._get_reader_schema(
|
|
650
|
+
table.table_schema, columns, partition_columns
|
|
651
|
+
)
|
|
278
652
|
yield HaloTableArrowReader(
|
|
279
653
|
client,
|
|
280
654
|
resp,
|
|
281
|
-
|
|
655
|
+
odps_schema=reader_schema,
|
|
282
656
|
start=start,
|
|
283
657
|
count=count,
|
|
284
658
|
row_batch_size=row_batch_size,
|
|
@@ -308,7 +682,7 @@ class HaloTableIO(MCTableIO):
|
|
|
308
682
|
resp = client.create_write_session(req)
|
|
309
683
|
|
|
310
684
|
session_id = resp.session_id
|
|
311
|
-
writer = HaloTableArrowWriter(client, resp)
|
|
685
|
+
writer = HaloTableArrowWriter(client, resp, table.table_schema)
|
|
312
686
|
writer.open()
|
|
313
687
|
|
|
314
688
|
yield writer
|
|
@@ -143,17 +143,17 @@ def test_pandas_to_odps_schema_index(wrap_obj):
|
|
|
143
143
|
data = pd.Index(np.random.randint(0, 100, 100))
|
|
144
144
|
|
|
145
145
|
test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
146
|
+
for ignore_idx in (False, True):
|
|
147
|
+
schema, meta = pandas_to_odps_schema(
|
|
148
|
+
test_idx, unknown_as_string=True, ignore_index=ignore_idx
|
|
149
|
+
)
|
|
150
|
+
assert [c.name for c in schema.columns] == ["_idx_0"]
|
|
151
|
+
assert [c.type.name for c in schema.columns] == ["bigint"]
|
|
152
|
+
assert meta.type == OutputType.index
|
|
153
|
+
assert meta.table_column_names == []
|
|
154
|
+
assert meta.table_index_column_names == ["_idx_0"]
|
|
155
|
+
assert meta.pd_column_level_names == []
|
|
156
|
+
assert meta.pd_index_level_names == [None]
|
|
157
157
|
|
|
158
158
|
data = pd.MultiIndex.from_arrays(
|
|
159
159
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
@@ -177,6 +177,7 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
177
177
|
test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
178
178
|
if wrap_obj != "no":
|
|
179
179
|
test_scalar.op.data = None
|
|
180
|
+
|
|
180
181
|
schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
|
|
181
182
|
assert schema.columns[0].name == "_idx_0"
|
|
182
183
|
assert schema.columns[0].type.name == "double"
|
|
@@ -186,9 +187,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
186
187
|
assert meta.pd_column_level_names == []
|
|
187
188
|
assert meta.pd_index_level_names == [None]
|
|
188
189
|
|
|
189
|
-
with pytest.raises(AssertionError):
|
|
190
|
-
pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
|
|
191
|
-
|
|
192
190
|
|
|
193
191
|
def test_odps_arrow_schema_conversion():
|
|
194
192
|
odps_schema = odps_types.OdpsSchema(
|
|
@@ -211,10 +209,11 @@ def test_odps_arrow_schema_conversion():
|
|
|
211
209
|
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
212
210
|
odps_types.Column("col17", "CHAR(15)"),
|
|
213
211
|
odps_types.Column("col18", "VARCHAR(15)"),
|
|
212
|
+
odps_types.Column("col19", "decimal"),
|
|
214
213
|
]
|
|
215
214
|
)
|
|
216
215
|
arrow_schema = odps_schema_to_arrow_schema(odps_schema)
|
|
217
|
-
assert arrow_schema.names == [f"col{i}" for i in range(1, 19)]
|
|
216
|
+
assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
|
|
218
217
|
assert arrow_schema.types == [
|
|
219
218
|
pa.string(),
|
|
220
219
|
pa.binary(),
|
|
@@ -234,6 +233,7 @@ def test_odps_arrow_schema_conversion():
|
|
|
234
233
|
pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
|
|
235
234
|
pa.string(),
|
|
236
235
|
pa.string(),
|
|
236
|
+
pa.decimal128(38, 18),
|
|
237
237
|
]
|
|
238
238
|
|
|
239
239
|
expected_odps_schema = odps_types.OdpsSchema(
|
|
@@ -256,6 +256,7 @@ def test_odps_arrow_schema_conversion():
|
|
|
256
256
|
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
257
257
|
odps_types.Column("col17", "string"),
|
|
258
258
|
odps_types.Column("col18", "string"),
|
|
259
|
+
odps_types.Column("col19", "decimal(38, 18)"),
|
|
259
260
|
]
|
|
260
261
|
)
|
|
261
262
|
|