maxframe-1.0.0rc4-cp39-cp39-win32.whl → maxframe-1.1.1-cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe has been flagged as possibly problematic by the registry.
- maxframe/_utils.cp39-win32.pyd +0 -0
- maxframe/config/__init__.py +1 -1
- maxframe/config/config.py +26 -0
- maxframe/config/tests/test_config.py +20 -1
- maxframe/conftest.py +17 -4
- maxframe/core/graph/core.cp39-win32.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +65 -35
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/__init__.py +1 -1
- maxframe/io/odpsio/arrow.py +51 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +80 -124
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/mmh3.cp39-win32.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cp39-win32.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +70 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +12 -2
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/merge/vstack.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +42 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +23 -8
- maxframe_client/session/odps.py +40 -11
- maxframe_client/session/task.py +6 -25
- maxframe_client/session/tests/test_task.py +35 -6
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/io/odpsio/tableio.py
CHANGED
@@ -15,20 +15,20 @@
 import os
 import time
 from abc import ABC, abstractmethod
+from collections import OrderedDict
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Union

-import numpy as np
 import pyarrow as pa
 from odps import ODPS
-from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.tunnel import TableTunnel
+from odps.tunnel import TableDownloadSession, TableDownloadStatus, TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
+from odps.utils import call_with_retry

 try:
     import pyarrow.compute as pac
@@ -37,20 +37,19 @@ except ImportError:

 from ...config import options
 from ...env import ODPS_STORAGE_API_ENDPOINT
-from ...
-from ...utils import sync_pyodps_options
+from ...utils import is_empty, sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema

 PartitionsType = Union[List[str], str, None]

 _DEFAULT_ROW_BATCH_SIZE = 4096
-
+_DOWNLOAD_ID_CACHE_SIZE = 100


 class ODPSTableIO(ABC):
     def __new__(cls, odps: ODPS):
         if cls is ODPSTableIO:
-            if options.use_common_table:
+            if options.use_common_table or ODPS_STORAGE_API_ENDPOINT in os.environ:
                 return HaloTableIO(odps)
             else:
                 return TunnelTableIO(odps)
@@ -68,7 +67,11 @@ class ODPSTableIO(ABC):
     ) -> OdpsSchema:
         final_cols = []

-        columns =
+        columns = (
+            columns
+            if not is_empty(columns)
+            else [col.name for col in table_schema.simple_columns]
+        )
         if partition_columns is True:
             partition_columns = [c.name for c in table_schema.partitions]
         else:
@@ -132,7 +135,12 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id = -1
         self._reader_start_pos = 0

-        if partitions is None
+        if partitions is None:
+            if not self._table.table_schema.partitions:
+                self._partitions = [None]
+            else:
+                self._partitions = [str(pt) for pt in self._table.partitions]
+        elif isinstance(partitions, str):
             self._partitions = [partitions]
         else:
             self._partitions = partitions
@@ -160,17 +168,14 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id += 1

         part_str = self._partitions[self._cur_partition_id]
-
-        # todo make this more formal when PyODPS 0.12.0 is released
-        req_columns = self._columns
-        if not _need_patch_batch:
-            req_columns = self._schema.names
+        req_columns = self._schema.names
         with sync_pyodps_options():
             self._cur_reader = self._table.open_reader(
                 part_str,
                 columns=req_columns,
                 arrow=True,
                 download_id=self._partition_to_download_ids.get(part_str),
+                append_partitions=True,
             )
             if self._cur_reader.count + self._reader_start_pos > self._start:
                 start = self._start - self._reader_start_pos
@@ -186,35 +191,6 @@ class TunnelMultiPartitionReader:
             else:
                 self._cur_reader = None

-    def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
-        pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
-
-        names = list(batch.schema.names)
-        arrays = []
-        for idx in range(batch.num_columns):
-            col = batch.column(idx)
-            if isinstance(col.type, pa.TimestampType):
-                if col.type.tz is not None:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, col.type.tz
-                    )
-                    arrays.append(col.cast(target_type))
-                else:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, options.local_timezone
-                    )
-                    pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
-                    arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
-            else:
-                arrays.append(batch.column(idx))
-
-        for part_col in self._partition_cols or []:
-            names.append(part_col)
-            col_type = self._schema.field_by_name(part_col).type
-            pt_col = np.repeat([pt_spec[part_col]], batch.num_rows)
-            arrays.append(pa.array(pt_col).cast(col_type))
-        return pa.RecordBatch.from_arrays(arrays, names)
-
     def read(self):
         with sync_pyodps_options():
             if self._cur_reader is None:
@@ -227,10 +203,7 @@ class TunnelMultiPartitionReader:
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-
-                    return self._fill_batch_partition(batch)
-                else:
-                    return batch
+                    return batch
             except StopIteration:
                 self._open_next_reader()
                 return None
@@ -247,35 +220,47 @@ class TunnelMultiPartitionReader:
         return pa.Table.from_batches(batches)


-class
-
-        self._writer = nested_writer
+class TunnelTableIO(ODPSTableIO):
+    _down_session_ids = OrderedDict()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @classmethod
+    def create_download_sessions(
+        cls,
+        odps_entry: ODPS,
+        full_table_name: str,
+        partitions: List[Optional[str]] = None,
+    ) -> Dict[Optional[str], TableDownloadSession]:
+        table = odps_entry.get_table(full_table_name)
+        tunnel = TableTunnel(odps_entry)
+        parts = (
+            [partitions]
+            if partitions is None or isinstance(partitions, str)
+            else partitions
+        )
+        part_to_session = dict()
+        for part in parts:
+            part_key = (full_table_name, part)
+            down_session = None
+
+            if part_key in cls._down_session_ids:
+                down_id = cls._down_session_ids[part_key]
+                down_session = tunnel.create_download_session(
+                    table, async_mode=True, partition_spec=part, download_id=down_id
+                )
+                if down_session.status != TableDownloadStatus.Normal:
+                    down_session = None

-
-
+            if down_session is None:
+                down_session = tunnel.create_download_session(
+                    table, async_mode=True, partition_spec=part
+                )

+            while len(cls._down_session_ids) >= _DOWNLOAD_ID_CACHE_SIZE:
+                cls._down_session_ids.popitem(False)
+            cls._down_session_ids[part_key] = down_session.id
+            part_to_session[part] = down_session
+        return part_to_session

-class TunnelTableIO(ODPSTableIO):
     @contextmanager
     def open_reader(
         self,
@@ -302,21 +287,15 @@ class TunnelTableIO(ODPSTableIO):
             or (reverse_range and start is None)
         ):
             with sync_pyodps_options():
-
-
-
-
-
-
+                tunnel_sessions = self.create_download_sessions(
+                    self._odps, full_table_name, partitions
+                )
+                part_to_down_id = {
+                    pt: session.id for (pt, session) in tunnel_sessions.items()
+                }
+                total_records = sum(
+                    session.count for session in tunnel_sessions.values()
                 )
-                part_to_down_id = dict()
-                total_records = 0
-                for part in parts:
-                    down_session = tunnel.create_download_session(
-                        table, async_mode=True, partition_spec=part
-                    )
-                    part_to_down_id[part] = down_session.id
-                    total_records += down_session.count

             count = None
             if start is not None or stop is not None:
@@ -360,13 +339,7 @@ class TunnelTableIO(ODPSTableIO):
             create_partition=partition is not None,
             overwrite=overwrite,
         ) as writer:
-
-            # related arrow timestamp bug when provided schema and
-            # table schema is identical.
-            if _need_patch_batch:
-                yield TunnelWrappedWriter(writer)
-            else:
-                yield writer
+            yield writer


 class HaloTableArrowReader:
@@ -422,7 +395,7 @@ class HaloTableArrowReader:
             split_index=self._cur_split_id + 1,
             **read_rows_kw,
         )
-        self._cur_reader = self._client.read_rows_arrow
+        self._cur_reader = call_with_retry(self._client.read_rows_arrow, req)
         self._cur_split_id += 1

     def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
@@ -494,8 +467,9 @@ class HaloTableArrowWriter:
     def open(self):
         from odps.apis.storage_api import WriteRowsRequest

-        self._writer =
-
+        self._writer = call_with_retry(
+            self._client.write_rows_arrow,
+            WriteRowsRequest(self._write_info.session_id),
         )

     @classmethod
@@ -566,28 +540,6 @@ class HaloTableIO(ODPSTableIO):
             for pt in partitions
         ]

-    def get_table_record_count(
-        self, full_table_name: str, partitions: PartitionsType = None
-    ):
-        from odps.apis.storage_api import SplitOptions, TableBatchScanRequest
-
-        table = self._odps.get_table(full_table_name)
-        client = StorageApiArrowClient(
-            self._odps, table, rest_endpoint=self._storage_api_endpoint
-        )
-
-        split_option = SplitOptions.SplitMode.SIZE
-
-        scan_kw = {
-            "required_partitions": self._convert_partitions(partitions),
-            "split_options": SplitOptions.get_default_options(split_option),
-        }
-
-        # todo add more options for partition column handling
-        req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session(req)
-        return resp.record_count
-
     @contextmanager
     def open_reader(
         self,
@@ -631,12 +583,12 @@ class HaloTableIO(ODPSTableIO):

         # todo add more options for partition column handling
         req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session
+        resp = call_with_retry(client.create_read_session, req)

         session_id = resp.session_id
         status = resp.session_status
         while status == SessionStatus.INIT:
-            resp = client.get_read_session
+            resp = call_with_retry(client.get_read_session, SessionRequest(session_id))
             status = resp.session_status
             time.sleep(1.0)

@@ -691,7 +643,7 @@ class HaloTableIO(ODPSTableIO):
         part_strs = self._convert_partitions(partition)
         part_str = part_strs[0] if part_strs else None
         req = TableBatchWriteRequest(partition_spec=part_str, overwrite=overwrite)
-        resp = client.create_write_session
+        resp = call_with_retry(client.create_write_session, req)

         session_id = resp.session_id
         writer = HaloTableArrowWriter(client, resp, table.table_schema)
@@ -700,9 +652,13 @@ class HaloTableIO(ODPSTableIO):
         yield writer

         commit_msg = writer.close()
-        resp =
-
+        resp = call_with_retry(
+            client.commit_write_session,
+            SessionRequest(session_id=session_id),
+            [commit_msg],
         )
         while resp.session_status == SessionStatus.COMMITTING:
-            resp =
+            resp = call_with_retry(
+                client.get_write_session, SessionRequest(session_id=session_id)
+            )
         assert resp.session_status == SessionStatus.COMMITTED
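Two patterns recur throughout the tableio.py changes above. First, every Storage API call (create_read_session, read_rows_arrow, the write-session commit) is now wrapped in PyODPS's call_with_retry instead of being invoked directly. Second, TunnelTableIO keeps a bounded FIFO cache of tunnel download-session IDs keyed by (table, partition), so repeated reads reuse an existing server-side session. A minimal standalone sketch of that cache pattern (the names here are illustrative, not part of maxframe's API):

from collections import OrderedDict

_CACHE_SIZE = 100  # mirrors _DOWNLOAD_ID_CACHE_SIZE in the diff above
_session_ids = OrderedDict()  # (table_name, partition) -> download session id


def remember_session(table_name, partition, session_id):
    # Evict the oldest entries first once the cache is full, matching
    # the popitem(False) call in create_download_sessions.
    while len(_session_ids) >= _CACHE_SIZE:
        _session_ids.popitem(last=False)
    _session_ids[(table_name, partition)] = session_id


def lookup_session(table_name, partition):
    # A miss returns None, which forces a fresh download session, just as
    # a cached session whose status is no longer Normal does above.
    return _session_ids.get((table_name, partition))

Using OrderedDict.popitem(last=False) evicts the oldest insertion first, keeping the cache bounded without an external LRU implementation.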
maxframe/io/odpsio/tests/test_schema.py
CHANGED
@@ -21,6 +21,7 @@ from odps import types as odps_types
 from .... import dataframe as md
 from .... import tensor as mt
 from ....core import OutputType
+from ....utils import pd_release_version
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -292,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
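The new test relies on non-nanosecond datetime resolutions (datetime64[ms]), which pandas only supports from 2.0 onward; hence the skipif guard on pd_release_version. A quick standalone sanity check of the dtype behavior the test depends on (not part of the test suite itself):

import pandas as pd

# pandas 2.0+ preserves second/milli/microsecond resolutions; pandas 1.x
# only supports nanosecond precision and will not yield a ms-resolution
# series here.
s = pd.Series(["2024-10-01 11:23:12"]).astype("datetime64[ms]")
assert str(s.dtype) == "datetime64[ms]"
assert s.dt.year.iloc[0] == 2024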
maxframe/io/odpsio/tests/test_tableio.py
CHANGED
@@ -31,7 +31,7 @@ def switch_table_io(request):
     old_use_common_table = options.use_common_table
     try:
         options.use_common_table = request.param
-        yield
+        yield request.param
     finally:
         options.use_common_table = old_use_common_table

@@ -45,7 +45,7 @@ def test_empty_table_io(switch_table_io):
     table_io = ODPSTableIO(o)

     # test read from empty table
-    empty_table_name = tn("
+    empty_table_name = tn("test_empty_table_halo_read_" + str(switch_table_io).lower())
     o.delete_table(empty_table_name, if_exists=True)
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)

@@ -65,7 +65,7 @@ def test_table_io_without_parts(switch_table_io):
     table_io = ODPSTableIO(o)

     # test read and write tables without partition
-    no_part_table_name = tn("
+    no_part_table_name = tn("test_no_part_halo_write_" + str(switch_table_io).lower())
     o.delete_table(no_part_table_name, if_exists=True)
     col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
     tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
@@ -99,7 +99,7 @@ def test_table_io_with_range_reader(switch_table_io):
     table_io = ODPSTableIO(o)

     # test read and write tables without partition
-    no_part_table_name = tn("
+    no_part_table_name = tn("test_halo_write_range_" + str(switch_table_io).lower())
     o.delete_table(no_part_table_name, if_exists=True)
     tb = o.create_table(
         no_part_table_name, ",".join(f"{c} double" for c in "abcde"), lifecycle=1
@@ -139,7 +139,7 @@ def test_table_io_with_parts(switch_table_io):
     table_io = ODPSTableIO(o)

     # test read and write tables with partition
-    parted_table_name = tn("
+    parted_table_name = tn("test_parted_halo_write_" + str(switch_table_io).lower())
     o.delete_table(parted_table_name, if_exists=True)
     tb = o.create_table(
         parted_table_name,
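The switch_table_io fixture change above (yield → yield request.param) is what lets every test below suffix its table name with the active parametrization, so the Halo and tunnel variants of a test never collide on the same MaxCompute table. The pattern in isolation (a sketch with placeholder names, independent of maxframe):

import pytest


@pytest.fixture(params=[False, True])
def switch_backend(request):
    # Yielding the parameter exposes the active backend flag to the test,
    # instead of only toggling it as a side effect.
    yield request.param


def test_backend_specific_name(switch_backend):
    # Each parametrization builds a distinct resource name.
    table_name = "test_table_" + str(switch_backend).lower()
    assert table_name in ("test_table_false", "test_table_true")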
maxframe/io/odpsio/tests/test_volumeio.py
CHANGED
@@ -42,15 +42,33 @@ def create_volume(request, oss_config):
         oss_bucket_name,
         oss_endpoint,
     ) = oss_config.oss_config
-
-
-
-
-
-
-
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn
+
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )
     try:
         yield test_vol_name
     finally:
@@ -75,13 +93,19 @@ def test_read_write_volume(create_volume):

     odps_entry = ODPS.from_environments()

-    writer = ODPSVolumeWriter(
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, test_vol_dir, replace_internal_host=True
+    )

-    writer = ODPSVolumeWriter(
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, test_vol_dir, replace_internal_host=True
+    )
     writer.write_file("file1", b"content1")
     writer.write_file("file2", b"content2")

-    reader = ODPSVolumeReader(
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, test_vol_dir, replace_internal_host=True
+    )
     assert reader.read_file("file1") == b"content1"
     assert reader.read_file("file2") == b"content2"

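The create_volume fixture above distinguishes an offline test endpoint from an online one, and in the online case rewrites the OSS endpoint to its internal variant before building the volume location. That rewrite is pure string handling and can be checked on its own (a sketch; the endpoint value is a made-up example):

def to_internal_endpoint(endpoint: str) -> str:
    # Insert "-internal" before the first dot unless it is already present,
    # e.g. "oss-cn-hangzhou.aliyuncs.com" -> "oss-cn-hangzhou-internal.aliyuncs.com".
    parts = endpoint.split(".", 1)
    if "-internal" not in parts[0]:
        parts[0] += "-internal"
    return ".".join(parts)


assert to_internal_endpoint("oss-cn-hangzhou.aliyuncs.com") == (
    "oss-cn-hangzhou-internal.aliyuncs.com"
)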
maxframe/io/odpsio/volumeio.py
CHANGED
@@ -16,13 +16,25 @@ import inspect
 from typing import Iterator, List, Optional, Union

 from odps import ODPS
+from odps import __version__ as pyodps_version
+
+from ...lib.version import Version
+
+_has_replace_internal_host = Version(pyodps_version) >= Version("0.12.0")


 class ODPSVolumeReader:
-    def __init__(
+    def __init__(
+        self,
+        odps_entry: ODPS,
+        volume_name: str,
+        volume_dir: str,
+        replace_internal_host: bool = False,
+    ):
         self._odps_entry = odps_entry
         self._volume = odps_entry.get_volume(volume_name)
         self._volume_dir = volume_dir
+        self._replace_internal_host = replace_internal_host

     def list_files(self) -> List[str]:
         def _get_file_name(vol_file):
@@ -38,7 +50,12 @@ class ODPSVolumeReader:
         ]

     def read_file(self, file_name: str) -> bytes:
-
+        kw = {}
+        if _has_replace_internal_host and self._replace_internal_host:
+            kw = {"replace_internal_host": self._replace_internal_host}
+        with self._volume.open_reader(
+            self._volume_dir + "/" + file_name, **kw
+        ) as reader:
             return reader.read()


@@ -49,13 +66,20 @@ class ODPSVolumeWriter:
         volume_name: str,
         volume_dir: str,
         schema_name: Optional[str] = None,
+        replace_internal_host: bool = False,
     ):
         self._odps_entry = odps_entry
         self._volume = odps_entry.get_volume(volume_name, schema=schema_name)
         self._volume_dir = volume_dir
+        self._replace_internal_host = replace_internal_host

     def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
-
+        kw = {}
+        if _has_replace_internal_host and self._replace_internal_host:
+            kw = {"replace_internal_host": self._replace_internal_host}
+        with self._volume.open_writer(
+            self._volume_dir + "/" + file_name, **kw
+        ) as writer:
             if not inspect.isgenerator(data):
                 writer.write(data)
             else:
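volumeio.py now feature-detects the installed PyODPS by version and only forwards replace_internal_host when the client can accept it, so older PyODPS releases do not fail with an unexpected-keyword TypeError. The guard-then-kwargs idiom in isolation (a sketch; open_resource is a stand-in for the open_reader/open_writer wrappers above):

from odps import __version__ as pyodps_version

from maxframe.lib.version import Version

# PyODPS accepts replace_internal_host starting from 0.12.0.
_has_replace_internal_host = Version(pyodps_version) >= Version("0.12.0")


def open_resource(volume, path, replace_internal_host=False):
    kw = {}
    if _has_replace_internal_host and replace_internal_host:
        # Pass the kwarg only when the installed client understands it.
        kw["replace_internal_host"] = replace_internal_host
    return volume.open_reader(path, **kw)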
maxframe/learn/contrib/llm/__init__.py
ADDED
@@ -0,0 +1,16 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import models, multi_modal, text
+
+del models
maxframe/learn/contrib/llm/core.py
ADDED
@@ -0,0 +1,54 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict
+
+import numpy as np
+import pandas as pd
+
+from ....core.entity.output_types import OutputType
+from ....core.operator.base import Operator
+from ....core.operator.core import TileableOperatorMixin
+from ....dataframe.utils import parse_index
+from ....serialization.serializables.core import Serializable
+from ....serialization.serializables.field import AnyField, DictField, StringField
+
+
+class LLM(Serializable):
+    name = StringField("name", default=None)
+
+    def validate_params(self, params: Dict[str, Any]):
+        pass
+
+
+class LLMOperator(Operator, TileableOperatorMixin):
+    model = AnyField("model", default=None)
+    prompt_template = AnyField("prompt_template", default=None)
+    params = DictField("params", default=None)
+
+    def __init__(self, output_types=None, **kw):
+        if output_types is None:
+            output_types = [OutputType.dataframe]
+        super().__init__(_output_types=output_types, **kw)
+
+    def __call__(self, data):
+        col_names = ["response", "success"]
+        columns = parse_index(pd.Index(col_names), store_data=True)
+        out_dtypes = pd.Series([np.dtype("O"), np.dtype("bool")], index=col_names)
+        return self.new_tileable(
+            inputs=[data],
+            dtypes=out_dtypes,
+            shape=(data.shape[0], len(col_names)),
+            index_value=data.index_value,
+            columns_value=columns,
+        )
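LLMOperator.__call__ above fixes the output schema ahead of execution: whatever the input frame looks like, the result is declared as a two-column DataFrame with an object-typed response and a boolean success flag, indexed like the input. In plain pandas terms, a result frame is shaped like this (values are illustrative only):

import numpy as np
import pandas as pd

# Mirror of the dtypes declared in LLMOperator.__call__.
result = pd.DataFrame(
    {
        "response": pd.Series(["generated text", None], dtype=np.dtype("O")),
        "success": pd.Series([True, False], dtype=np.dtype("bool")),
    }
)
assert list(result.columns) == ["response", "success"]
assert result.dtypes["success"] == np.dtype("bool")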
maxframe/learn/contrib/llm/models/__init__.py
ADDED
@@ -0,0 +1,14 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .dashscope import DashScopeMultiModalLLM, DashScopeTextLLM