maxframe-1.0.0rc3-cp311-cp311-macosx_10_9_universal2.whl → maxframe-1.1.0-cp311-cp311-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +16 -1
- maxframe/conftest.py +52 -14
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +26 -2
- maxframe/dataframe/datasource/read_odps_query.py +116 -28
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +8 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +314 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +23 -2
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +28 -8
- maxframe/io/odpsio/tableio.py +55 -133
- maxframe/io/odpsio/tests/test_schema.py +40 -4
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +36 -6
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +10 -1
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +64 -14
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +28 -10
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +104 -20
- maxframe_client/session/task.py +42 -26
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +44 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/dataframe/window/expanding.py CHANGED

@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
 from .core import Window
 
 _window_has_method = pd_release_version >= (1, 3, 0)
+_window_has_center = pd_release_version < (2, 0, 0)
 
 
 class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
     def params(self):
         p = OrderedDict()
 
+        args = ["min_periods", "center", "axis", "method"]
         if not _window_has_method:  # pragma: no cover
-            args = [
-
-            args = [
+            args = [a for a in args if a != "method"]
+        if not _window_has_center:
+            args = [a for a in args if a != "center"]
 
         for k in args:
             p[k] = getattr(self, k)
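The two module-level flags gate which expanding-window arguments are exposed on a given pandas release. A minimal standalone sketch of the same filtering logic (the function name and the simplified version parsing here are illustrative, not part of maxframe):

import pandas as pd
from collections import OrderedDict

# Parse only major.minor; mirrors the gates in expanding.py above.
pd_release_version = tuple(int(p) for p in pd.__version__.split(".")[:2])
_window_has_method = pd_release_version >= (1, 3)
_window_has_center = pd_release_version < (2, 0)

def collect_expanding_params(obj):
    # Start from the full argument list and drop what the installed pandas lacks.
    args = ["min_periods", "center", "axis", "method"]
    if not _window_has_method:
        args = [a for a in args if a != "method"]
    if not _window_has_center:
        args = [a for a in args if a != "center"]
    p = OrderedDict()
    for k in args:
        p[k] = getattr(obj, k, None)
    return p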
maxframe/dataframe/window/tests/test_expanding.py CHANGED

@@ -29,8 +29,8 @@ def test_expanding():
     with pytest.raises(NotImplementedError):
         _ = df2.expanding(3, axis=1)
 
-    r = df2.expanding(3
-    expected = df.expanding(3
+    r = df2.expanding(3)
+    expected = df.expanding(3)
     assert repr(r) == repr(expected)
 
     assert "b" in dir(r)
maxframe/io/objects/tests/test_object_io.py CHANGED

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pytest
 from odps import ODPS
@@ -48,15 +49,33 @@ def create_volume(request, oss_config):
         oss_bucket_name,
         oss_endpoint,
     ) = oss_config.oss_config
-
-
-
-
-
-
-
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn
+
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )
 
     try:
         yield test_vol_name
@@ -75,8 +94,12 @@ def test_simple_object_io(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    reader = ODPSVolumeReader(
-
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
 
     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
@@ -89,8 +112,12 @@ def test_tensor_object_io(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    reader = ODPSVolumeReader(
-
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
 
     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
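The online branch of the fixture rewrites the OSS endpoint to its intranet variant before composing the external-volume location. The same string handling can be exercised on its own (a minimal sketch; endpoint, bucket and directory values are placeholders):

def build_online_oss_location(oss_endpoint, bucket_name, dir_name):
    # Switch the endpoint host to its "-internal" form if it is not already.
    endpoint_parts = oss_endpoint.split(".", 1)
    if "-internal" not in endpoint_parts[0]:
        endpoint_parts[0] += "-internal"
    return "oss://%s/%s/%s" % (".".join(endpoint_parts), bucket_name, dir_name)

print(build_online_oss_location("oss-cn-hangzhou.aliyuncs.com", "my-bucket", "maxframe-test"))
# -> oss://oss-cn-hangzhou-internal.aliyuncs.com/my-bucket/maxframe-test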
maxframe/io/odpsio/arrow.py CHANGED

@@ -69,13 +69,24 @@ def arrow_to_pandas(
 
 
 def pandas_to_arrow(
-    df: Any, nthreads=1, ignore_index=False
+    df: Any, nthreads=1, ignore_index=False, ms_cols=None
 ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
     table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
+    table_datetime_cols = None
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
+            if ms_cols:
+                table_datetime_cols = {"_data"}
+        elif ms_cols:
+            ms_col_set = set(ms_cols)
+            table_datetime_cols = set()
+            for pd_col, table_col in zip(
+                table_meta.pd_column_dtypes.keys(), table_meta.table_column_names
+            ):
+                if pd_col in ms_col_set:
+                    table_datetime_cols.add(table_col)
         df.columns = pd.Index(table_meta.table_column_names)
         if not ignore_index:
             df = df.rename_axis(table_meta.table_index_column_names).reset_index()
@@ -83,6 +94,12 @@ def pandas_to_arrow(
             df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
+        table_datetime_cols = set()
+        if ms_cols:
+            if isinstance(df, pd.MultiIndex):
+                table_datetime_cols = {f"_idx_{idx}" for idx in ms_cols}
+            else:
+                table_datetime_cols = {"_idx_0"}
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
     elif table_meta.type == OutputType.scalar:
         names = ["_idx_0"]
@@ -92,4 +109,15 @@ def pandas_to_arrow(
         df = pd.DataFrame([[df]], columns=names)
     else:  # this could never happen  # pragma: no cover
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
-
+    pa_table = pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False)
+    if table_datetime_cols:
+        col_names = pa_table.schema.names
+        col_datas = []
+        for idx, col_name in enumerate(pa_table.schema.names):
+            if col_name not in table_datetime_cols:
+                col_datas.append(pa_table.column(idx))
+                continue
+            col_data = pa_table.column(idx).cast(pa.timestamp("ms"))
+            col_datas.append(col_data)
+        pa_table = pa.Table.from_arrays(col_datas, names=col_names)
+    return pa_table, table_meta
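The tail of pandas_to_arrow now rebuilds the Arrow table so that columns collected in table_datetime_cols carry millisecond-precision timestamps. The cast can be reproduced with plain pandas/pyarrow (a minimal sketch; the column names are placeholders):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1, 2], "ts": pd.to_datetime(["2024-10-01", "2024-10-02"])})
pa_table = pa.Table.from_pandas(df, preserve_index=False)

table_datetime_cols = {"ts"}  # columns that should become timestamp[ms]
col_names = pa_table.schema.names
col_datas = []
for idx, col_name in enumerate(col_names):
    col = pa_table.column(idx)
    # Cast flagged columns to millisecond timestamps, keep the rest unchanged.
    col_datas.append(col.cast(pa.timestamp("ms")) if col_name in table_datetime_cols else col)
pa_table = pa.Table.from_arrays(col_datas, names=col_names)
print(pa_table.schema)  # "ts" is now timestamp[ms]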
maxframe/io/odpsio/schema.py CHANGED

@@ -16,6 +16,7 @@ import string
 from collections import defaultdict
 from typing import Any, Dict, Tuple
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 from odps import types as odps_types
@@ -39,6 +40,7 @@ _arrow_to_odps_types = {
     pa.float64(): odps_types.double,
     pa.date32(): odps_types.date,
     pa.timestamp("ms"): odps_types.datetime,
+    pa.timestamp("us"): odps_types.timestamp,
     pa.timestamp("ns"): odps_types.timestamp,
 }
 
@@ -54,7 +56,9 @@ _odps_type_to_arrow = {
     odps_types.double: pa.float64(),
     odps_types.date: pa.date32(),
     odps_types.datetime: pa.timestamp("ms"),
+    odps_types.json: pa.string(),
     odps_types.timestamp: pa.timestamp("ns"),
+    odps_types.timestamp_ntz: pa.timestamp("ns"),
 }
 
 
@@ -166,7 +170,7 @@ def odps_schema_to_pandas_dtypes(
     return arrow_schema.empty_table().to_pandas().dtypes
 
 
-def
+def is_scalar_object(df_obj: Any) -> bool:
     return (
         isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
     ) or pd_types.is_scalar(df_obj)
@@ -187,7 +191,7 @@ def pandas_to_odps_schema(
     from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
-    if
+    if is_scalar_object(df_obj):
         empty_index = None
     elif hasattr(df_obj, "index_value"):
         empty_index = df_obj.index_value.to_pandas()[:0]
@@ -203,20 +207,35 @@ def pandas_to_odps_schema(
     else:
         empty_columns = None
 
+    ms_cols = None
     if isinstance(df_obj, (md.DataFrame, pd.DataFrame)):
         empty_df_obj = pd.DataFrame(
             [], columns=empty_columns, index=empty_index
         ).astype(df_obj.dtypes)
+        ms_cols = [
+            col for col, dt in df_obj.dtypes.items() if dt == np.dtype("datetime64[ms]")
+        ]
     elif isinstance(df_obj, (md.Series, pd.Series)):
         empty_df_obj = pd.Series([], name=df_obj.name, index=empty_index).astype(
             df_obj.dtype
         )
+        ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
    elif isinstance(df_obj, (md.Index, pd.Index)):
         empty_df_obj = empty_index
+        if isinstance(empty_index, pd.MultiIndex):
+            ms_cols = [
+                idx
+                for idx, dt in enumerate(empty_index.dtypes.values)
+                if dt == np.dtype("datetime64[ms]")
+            ]
+        else:
+            ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     else:
         empty_df_obj = df_obj
 
-    arrow_data, table_meta = pandas_to_arrow(
+    arrow_data, table_meta = pandas_to_arrow(
+        empty_df_obj, ignore_index=ignore_index, ms_cols=ms_cols
+    )
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -289,7 +308,7 @@ def build_dataframe_table_meta(
         obj_type = OutputType.series
     elif isinstance(df_obj, (md.Index, pd.Index)):
         obj_type = OutputType.index
-    elif
+    elif is_scalar_object(df_obj):
         obj_type = OutputType.scalar
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
@@ -344,10 +363,11 @@ def build_dataframe_table_meta(
     else:
         pd_index_val = index_obj
 
-
-
-
-
+    level_dtypes = [
+        pd_index_val.get_level_values(level).dtype
+        for level in range(pd_index_val.nlevels)
+    ]
+    index_dtypes = pd.Series(level_dtypes, index=pd_index_val.names)
 
     if ignore_index and obj_type != OutputType.index:
         table_index_column_names = []
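pandas_to_odps_schema now records which input columns already use millisecond datetimes by comparing their dtypes against numpy's datetime64[ms]; those columns end up mapped to ODPS datetime rather than timestamp (see the pa.timestamp("ms") entry in _arrow_to_odps_types above). A minimal sketch of the detection step, assuming pandas 2.0+ since only it preserves non-nanosecond datetime dtypes:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"col1": [1, 3], "col3": ["2024-10-01 11:23:12", "2024-10-02 22:55:13"]}
).astype({"col3": "datetime64[ms]"})

# Collect the columns whose dtype is already millisecond-precision datetime.
ms_cols = [col for col, dt in df.dtypes.items() if dt == np.dtype("datetime64[ms]")]
print(ms_cols)  # ['col3']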
maxframe/io/odpsio/tableio.py CHANGED

@@ -20,15 +20,14 @@ from typing import Dict, List, Optional, Union
 
 import pyarrow as pa
 from odps import ODPS
-from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.config import option_context as pyodps_option_context
 from odps.tunnel import TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
+from odps.utils import call_with_retry
 
 try:
     import pyarrow.compute as pac
@@ -37,26 +36,18 @@ except ImportError:
 
 from ...config import options
 from ...env import ODPS_STORAGE_API_ENDPOINT
-from ...
+from ...utils import sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
-_need_convert_timezone = Version(pyodps_version) < Version("0.11.7")
-
-
-@contextmanager
-def _sync_pyodps_timezone():
-    with pyodps_option_context() as cfg:
-        cfg.local_timezone = options.local_timezone
-        yield
 
 
 class ODPSTableIO(ABC):
     def __new__(cls, odps: ODPS):
         if cls is ODPSTableIO:
-            if options.use_common_table:
+            if options.use_common_table or ODPS_STORAGE_API_ENDPOINT in os.environ:
                 return HaloTableIO(odps)
             else:
                 return TunnelTableIO(odps)
@@ -138,7 +129,12 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id = -1
         self._reader_start_pos = 0
 
-        if partitions is None
+        if partitions is None:
+            if not self._table.table_schema.partitions:
+                self._partitions = [None]
+            else:
+                self._partitions = [str(pt) for pt in self._table.partitions]
+        elif isinstance(partitions, str):
             self._partitions = [partitions]
         else:
             self._partitions = partitions
@@ -166,12 +162,14 @@ class TunnelMultiPartitionReader:
             self._cur_partition_id += 1
 
             part_str = self._partitions[self._cur_partition_id]
-
+            req_columns = self._schema.names
+            with sync_pyodps_options():
                 self._cur_reader = self._table.open_reader(
                     part_str,
-                    columns=
+                    columns=req_columns,
                     arrow=True,
                     download_id=self._partition_to_download_ids.get(part_str),
+                    append_partitions=True,
                 )
             if self._cur_reader.count + self._reader_start_pos > self._start:
                 start = self._start - self._reader_start_pos
@@ -180,43 +178,15 @@ class TunnelMultiPartitionReader:
                 else:
                     count = min(self._count, self._cur_reader.count - start)
 
-                with
+                with sync_pyodps_options():
                     self._reader_iter = self._cur_reader.read(start, count)
                 break
            self._reader_start_pos += self._cur_reader.count
         else:
             self._cur_reader = None
 
-    def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
-        pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
-
-        names = list(batch.schema.names)
-        arrays = []
-        for idx in range(batch.num_columns):
-            col = batch.column(idx)
-            if _need_convert_timezone and isinstance(col.type, pa.TimestampType):
-                if col.type.tz is not None:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, col.type.tz
-                    )
-                    arrays.append(col.cast(target_type))
-                else:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, options.local_timezone
-                    )
-                    pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
-                    arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
-            else:
-                arrays.append(batch.column(idx))
-
-        for part_col in self._partition_cols or []:
-            names.append(part_col)
-            col_type = self._schema.field_by_name(part_col).type
-            arrays.append(pa.array([pt_spec[part_col]] * batch.num_rows).cast(col_type))
-        return pa.RecordBatch.from_arrays(arrays, names)
-
     def read(self):
-        with
+        with sync_pyodps_options():
             if self._cur_reader is None:
                 self._open_next_reader()
                 if self._cur_reader is None:
@@ -227,7 +197,7 @@ class TunnelMultiPartitionReader:
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-                    return
+                    return batch
             except StopIteration:
                 self._open_next_reader()
         return None
@@ -244,34 +214,6 @@ class TunnelMultiPartitionReader:
         return pa.Table.from_batches(batches)
 
 
-class TunnelWrappedWriter:
-    def __init__(self, nested_writer):
-        self._writer = nested_writer
-
-    def write(self, data: Union[pa.RecordBatch, pa.Table]):
-        if not any(isinstance(tp, pa.TimestampType) for tp in data.schema.types):
-            self._writer.write(data)
-            return
-        pa_type = type(data)
-        arrays = []
-        for idx in range(data.num_columns):
-            name = data.schema.names[idx]
-            col = data.column(idx)
-            if not isinstance(col.type, pa.TimestampType):
-                arrays.append(col)
-                continue
-            if self._writer.schema[name].type == timestamp_ntz:
-                col = HaloTableArrowWriter._localize_timezone(col, "UTC")
-            else:
-                col = HaloTableArrowWriter._localize_timezone(col)
-            arrays.append(col)
-        data = pa_type.from_arrays(arrays, names=data.schema.names)
-        self._writer.write(data)
-
-    def __getattr__(self, item):
-        return getattr(self._writer, item)
-
-
 class TunnelTableIO(ODPSTableIO):
     @contextmanager
     def open_reader(
@@ -285,7 +227,9 @@ class TunnelTableIO(ODPSTableIO):
         reverse_range: bool = False,
         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
     ):
-
+        with sync_pyodps_options():
+            table = self._odps.get_table(full_table_name)
+
         if partition_columns is True:
             partition_columns = [c.name for c in table.table_schema.partitions]
 
@@ -296,21 +240,22 @@ class TunnelTableIO(ODPSTableIO):
            or (stop is not None and stop < 0)
            or (reverse_range and start is None)
        ):
-
-
-
-
-
-
-
-            part_to_down_id = dict()
-            total_records = 0
-            for part in parts:
-                down_session = tunnel.create_download_session(
-                    table, async_mode=True, partition_spec=part
+            with sync_pyodps_options():
+                table = self._odps.get_table(full_table_name)
+                tunnel = TableTunnel(self._odps)
+                parts = (
+                    [partitions]
+                    if partitions is None or isinstance(partitions, str)
+                    else partitions
                 )
-            part_to_down_id
-            total_records
+                part_to_down_id = dict()
+                total_records = 0
+                for part in parts:
+                    down_session = tunnel.create_download_session(
+                        table, async_mode=True, partition_spec=part
+                    )
+                    part_to_down_id[part] = down_session.id
+                    total_records += down_session.count
 
         count = None
         if start is not None or stop is not None:
@@ -347,20 +292,14 @@ class TunnelTableIO(ODPSTableIO):
         overwrite: bool = True,
     ):
         table = self._odps.get_table(full_table_name)
-        with
+        with sync_pyodps_options():
             with table.open_writer(
                 partition=partition,
                 arrow=True,
                 create_partition=partition is not None,
                 overwrite=overwrite,
             ) as writer:
-
-                # related arrow timestamp bug when provided schema and
-                # table schema is identical.
-                if _need_convert_timezone:
-                    yield TunnelWrappedWriter(writer)
-                else:
-                    yield writer
+                yield writer
 
 
 class HaloTableArrowReader:
@@ -416,7 +355,7 @@ class HaloTableArrowReader:
             split_index=self._cur_split_id + 1,
             **read_rows_kw,
         )
-        self._cur_reader = self._client.read_rows_arrow
+        self._cur_reader = call_with_retry(self._client.read_rows_arrow, req)
         self._cur_split_id += 1
 
     def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
@@ -488,8 +427,9 @@ class HaloTableArrowWriter:
     def open(self):
         from odps.apis.storage_api import WriteRowsRequest
 
-        self._writer =
-
+        self._writer = call_with_retry(
+            self._client.write_rows_arrow,
+            WriteRowsRequest(self._write_info.session_id),
        )
 
    @classmethod
@@ -560,28 +500,6 @@ class HaloTableIO(ODPSTableIO):
             for pt in partitions
         ]
 
-    def get_table_record_count(
-        self, full_table_name: str, partitions: PartitionsType = None
-    ):
-        from odps.apis.storage_api import SplitOptions, TableBatchScanRequest
-
-        table = self._odps.get_table(full_table_name)
-        client = StorageApiArrowClient(
-            self._odps, table, rest_endpoint=self._storage_api_endpoint
-        )
-
-        split_option = SplitOptions.SplitMode.SIZE
-
-        scan_kw = {
-            "required_partitions": self._convert_partitions(partitions),
-            "split_options": SplitOptions.get_default_options(split_option),
-        }
-
-        # todo add more options for partition column handling
-        req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session(req)
-        return resp.record_count
-
     @contextmanager
     def open_reader(
         self,
@@ -596,8 +514,8 @@ class HaloTableIO(ODPSTableIO):
     ):
         from odps.apis.storage_api import (
             SessionRequest,
+            SessionStatus,
             SplitOptions,
-            Status,
             TableBatchScanRequest,
         )
 
@@ -625,16 +543,16 @@ class HaloTableIO(ODPSTableIO):
 
         # todo add more options for partition column handling
         req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session
+        resp = call_with_retry(client.create_read_session, req)
 
         session_id = resp.session_id
-        status = resp.
-        while status ==
-            resp = client.get_read_session
-            status = resp.
+        status = resp.session_status
+        while status == SessionStatus.INIT:
+            resp = call_with_retry(client.get_read_session, SessionRequest(session_id))
+            status = resp.session_status
             time.sleep(1.0)
 
-        assert status ==
+        assert status == SessionStatus.NORMAL
 
         count = None
         if start is not None or stop is not None:
@@ -685,7 +603,7 @@ class HaloTableIO(ODPSTableIO):
         part_strs = self._convert_partitions(partition)
         part_str = part_strs[0] if part_strs else None
         req = TableBatchWriteRequest(partition_spec=part_str, overwrite=overwrite)
-        resp = client.create_write_session
+        resp = call_with_retry(client.create_write_session, req)
 
         session_id = resp.session_id
         writer = HaloTableArrowWriter(client, resp, table.table_schema)
@@ -694,9 +612,13 @@ class HaloTableIO(ODPSTableIO):
         yield writer
 
         commit_msg = writer.close()
-        resp =
-
+        resp = call_with_retry(
+            client.commit_write_session,
+            SessionRequest(session_id=session_id),
+            [commit_msg],
        )
        while resp.session_status == SessionStatus.COMMITTING:
-            resp =
+            resp = call_with_retry(
+                client.get_write_session, SessionRequest(session_id=session_id)
+            )
        assert resp.session_status == SessionStatus.COMMITTED
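Storage API and tunnel calls in tableio.py are now routed through pyodps' call_with_retry helper instead of being invoked directly. A rough stand-in for that calling pattern, for illustration only (this sketch is not pyodps' implementation; the retry count and delay are arbitrary):

import time

def retry_call(func, *args, retries=3, delay=1.0, **kwargs):
    # Call func(*args, **kwargs), retrying a few times before giving up.
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# Usage mirrors the calls above, e.g.:
#   resp = retry_call(client.create_read_session, req)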
maxframe/io/odpsio/tests/test_schema.py CHANGED

@@ -21,6 +21,7 @@ from odps import types as odps_types
 from .... import dataframe as md
 from .... import tensor as mt
 from ....core import OutputType
+from ....utils import pd_release_version
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -270,10 +271,6 @@ def test_odps_arrow_schema_conversion():
 
     with pytest.raises(TypeError):
         arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
-    with pytest.raises(TypeError):
-        odps_schema_to_arrow_schema(
-            odps_types.OdpsSchema([odps_types.Column("col1", "json")])
-        )
 
 
 def test_build_column_name():
@@ -296,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime