maxframe 1.0.0rc4__cp38-cp38-win_amd64.whl → 1.1.1__cp38-cp38-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/_utils.cp38-win_amd64.pyd +0 -0
- maxframe/config/__init__.py +1 -1
- maxframe/config/config.py +26 -0
- maxframe/config/tests/test_config.py +20 -1
- maxframe/conftest.py +17 -4
- maxframe/core/graph/core.cp38-win_amd64.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +65 -35
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/__init__.py +1 -1
- maxframe/io/odpsio/arrow.py +51 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +80 -124
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/mmh3.cp38-win_amd64.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cp38-win_amd64.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +70 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +12 -2
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/merge/vstack.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +42 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +23 -8
- maxframe_client/session/odps.py +40 -11
- maxframe_client/session/task.py +6 -25
- maxframe_client/session/tests/test_task.py +35 -6
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/tests/test_utils.py
ADDED
@@ -0,0 +1,60 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pandas as pd
+import pytest
+
+from ...udf import MarkedFunction, with_python_requirements, with_resources
+from ..utils import pack_func_args
+
+
+@pytest.fixture
+def df1():
+    return pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
+
+
+def test_pack_function(df1):
+    # pack normal function
+    @with_resources("a.zip")
+    def keep(df):
+        return df
+
+    f = pack_func_args(df1, keep)
+    assert f(df1).equals(df1)
+    assert isinstance(f, MarkedFunction)
+    assert f.resources == ["a.zip"]
+
+    # pack with args
+    @with_python_requirements("numpy")
+    def add(a, b):
+        return a + b
+
+    f = pack_func_args(df1, add, 1)
+    assert f(df1).equals(df1 + 1)
+    assert isinstance(f, MarkedFunction)
+    assert f.pythonpacks[0].requirements == ("numpy",)
+
+    f = pack_func_args(df1, np.sum)
+    assert f(df1).equals(np.sum(df1))
+
+    @with_resources("a.txt")
+    @with_python_requirements("pandas")
+    def times_add(df, param, times):
+        return df * times + param
+
+    f = pack_func_args(df1, times_add, 5, 6)
+    assert f(df1).equals(df1 * 6 + 5)
+    assert isinstance(f, MarkedFunction)
+    assert f.resources == ["a.txt"]
+    assert f.pythonpacks[0].requirements == ("pandas",)
maxframe/dataframe/utils.py
CHANGED
@@ -20,7 +20,7 @@ import operator
 import sys
 from contextlib import contextmanager
 from numbers import Integral
-from typing import Any, Callable, List
+from typing import TYPE_CHECKING, Any, Callable, List

 import numpy as np
 import pandas as pd
@@ -30,6 +30,7 @@ from pandas.core.dtypes.inference import is_dict_like, is_list_like

 from ..core import Entity, ExecutableTuple
 from ..lib.mmh3 import hash as mmh_hash
+from ..udf import MarkedFunction
 from ..utils import (
     ModulePlaceholder,
     is_full_slice,
@@ -44,6 +45,9 @@ try:
 except ImportError:  # pragma: no cover
     pa = ModulePlaceholder("pyarrow")

+if TYPE_CHECKING:
+    from .operators import DataFrameOperator
+
 cudf = lazy_import("cudf", rename="cudf")
 vineyard = lazy_import("vineyard")
 try:
@@ -263,12 +267,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
     return IndexValue(_index_value=_serialize_index(index_value))


-def gen_unknown_index_value(index_value, *args):
+def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
+    """
+    Generate a new index value of the same kind as the given index_value and args, but without any data.
+
+    Parameters
+    ----------
+    index_value
+        Given index value.
+    args
+        Arguments for parse_index.
+    normalize_range_index
+        Whether to normalize a range index into a normal index.
+
+    Returns
+    -------
+    Newly created index value.
+    """
     pd_index = index_value.to_pandas()
-    if isinstance(pd_index, pd.RangeIndex):
-        return parse_index(pd.RangeIndex(-1), *args)
+    if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
+        return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
     elif not isinstance(pd_index, pd.MultiIndex):
-        return parse_index(
+        return parse_index(
+            pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
+        )
     else:
         i = pd.MultiIndex.from_arrays(
             [c[:0] for c in pd_index.levels], names=pd_index.names
@@ -1160,7 +1182,65 @@ def patch_sa_engine_execute():
     Engine.execute = execute


-def pack_func_args(df, funcs, *args, **kwargs) -> Any:
+def bind_func_args_from_pos(func, args_bind_position, *bound_args, **bound_kwargs):
+    """
+    Create a new function with arguments bound from a specified position.
+
+    Parameters
+    ----------
+    func : callable
+        Target function to be wrapped.
+    args_bind_position : int
+        Position to start binding arguments (0-based).
+        e.g., n=0 binds from the first arg, n=1 binds from the second arg.
+    *bound_args : tuple
+        Arguments to be bound from position n.
+    **bound_kwargs : dict
+        Keyword arguments to be bound.
+
+    Returns
+    -------
+    callable
+        Wrapped function with bound arguments.
+
+    Examples
+    --------
+    >>> def func(x, y, z=0):
+    ...     return x * y + z
+    >>> f = bind_func_args_from_pos(func, 0, 10)  # bind 10 at the first position
+    >>> f(5)  # equals func(10, 5)
+    50
+
+    Raises
+    ------
+    TypeError
+        If func is not callable or n is not an integer.
+    ValueError
+        If n is negative or exceeds the number of parameters.
+    """

+    @functools.wraps(func)
+    def wrapper(*runtime_args, **runtime_kwargs):
+        try:
+            # Combine arguments
+            all_args = (
+                runtime_args[:args_bind_position]
+                + bound_args
+                + runtime_args[args_bind_position:]
+            )
+            all_kwargs = {**bound_kwargs, **runtime_kwargs}
+
+            return func(*all_args, **all_kwargs)
+        except Exception as e:
+            # Enhance error message with context
+            raise type(e)(
+                f"Error calling {func.__name__} with bound arguments: {str(e)}"
+            ) from e
+
+    return wrapper
+
+
+def pack_func_args(df, funcs, *args, args_bind_position=1, **kwargs) -> Any:
     """
     Pack the funcs with args and kwargs to avoid the ambiguity between other
     positional and keyword arguments. It will process the funcs by the following rule:
@@ -1189,6 +1269,9 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
         The DataFrame or Series object to test the function.
     funcs : function, str, list-like or dict-like
         Function to pack. It should have the same type as DataFrame.transform().
+    args_bind_position : int
+        Position to start binding arguments (0-based).
+        e.g., n=0 binds from the first arg, n=1 binds from the second arg.
     *args :
         The positional arguments to func. If funcs contains many functions, each one
        should be able to accept *args.
@@ -1219,8 +1302,19 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:

     f = get_callable_by_name(df, funcs) if isinstance(funcs, str) else funcs

+    from ..udf import MarkedFunction
+
+    if isinstance(f, MarkedFunction):
+        # for a marked function, pack the inner function and keep the marks
+        packed_func = f.copy()
+        packed_func.func = bind_func_args_from_pos(
+            f.func, args_bind_position, *args, **kwargs
+        )
+    else:
+        packed_func = bind_func_args_from_pos(f, args_bind_position, *args, **kwargs)
+
     # Callable
-    return
+    return packed_func


 def get_callable_by_name(df: Any, func_name: str) -> Callable:
@@ -1262,3 +1356,12 @@ def get_callable_by_name(df: Any, func_name: str) -> Callable:
     raise AttributeError(
         f"'{func_name}' is not a valid function for '{type(df).__name__}' object"
     )
+
+
+def copy_func_scheduling_hints(func, op: "DataFrameOperator") -> None:
+    if not isinstance(func, MarkedFunction):
+        return
+    if func.expect_engine:
+        op.expect_engine = func.expect_engine
+    if func.expect_resources:
+        op.expect_resources = func.expect_resources
maxframe/dataframe/window/expanding.py
CHANGED
@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
 from .core import Window

 _window_has_method = pd_release_version >= (1, 3, 0)
+_window_has_center = pd_release_version < (2, 0, 0)


 class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
     def params(self):
         p = OrderedDict()

+        args = ["min_periods", "center", "axis", "method"]
         if not _window_has_method:  # pragma: no cover
-            args = [
-
-            args = [
+            args = [a for a in args if a != "method"]
+        if not _window_has_center:
+            args = [a for a in args if a != "center"]

         for k in args:
             p[k] = getattr(self, k)
maxframe/dataframe/window/tests/test_expanding.py
CHANGED
@@ -29,8 +29,8 @@ def test_expanding():
     with pytest.raises(NotImplementedError):
         _ = df2.expanding(3, axis=1)

-    r = df2.expanding(3
-    expected = df.expanding(3
+    r = df2.expanding(3)
+    expected = df.expanding(3)
     assert repr(r) == repr(expected)

     assert "b" in dir(r)
maxframe/io/objects/tests/test_object_io.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pytest
 from odps import ODPS
@@ -48,15 +49,33 @@ def create_volume(request, oss_config):
         oss_bucket_name,
         oss_endpoint,
     ) = oss_config.oss_config
-
-
-
-
-
-
-
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn

     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )

     try:
         yield test_vol_name
@@ -75,8 +94,12 @@ def test_simple_object_io(create_volume):

     odps_entry = ODPS.from_environments()

-    reader = ODPSVolumeReader(
-
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )

     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
@@ -89,8 +112,12 @@ def test_tensor_object_io(create_volume):

     odps_entry = ODPS.from_environments()

-    reader = ODPSVolumeReader(
-
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )

     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
maxframe/io/odpsio/__init__.py
CHANGED
maxframe/io/odpsio/arrow.py
CHANGED
@@ -14,10 +14,12 @@

 from typing import Any, Tuple, Union

+import numpy as np
 import pandas as pd
 import pyarrow as pa

 from ...core import OutputType
+from ...lib.version import parse as parse_version
 from ...protocol import DataFrameTableMeta
 from ...tensor.core import TENSOR_TYPE
 from ...typing_ import ArrowTableType, PandasObjectTypes
@@ -69,13 +71,24 @@ def arrow_to_pandas(


 def pandas_to_arrow(
-    df: Any, nthreads=1, ignore_index=False
+    df: Any, nthreads=1, ignore_index=False, ms_cols=None
 ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
     table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
+    table_datetime_cols = None
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
+            if ms_cols:
+                table_datetime_cols = {"_data"}
+        elif ms_cols:
+            ms_col_set = set(ms_cols)
+            table_datetime_cols = set()
+            for pd_col, table_col in zip(
+                table_meta.pd_column_dtypes.keys(), table_meta.table_column_names
+            ):
+                if pd_col in ms_col_set:
+                    table_datetime_cols.add(table_col)
         df.columns = pd.Index(table_meta.table_column_names)
         if not ignore_index:
             df = df.rename_axis(table_meta.table_index_column_names).reset_index()
@@ -83,6 +96,12 @@ def pandas_to_arrow(
             df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
+        table_datetime_cols = set()
+        if ms_cols:
+            if isinstance(df, pd.MultiIndex):
+                table_datetime_cols = {f"_idx_{idx}" for idx in ms_cols}
+            else:
+                table_datetime_cols = {"_idx_0"}
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
     elif table_meta.type == OutputType.scalar:
         names = ["_idx_0"]
@@ -92,4 +111,34 @@ def pandas_to_arrow(
         df = pd.DataFrame([[df]], columns=names)
     else:  # this could never happen # pragma: no cover
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
-
+
+    try:
+        pa_table = pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False)
+    except pa.ArrowTypeError as ex:  # pragma: no cover
+        late_np_version = parse_version(np.__version__) >= parse_version("1.20")
+        early_pa_version = parse_version(pa.__version__) <= parse_version("4.0")
+        if (
+            late_np_version
+            and early_pa_version
+            and "Did not pass numpy.dtype object" in str(ex)
+        ):
+            raise TypeError(
+                "Potential dependency conflict. Try update to pyarrow>4.0 "
+                "or downgrade to numpy<1.20. Details can be seen at "
+                "https://github.com/numpy/numpy/issues/17913. "
+                f"Raw error message: {ex!r}"
+            ).with_traceback(ex.__traceback__) from None
+        else:
+            raise
+
+    if table_datetime_cols:
+        col_names = pa_table.schema.names
+        col_datas = []
+        for idx, col_name in enumerate(pa_table.schema.names):
+            if col_name not in table_datetime_cols:
+                col_datas.append(pa_table.column(idx))
+                continue
+            col_data = pa_table.column(idx).cast(pa.timestamp("ms"))
+            col_datas.append(col_data)
+        pa_table = pa.Table.from_arrays(col_datas, names=col_names)
+    return pa_table, table_meta
maxframe/io/odpsio/schema.py
CHANGED
@@ -16,6 +16,7 @@ import string
 from collections import defaultdict
 from typing import Any, Dict, Tuple

+import numpy as np
 import pandas as pd
 import pyarrow as pa
 from odps import types as odps_types
@@ -39,6 +40,7 @@ _arrow_to_odps_types = {
     pa.float64(): odps_types.double,
     pa.date32(): odps_types.date,
     pa.timestamp("ms"): odps_types.datetime,
+    pa.timestamp("us"): odps_types.timestamp,
     pa.timestamp("ns"): odps_types.timestamp,
 }

@@ -205,20 +207,35 @@ def pandas_to_odps_schema(
     else:
         empty_columns = None

+    ms_cols = None
     if isinstance(df_obj, (md.DataFrame, pd.DataFrame)):
         empty_df_obj = pd.DataFrame(
             [], columns=empty_columns, index=empty_index
         ).astype(df_obj.dtypes)
+        ms_cols = [
+            col for col, dt in df_obj.dtypes.items() if dt == np.dtype("datetime64[ms]")
+        ]
     elif isinstance(df_obj, (md.Series, pd.Series)):
         empty_df_obj = pd.Series([], name=df_obj.name, index=empty_index).astype(
             df_obj.dtype
         )
+        ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     elif isinstance(df_obj, (md.Index, pd.Index)):
         empty_df_obj = empty_index
+        if isinstance(empty_index, pd.MultiIndex):
+            ms_cols = [
+                idx
+                for idx, dt in enumerate(empty_index.dtypes.values)
+                if dt == np.dtype("datetime64[ms]")
+            ]
+        else:
+            ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     else:
         empty_df_obj = df_obj

-    arrow_data, table_meta = pandas_to_arrow(
+    arrow_data, table_meta = pandas_to_arrow(
+        empty_df_obj, ignore_index=ignore_index, ms_cols=ms_cols
+    )
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -346,10 +363,11 @@ def build_dataframe_table_meta(
     else:
         pd_index_val = index_obj

-
-
-
-
+    level_dtypes = [
+        pd_index_val.get_level_values(level).dtype
+        for level in range(pd_index_val.nlevels)
+    ]
+    index_dtypes = pd.Series(level_dtypes, index=pd_index_val.names)

     if ignore_index and obj_type != OutputType.index:
         table_index_column_names = []