maxframe 1.0.0rc4__cp310-cp310-win_amd64.whl → 1.1.1__cp310-cp310-win_amd64.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe has been flagged as possibly problematic.

Files changed (88)
  1. maxframe/_utils.cp310-win_amd64.pyd +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/tests/test_utils.py
@@ -0,0 +1,60 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pandas as pd
+import pytest
+
+from ...udf import MarkedFunction, with_python_requirements, with_resources
+from ..utils import pack_func_args
+
+
+@pytest.fixture
+def df1():
+    return pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
+
+
+def test_pack_function(df1):
+    # pack normal function
+    @with_resources("a.zip")
+    def keep(df):
+        return df
+
+    f = pack_func_args(df1, keep)
+    assert f(df1).equals(df1)
+    assert isinstance(f, MarkedFunction)
+    assert f.resources == ["a.zip"]
+
+    # pack with args
+    @with_python_requirements("numpy")
+    def add(a, b):
+        return a + b
+
+    f = pack_func_args(df1, add, 1)
+    assert f(df1).equals(df1 + 1)
+    assert isinstance(f, MarkedFunction)
+    assert f.pythonpacks[0].requirements == ("numpy",)
+
+    f = pack_func_args(df1, np.sum)
+    assert f(df1).equals(np.sum(df1))
+
+    @with_resources("a.txt")
+    @with_python_requirements("pandas")
+    def times_add(df, param, times):
+        return df * times + param
+
+    f = pack_func_args(df1, times_add, 5, 6)
+    assert f(df1).equals(df1 * 6 + 5)
+    assert isinstance(f, MarkedFunction)
+    assert f.resources == ["a.txt"]
+    assert f.pythonpacks[0].requirements == ("pandas",)
maxframe/dataframe/utils.py
@@ -20,7 +20,7 @@ import operator
 import sys
 from contextlib import contextmanager
 from numbers import Integral
-from typing import Any, Callable, List
+from typing import TYPE_CHECKING, Any, Callable, List
 
 import numpy as np
 import pandas as pd
@@ -30,6 +30,7 @@ from pandas.core.dtypes.inference import is_dict_like, is_list_like
 
 from ..core import Entity, ExecutableTuple
 from ..lib.mmh3 import hash as mmh_hash
+from ..udf import MarkedFunction
 from ..utils import (
     ModulePlaceholder,
     is_full_slice,
@@ -44,6 +45,9 @@ try:
 except ImportError:  # pragma: no cover
     pa = ModulePlaceholder("pyarrow")
 
+if TYPE_CHECKING:
+    from .operators import DataFrameOperator
+
 cudf = lazy_import("cudf", rename="cudf")
 vineyard = lazy_import("vineyard")
 try:
@@ -263,12 +267,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
     return IndexValue(_index_value=_serialize_index(index_value))
 
 
-def gen_unknown_index_value(index_value, *args):
+def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
+    """
+    Generate a new index value like the given index_value and args,
+    but without any data.
+
+    Parameters
+    ----------
+    index_value
+        Given index value.
+    args
+        Arguments for parse_index.
+    normalize_range_index
+        Whether to normalize a range index into a normal index.
+
+    Returns
+    -------
+    Newly created index value.
+    """
     pd_index = index_value.to_pandas()
-    if isinstance(pd_index, pd.RangeIndex):
-        return parse_index(pd.RangeIndex(-1), *args)
+    if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
+        return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
     elif not isinstance(pd_index, pd.MultiIndex):
-        return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
+        return parse_index(
+            pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
+        )
     else:
         i = pd.MultiIndex.from_arrays(
             [c[:0] for c in pd_index.levels], names=pd_index.names
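
For illustration, a minimal sketch of what the new normalize_range_index flag changes (hypothetical usage, assuming an IndexValue built via parse_index from the same module; not part of the release itself):

import pandas as pd
from maxframe.dataframe.utils import gen_unknown_index_value, parse_index

idx = parse_index(pd.RangeIndex(8, name="seq"))

# Default behavior: a RangeIndex stays a RangeIndex placeholder (RangeIndex(-1)),
# now also preserving the index name.
unknown = gen_unknown_index_value(idx)
assert isinstance(unknown.to_pandas(), pd.RangeIndex)

# With normalize_range_index=True the RangeIndex branch is skipped, so the
# result becomes an empty int64-typed pd.Index carrying the same name.
normalized = gen_unknown_index_value(idx, normalize_range_index=True)
assert not isinstance(normalized.to_pandas(), pd.RangeIndex)
assert normalized.to_pandas().name == "seq"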
@@ -1160,7 +1182,65 @@ def patch_sa_engine_execute():
     Engine.execute = execute
 
 
-def pack_func_args(df, funcs, *args, **kwargs) -> Any:
+def bind_func_args_from_pos(func, args_bind_position, *bound_args, **bound_kwargs):
+    """
+    Create a new function with arguments bound starting from a specified position.
+
+    Parameters
+    ----------
+    func : callable
+        Target function to be wrapped.
+    args_bind_position : int
+        Position to start binding arguments (0-based), e.g. 0 binds from the
+        first argument, 1 binds from the second.
+    *bound_args : tuple
+        Arguments to be bound from args_bind_position.
+    **bound_kwargs : dict
+        Keyword arguments to be bound.
+
+    Returns
+    -------
+    callable
+        Wrapped function with bound arguments.
+
+    Examples
+    --------
+    >>> def func(x, y, z=0):
+    ...     return x * y + z
+    >>> f = bind_func_args_from_pos(func, 1, 10)  # bind 10 at position 1
+    >>> f(5)  # equals func(5, 10)
+    50
+
+    Raises
+    ------
+    TypeError
+        If func is not callable or args_bind_position is not an integer.
+    ValueError
+        If args_bind_position is negative or exceeds the number of parameters.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*runtime_args, **runtime_kwargs):
+        try:
+            # Combine arguments
+            all_args = (
+                runtime_args[:args_bind_position]
+                + bound_args
+                + runtime_args[args_bind_position:]
+            )
+            all_kwargs = {**bound_kwargs, **runtime_kwargs}
+
+            return func(*all_args, **all_kwargs)
+        except Exception as e:
+            # Enhance error message with context
+            raise type(e)(
+                f"Error calling {func.__name__} with bound arguments: {str(e)}"
+            ) from e
+
+    return wrapper
+
+
+def pack_func_args(df, funcs, *args, args_bind_position=1, **kwargs) -> Any:
     """
     Pack the funcs with args and kwargs to avoid the ambiguity between other
     positional and keyword arguments. It will process the funcs by the following rule:
@@ -1189,6 +1269,9 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
         The DataFrame or Series object to test the function.
     funcs : function, str, list-like or dict-like
         Function to pack. It should have the same type as DataFrame.transform().
+    args_bind_position : int
+        Position to start binding arguments (0-based), e.g. 0 binds from the
+        first argument, 1 binds from the second.
     *args :
         The positional arguments to func. If funcs contains many functions, each one
         should be able to accept *args.
@@ -1219,8 +1302,19 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
 
     f = get_callable_by_name(df, funcs) if isinstance(funcs, str) else funcs
 
+    from ..udf import MarkedFunction
+
+    if isinstance(f, MarkedFunction):
+        # for a marked function, pack the inner function and reset it as a marked function
+        packed_func = f.copy()
+        packed_func.func = bind_func_args_from_pos(
+            f.func, args_bind_position, *args, **kwargs
+        )
+    else:
+        packed_func = bind_func_args_from_pos(f, args_bind_position, *args, **kwargs)
+
     # Callable
-    return functools.partial(f, *args, **kwargs)
+    return packed_func
 
 
 def get_callable_by_name(df: Any, func_name: str) -> Callable:
@@ -1262,3 +1356,12 @@ def get_callable_by_name(df: Any, func_name: str) -> Callable:
     raise AttributeError(
         f"'{func_name}' is not a valid function for '{type(df).__name__}' object"
     )
+
+
+def copy_func_scheduling_hints(func, op: "DataFrameOperator") -> None:
+    if not isinstance(func, MarkedFunction):
+        return
+    if func.expect_engine:
+        op.expect_engine = func.expect_engine
+    if func.expect_resources:
+        op.expect_resources = func.expect_resources
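
A short usage sketch of the repacking behavior above (names mirror the new test_utils.py earlier in this diff; this snippet is illustrative, not part of the release):

import pandas as pd
from maxframe.dataframe.utils import pack_func_args

df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])

def times_add(frame, param, times):
    return frame * times + param

# Extra args are bound starting at position 1 by default, leaving slot 0
# free for the object supplied at call time: f(df) -> times_add(df, 5, 6).
f = pack_func_args(df, times_add, 5, 6)
assert f(df).equals(df * 6 + 5)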
maxframe/dataframe/window/expanding.py
@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
 from .core import Window
 
 _window_has_method = pd_release_version >= (1, 3, 0)
+_window_has_center = pd_release_version < (2, 0, 0)
 
 
 class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
     def params(self):
         p = OrderedDict()
 
+        args = ["min_periods", "center", "axis", "method"]
         if not _window_has_method:  # pragma: no cover
-            args = ["min_periods", "center", "axis"]
-        else:
-            args = ["min_periods", "center", "axis", "method"]
+            args = [a for a in args if a != "method"]
+        if not _window_has_center:
+            args = [a for a in args if a != "center"]
 
         for k in args:
             p[k] = getattr(self, k)
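
The version gating above reduces to filtering one candidate list by pandas release; a standalone sketch (not the class code):

import pandas as pd

pd_release_version = tuple(int(p) for p in pd.__version__.split(".")[:2])

args = ["min_periods", "center", "axis", "method"]
if pd_release_version < (1, 3):   # "method" only exists from pandas 1.3
    args.remove("method")
if pd_release_version >= (2, 0):  # expanding "center" was removed in pandas 2.0
    args.remove("center")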
maxframe/dataframe/window/tests/test_expanding.py
@@ -29,8 +29,8 @@ def test_expanding():
     with pytest.raises(NotImplementedError):
         _ = df2.expanding(3, axis=1)
 
-    r = df2.expanding(3, center=False)
-    expected = df.expanding(3, center=False)
+    r = df2.expanding(3)
+    expected = df.expanding(3)
     assert repr(r) == repr(expected)
 
     assert "b" in dir(r)
maxframe/io/objects/tests/test_object_io.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pytest
 from odps import ODPS
@@ -48,15 +49,33 @@ def create_volume(request, oss_config):
         oss_bucket_name,
         oss_endpoint,
     ) = oss_config.oss_config
-    test_location = "oss://%s:%s@%s/%s/%s" % (
-        oss_access_id,
-        oss_secret_access_key,
-        oss_endpoint,
-        oss_bucket_name,
-        oss_test_dir_name,
-    )
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn
+
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(test_vol_name, location=test_location)
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )
 
     try:
         yield test_vol_name
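
The online branch rewrites the OSS endpoint into its in-network ("-internal") form before building the volume location. A sketch of that rewrite with a made-up endpoint:

# Sketch of the endpoint rewrite above; the endpoint value is hypothetical.
oss_endpoint = "oss-cn-hangzhou.aliyuncs.com"

endpoint_parts = oss_endpoint.split(".", 1)
if "-internal" not in endpoint_parts[0]:
    endpoint_parts[0] += "-internal"

assert ".".join(endpoint_parts) == "oss-cn-hangzhou-internal.aliyuncs.com"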
@@ -75,8 +94,12 @@ def test_simple_object_io(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
-    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
 
     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
@@ -89,8 +112,12 @@ def test_tensor_object_io(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
-    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
 
     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
maxframe/io/odpsio/__init__.py
@@ -19,5 +19,5 @@ from .schema import (
     odps_schema_to_pandas_dtypes,
     pandas_to_odps_schema,
 )
-from .tableio import HaloTableIO, ODPSTableIO
+from .tableio import HaloTableIO, ODPSTableIO, TunnelTableIO
 from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
maxframe/io/odpsio/arrow.py
@@ -14,10 +14,12 @@
 
 from typing import Any, Tuple, Union
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 
 from ...core import OutputType
+from ...lib.version import parse as parse_version
 from ...protocol import DataFrameTableMeta
 from ...tensor.core import TENSOR_TYPE
 from ...typing_ import ArrowTableType, PandasObjectTypes
@@ -69,13 +71,24 @@ def arrow_to_pandas(
 
 
 def pandas_to_arrow(
-    df: Any, nthreads=1, ignore_index=False
+    df: Any, nthreads=1, ignore_index=False, ms_cols=None
 ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
     table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
+    table_datetime_cols = None
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
+            if ms_cols:
+                table_datetime_cols = {"_data"}
+        elif ms_cols:
+            ms_col_set = set(ms_cols)
+            table_datetime_cols = set()
+            for pd_col, table_col in zip(
+                table_meta.pd_column_dtypes.keys(), table_meta.table_column_names
+            ):
+                if pd_col in ms_col_set:
+                    table_datetime_cols.add(table_col)
         df.columns = pd.Index(table_meta.table_column_names)
         if not ignore_index:
             df = df.rename_axis(table_meta.table_index_column_names).reset_index()
@@ -83,6 +96,12 @@
         df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
+        table_datetime_cols = set()
+        if ms_cols:
+            if isinstance(df, pd.MultiIndex):
+                table_datetime_cols = {f"_idx_{idx}" for idx in ms_cols}
+            else:
+                table_datetime_cols = {"_idx_0"}
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
     elif table_meta.type == OutputType.scalar:
         names = ["_idx_0"]
@@ -92,4 +111,34 @@
         df = pd.DataFrame([[df]], columns=names)
     else:  # this could never happen  # pragma: no cover
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
-    return pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False), table_meta
+
+    try:
+        pa_table = pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False)
+    except pa.ArrowTypeError as ex:  # pragma: no cover
+        late_np_version = parse_version(np.__version__) >= parse_version("1.20")
+        early_pa_version = parse_version(pa.__version__) <= parse_version("4.0")
+        if (
+            late_np_version
+            and early_pa_version
+            and "Did not pass numpy.dtype object" in str(ex)
+        ):
+            raise TypeError(
+                "Potential dependency conflict. Try updating to pyarrow>4.0 "
+                "or downgrading to numpy<1.20. Details can be seen at "
+                "https://github.com/numpy/numpy/issues/17913. "
+                f"Raw error message: {ex!r}"
+            ).with_traceback(ex.__traceback__) from None
+        else:
+            raise
+
+    if table_datetime_cols:
+        col_names = pa_table.schema.names
+        col_datas = []
+        for idx, col_name in enumerate(pa_table.schema.names):
+            if col_name not in table_datetime_cols:
+                col_datas.append(pa_table.column(idx))
+                continue
+            col_data = pa_table.column(idx).cast(pa.timestamp("ms"))
+            col_datas.append(col_data)
+        pa_table = pa.Table.from_arrays(col_datas, names=col_names)
+    return pa_table, table_meta
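
The net effect of the new ms_cols plumbing is that columns flagged as datetime64[ms] come out of pandas_to_arrow as Arrow timestamp("ms"), which maps to the ODPS datetime type in the type table shown in the schema.py hunk below. A rough sketch, assuming pandas 2.x (where non-nanosecond datetime64 units exist); the exact output column names are decided by build_dataframe_table_meta:

import pandas as pd
import pyarrow as pa
from maxframe.io.odpsio.arrow import pandas_to_arrow

df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01"]).astype("datetime64[ms]")})

# Passing the column name via ms_cols casts it to timestamp("ms")
# in the resulting Arrow table.
table, meta = pandas_to_arrow(df, ignore_index=True, ms_cols=["ts"])
assert pa.timestamp("ms") in [f.type for f in table.schema]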
maxframe/io/odpsio/schema.py
@@ -16,6 +16,7 @@ import string
 from collections import defaultdict
 from typing import Any, Dict, Tuple
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 from odps import types as odps_types
@@ -39,6 +40,7 @@ _arrow_to_odps_types = {
     pa.float64(): odps_types.double,
     pa.date32(): odps_types.date,
     pa.timestamp("ms"): odps_types.datetime,
+    pa.timestamp("us"): odps_types.timestamp,
     pa.timestamp("ns"): odps_types.timestamp,
 }
 
@@ -205,20 +207,35 @@ def pandas_to_odps_schema(
     else:
         empty_columns = None
 
+    ms_cols = None
     if isinstance(df_obj, (md.DataFrame, pd.DataFrame)):
         empty_df_obj = pd.DataFrame(
             [], columns=empty_columns, index=empty_index
         ).astype(df_obj.dtypes)
+        ms_cols = [
+            col for col, dt in df_obj.dtypes.items() if dt == np.dtype("datetime64[ms]")
+        ]
     elif isinstance(df_obj, (md.Series, pd.Series)):
         empty_df_obj = pd.Series([], name=df_obj.name, index=empty_index).astype(
             df_obj.dtype
         )
+        ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     elif isinstance(df_obj, (md.Index, pd.Index)):
         empty_df_obj = empty_index
+        if isinstance(empty_index, pd.MultiIndex):
+            ms_cols = [
+                idx
+                for idx, dt in enumerate(empty_index.dtypes.values)
+                if dt == np.dtype("datetime64[ms]")
+            ]
+        else:
+            ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     else:
         empty_df_obj = df_obj
 
-    arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
+    arrow_data, table_meta = pandas_to_arrow(
+        empty_df_obj, ignore_index=ignore_index, ms_cols=ms_cols
+    )
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -346,10 +363,11 @@ def build_dataframe_table_meta(
     else:
         pd_index_val = index_obj
 
-    if hasattr(pd_index_val, "dtypes"):
-        index_dtypes = pd.Series(pd_index_val.dtypes.values, index=pd_index_val.names)
-    else:
-        index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
+    level_dtypes = [
+        pd_index_val.get_level_values(level).dtype
+        for level in range(pd_index_val.nlevels)
+    ]
+    index_dtypes = pd.Series(level_dtypes, index=pd_index_val.names)
 
     if ignore_index and obj_type != OutputType.index:
         table_index_column_names = []
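
The rewritten index-dtype collection treats flat and multi-level indexes uniformly, since pd.Index has nlevels == 1 and get_level_values(0) returns the index itself, avoiding the old hasattr-based branching. A standalone sketch of the same logic (the helper name is invented for illustration):

import pandas as pd

def index_level_dtypes(pd_index_val):
    # Works for both pd.Index (nlevels == 1) and pd.MultiIndex.
    return pd.Series(
        [pd_index_val.get_level_values(level).dtype for level in range(pd_index_val.nlevels)],
        index=pd_index_val.names,
    )

flat = pd.Index([1, 2], name="a")
multi = pd.MultiIndex.from_arrays([[1], ["x"]], names=["a", "b"])
assert list(index_level_dtypes(flat)) == [flat.dtype]
assert len(index_level_dtypes(multi)) == 2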