maxframe 1.2.0__cp311-cp311-macosx_10_9_universal2.whl → 1.3.0__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (72) hide show
  1. maxframe/_utils.cpython-311-darwin.so +0 -0
  2. maxframe/codegen.py +70 -21
  3. maxframe/config/config.py +6 -0
  4. maxframe/core/accessor.py +1 -0
  5. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  6. maxframe/dataframe/accessors/__init__.py +1 -1
  7. maxframe/dataframe/accessors/dict_/accessor.py +1 -0
  8. maxframe/dataframe/accessors/dict_/length.py +1 -0
  9. maxframe/dataframe/accessors/dict_/setitem.py +1 -0
  10. maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
  11. maxframe/dataframe/accessors/list_/__init__.py +37 -0
  12. maxframe/dataframe/accessors/list_/accessor.py +39 -0
  13. maxframe/dataframe/accessors/list_/getitem.py +135 -0
  14. maxframe/dataframe/accessors/list_/length.py +73 -0
  15. maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
  16. maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
  17. maxframe/dataframe/accessors/plotting/__init__.py +2 -0
  18. maxframe/dataframe/accessors/string_/__init__.py +1 -0
  19. maxframe/dataframe/datasource/read_odps_query.py +1 -1
  20. maxframe/dataframe/datasource/tests/test_datasource.py +4 -0
  21. maxframe/dataframe/datastore/to_odps.py +6 -0
  22. maxframe/dataframe/extensions/accessor.py +1 -0
  23. maxframe/dataframe/extensions/apply_chunk.py +34 -21
  24. maxframe/dataframe/extensions/flatmap.py +8 -1
  25. maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
  26. maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
  27. maxframe/dataframe/merge/concat.py +7 -4
  28. maxframe/dataframe/merge/merge.py +1 -0
  29. maxframe/dataframe/merge/tests/test_merge.py +97 -47
  30. maxframe/dataframe/missing/tests/test_missing.py +1 -0
  31. maxframe/dataframe/tests/test_utils.py +7 -0
  32. maxframe/dataframe/ufunc/ufunc.py +1 -0
  33. maxframe/dataframe/utils.py +3 -0
  34. maxframe/io/odpsio/schema.py +1 -0
  35. maxframe/learn/contrib/__init__.py +2 -4
  36. maxframe/learn/contrib/llm/__init__.py +1 -0
  37. maxframe/learn/contrib/llm/core.py +31 -10
  38. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  39. maxframe/learn/contrib/llm/models/dashscope.py +4 -3
  40. maxframe/learn/contrib/llm/models/managed.py +39 -0
  41. maxframe/learn/contrib/llm/multi_modal.py +1 -0
  42. maxframe/learn/contrib/llm/text.py +252 -8
  43. maxframe/learn/contrib/models.py +77 -0
  44. maxframe/learn/contrib/utils.py +1 -0
  45. maxframe/learn/contrib/xgboost/__init__.py +8 -1
  46. maxframe/learn/contrib/xgboost/classifier.py +15 -4
  47. maxframe/learn/contrib/xgboost/core.py +108 -1
  48. maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
  49. maxframe/learn/contrib/xgboost/predict.py +8 -3
  50. maxframe/learn/contrib/xgboost/regressor.py +15 -1
  51. maxframe/learn/contrib/xgboost/train.py +5 -4
  52. maxframe/lib/dtypes_extension/__init__.py +2 -1
  53. maxframe/lib/dtypes_extension/dtypes.py +17 -42
  54. maxframe/lib/dtypes_extension/tests/test_dtypes.py +11 -31
  55. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  56. maxframe/opcodes.py +19 -0
  57. maxframe/serialization/__init__.py +1 -0
  58. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  59. maxframe/serialization/core.pyx +12 -1
  60. maxframe/serialization/numpy.py +12 -4
  61. maxframe/serialization/serializables/tests/test_serializable.py +13 -2
  62. maxframe/serialization/tests/test_serial.py +2 -0
  63. maxframe/tensor/merge/concatenate.py +1 -0
  64. maxframe/tensor/misc/unique.py +11 -10
  65. maxframe/tensor/reshape/reshape.py +4 -1
  66. maxframe/utils.py +4 -0
  67. {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/METADATA +2 -2
  68. {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/RECORD +72 -64
  69. {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/WHEEL +1 -1
  70. maxframe_client/session/odps.py +3 -0
  71. maxframe_client/session/tests/test_task.py +1 -0
  72. {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,79 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ import pyarrow as pa
18
+ import pytest
19
+
20
+ from ..... import dataframe as md
21
+ from .....lib.dtypes_extension import list_
22
+ from .....utils import ARROW_DTYPE_NOT_SUPPORTED
23
+ from ..getitem import SeriesListGetItemOperator
24
+ from ..length import SeriesListLengthOperator
25
+
26
+ pytestmark = pytest.mark.skipif(
27
+ ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
28
+ )
29
+
30
+
31
+ @pytest.fixture
32
+ def df():
33
+ return md.DataFrame(
34
+ {
35
+ "A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
36
+ "B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
37
+ "C": pd.Series([1], dtype=np.dtype("int64")),
38
+ },
39
+ index=[1],
40
+ )
41
+
42
+
43
+ def test_invalid_dtype(df):
44
+ with pytest.raises(AttributeError):
45
+ df["C"].list.len()
46
+
47
+
48
+ def test_getitem(df):
49
+ s1 = df["A"].list[1]
50
+ assert isinstance(s1, md.Series)
51
+ assert s1.dtype == pd.ArrowDtype(pa.int32())
52
+ assert s1.shape == (1,)
53
+ assert s1.index_value == df.index_value
54
+ op = s1.op
55
+ assert isinstance(op, SeriesListGetItemOperator)
56
+ assert op.query_index == 1
57
+ assert op.ignore_index_error is False
58
+
59
+
60
+ def test_getitem_ignore_index_err(df):
61
+ s1 = df["B"].list.get(1)
62
+ assert isinstance(s1, md.Series)
63
+ assert s1.dtype == pd.ArrowDtype(pa.string())
64
+ assert s1.shape == (1,)
65
+ assert s1.index_value == df.index_value
66
+ op = s1.op
67
+ assert isinstance(op, SeriesListGetItemOperator)
68
+ assert op.query_index == 1
69
+ assert op.ignore_index_error is True
70
+
71
+
72
+ def test_length(df):
73
+ s1 = df["A"].list.len()
74
+ assert isinstance(s1, md.Series)
75
+ assert s1.dtype == pd.ArrowDtype(pa.int64())
76
+ assert s1.shape == (1,)
77
+ assert s1.index_value == df.index_value
78
+ op = s1.op
79
+ assert isinstance(op, SeriesListLengthOperator)
@@ -11,6 +11,8 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
15
+
14
16
  def _install():
15
17
  import pandas as pd
16
18
 
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  from .core import SeriesStringMethod
15
16
 
16
17
 
@@ -262,7 +262,7 @@ class DataFrameReadODPSQuery(
262
262
  column_renames = DictField("column_renames", default=None)
263
263
 
264
264
  def get_columns(self):
265
- return self.columns
265
+ return self.columns or list(self.dtypes.index)
266
266
 
267
267
  def set_pruned_columns(self, columns, *, keep_order=None): # pragma: no cover
268
268
  self.columns = columns
@@ -364,6 +364,7 @@ def test_from_odps_query():
364
364
  index=["col1", "col2", "col3"],
365
365
  ),
366
366
  )
367
+ assert df.op.get_columns() == ["col1", "col2", "col3"]
367
368
 
368
369
  df = read_odps_query(query1, skip_schema=True)
369
370
  assert df.dtypes is None
@@ -373,6 +374,7 @@ def test_from_odps_query():
373
374
  df = read_odps_query(query1, index_col="col1")
374
375
  assert df.op.query == query1
375
376
  assert df.index_value.name == "col1"
377
+ assert df.op.get_columns() == ["col2", "col3"]
376
378
  assert isinstance(df.index_value.value, IndexValue.Index)
377
379
  pd.testing.assert_series_equal(
378
380
  df.dtypes,
@@ -389,6 +391,7 @@ def test_from_odps_query():
389
391
  df = read_odps_query(query2, index_col=["col1", "col2"])
390
392
  assert df.op.query == query2
391
393
  assert df.index_value.names == ["col1", "col2"]
394
+ assert df.op.get_columns() == ["c31", "c32"]
392
395
  assert isinstance(df.index_value.value, IndexValue.MultiIndex)
393
396
  pd.testing.assert_series_equal(
394
397
  df.dtypes,
@@ -405,6 +408,7 @@ def test_from_odps_query():
405
408
  assert df.op.query == query3
406
409
  assert df.op.extra_params.no_split_sql is False
407
410
  assert df.index_value.names == ["c1"]
411
+ assert df.op.get_columns() == ["c32"]
408
412
  pd.testing.assert_series_equal(
409
413
  df.dtypes,
410
414
  pd.Series([np.dtype("float64")], index=["c32"]),
@@ -27,6 +27,7 @@ from ...core import OutputType
27
27
  from ...io.odpsio import build_dataframe_table_meta
28
28
  from ...serialization.serializables import (
29
29
  BoolField,
30
+ DictField,
30
31
  FieldTypes,
31
32
  Int64Field,
32
33
  ListField,
@@ -55,6 +56,7 @@ class DataFrameToODPSTable(DataFrameDataStore):
55
56
  index = BoolField("index", default=True)
56
57
  index_label = ListField("index_label", FieldTypes.string, default=None)
57
58
  lifecycle = Int64Field("lifecycle", default=None)
59
+ table_properties = DictField("table_properties", default=None)
58
60
 
59
61
  def __init__(self, **kw):
60
62
  super().__init__(_output_types=[OutputType.dataframe], **kw)
@@ -84,6 +86,7 @@ def to_odps_table(
84
86
  index: bool = True,
85
87
  index_label: Union[None, str, List[str]] = None,
86
88
  lifecycle: Optional[int] = None,
89
+ table_properties: Optional[dict] = None,
87
90
  ):
88
91
  """
89
92
  Write DataFrame object into a MaxCompute (ODPS) table.
@@ -122,6 +125,8 @@ def to_odps_table(
122
125
  names will be used.
123
126
  lifecycle: Optional[int]
124
127
  Specify lifecycle of the output table.
128
+ table_properties: Optional[dict]
129
+ Specify properties of the output table.
125
130
 
126
131
  Returns
127
132
  -------
@@ -186,5 +191,6 @@ def to_odps_table(
186
191
  index=index,
187
192
  index_label=index_label,
188
193
  lifecycle=lifecycle or options.session.table_lifecycle,
194
+ table_properties=table_properties,
189
195
  )
190
196
  return op(df)
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  from typing import TYPE_CHECKING
15
16
 
16
17
  from ...core import BaseMaxFrameAccessor
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import functools
15
16
  from typing import Any, Callable, Dict, List, Tuple, Union
16
17
 
@@ -19,7 +20,12 @@ import pandas as pd
19
20
 
20
21
  from ... import opcodes
21
22
  from ...core import OutputType
22
- from ...serialization.serializables import FunctionField, Int32Field
23
+ from ...serialization.serializables import (
24
+ DictField,
25
+ FunctionField,
26
+ Int32Field,
27
+ TupleField,
28
+ )
23
29
  from ...utils import quiet_stdio
24
30
  from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
25
31
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -38,7 +44,9 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
38
44
  _op_type_ = opcodes.APPLY_CHUNK
39
45
 
40
46
  func = FunctionField("func")
41
- batch_rows = Int32Field("batch_rows")
47
+ batch_rows = Int32Field("batch_rows", default=None)
48
+ args = TupleField("args", default=None)
49
+ kwargs = DictField("kwargs", default=None)
42
50
 
43
51
  def __init__(self, output_type=None, **kw):
44
52
  if output_type:
@@ -104,12 +112,11 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
104
112
  dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
105
113
  output_type=None,
106
114
  index=None,
107
- args=(),
108
- **kwargs,
109
115
  ):
116
+ args = self.args or ()
117
+ kwargs = self.kwargs or {}
110
118
  # if not dtypes and not skip_infer:
111
- origin_func = self.func
112
- self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
119
+ packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
113
120
 
114
121
  # if skip_infer, directly build a frame
115
122
  if self.output_types and self.output_types[0] == OutputType.df_or_series:
@@ -118,8 +125,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
118
125
  # infer return index and dtypes
119
126
  dtypes, index_value, elementwise = self._infer_batch_func_returns(
120
127
  df_or_series,
121
- origin_func=origin_func,
122
- packed_func=self.func,
128
+ origin_func=self.func,
129
+ packed_func=packed_func,
123
130
  given_output_type=output_type,
124
131
  given_dtypes=dtypes,
125
132
  given_index=index,
@@ -166,6 +173,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
166
173
  given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
167
174
  given_index: Union[pd.Index, IndexValue],
168
175
  given_elementwise: bool = False,
176
+ *args,
177
+ **kwargs,
169
178
  ):
170
179
  inferred_output_type = inferred_dtypes = inferred_index_value = None
171
180
  inferred_is_elementwise = False
@@ -190,7 +199,7 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
190
199
  try:
191
200
  # execute
192
201
  with np.errstate(all="ignore"), quiet_stdio():
193
- infer_result = packed_func(empty_data)
202
+ infer_result = packed_func(empty_data, *args, **kwargs)
194
203
 
195
204
  # if executed successfully, get index and dtypes from returned object
196
205
  if inferred_index_value is None:
@@ -258,7 +267,7 @@ def get_packed_func(df, func, *args, **kwargs) -> Any:
258
267
  def df_apply_chunk(
259
268
  dataframe,
260
269
  func: Union[str, Callable],
261
- batch_rows,
270
+ batch_rows=None,
262
271
  dtypes=None,
263
272
  dtype=None,
264
273
  name=None,
@@ -462,11 +471,11 @@ def df_apply_chunk(
462
471
  if not isinstance(func, Callable):
463
472
  raise TypeError("function must be a callable object")
464
473
 
465
- if not isinstance(batch_rows, int):
466
- raise TypeError("batch_rows must be an integer")
467
-
468
- if batch_rows <= 0:
469
- raise ValueError("batch_rows must be greater than 0")
474
+ if batch_rows is not None:
475
+ if not isinstance(batch_rows, int):
476
+ raise TypeError("batch_rows must be an integer")
477
+ elif batch_rows <= 0:
478
+ raise ValueError("batch_rows must be greater than 0")
470
479
 
471
480
  dtypes = (name, dtype) if dtype is not None else dtypes
472
481
 
@@ -481,15 +490,17 @@ def df_apply_chunk(
481
490
 
482
491
  # bind args and kwargs
483
492
  op = DataFrameApplyChunkOperator(
484
- func=func, batch_rows=batch_rows, output_type=output_type
493
+ func=func,
494
+ batch_rows=batch_rows,
495
+ output_type=output_type,
496
+ args=args,
497
+ kwargs=kwargs,
485
498
  )
486
499
 
487
500
  return op(
488
501
  dataframe,
489
502
  dtypes=dtypes,
490
503
  index=index,
491
- args=args,
492
- **kwargs,
493
504
  )
494
505
 
495
506
 
@@ -720,7 +731,11 @@ def series_apply_chunk(
720
731
  output_type = OutputType.df_or_series
721
732
 
722
733
  op = DataFrameApplyChunkOperator(
723
- func=func, batch_rows=batch_rows, output_type=output_type
734
+ func=func,
735
+ batch_rows=batch_rows,
736
+ output_type=output_type,
737
+ args=args,
738
+ kwargs=kwargs,
724
739
  )
725
740
 
726
741
  dtypes = (name, dtype) if dtype is not None else dtypes
@@ -729,6 +744,4 @@ def series_apply_chunk(
729
744
  dtypes=dtypes,
730
745
  output_type=output_type,
731
746
  index=index,
732
- args=args,
733
- **kwargs,
734
747
  )
@@ -27,7 +27,12 @@ from ...serialization.serializables import (
27
27
  )
28
28
  from ..core import DataFrame
29
29
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
30
- from ..utils import gen_unknown_index_value, make_dtypes, parse_index
30
+ from ..utils import (
31
+ copy_func_scheduling_hints,
32
+ gen_unknown_index_value,
33
+ make_dtypes,
34
+ parse_index,
35
+ )
31
36
 
32
37
 
33
38
  class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
@@ -40,6 +45,8 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
40
45
 
41
46
  def __init__(self, output_types=None, **kw):
42
47
  super().__init__(_output_types=output_types, **kw)
48
+ if hasattr(self, "func"):
49
+ copy_func_scheduling_hints(self.func, self)
43
50
 
44
51
  def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
45
52
  dtypes = make_dtypes(dtypes)
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import numpy as np
15
16
  import pandas as pd
16
17
  import pytest
@@ -102,7 +103,7 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
102
103
  assert result.index_value is df1.index_value
103
104
  assert result.dtypes.equals(df1.dtypes)
104
105
  assert isinstance(result.op.func, MarkedFunction)
105
- assert result.op.func is not process
106
+ assert result.op.func is process
106
107
  assert result.op.func.resources is process.resources
107
108
  assert result.op.func.pythonpacks is process.pythonpacks
108
109
 
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import numpy as np
15
16
  import pandas as pd
16
17
  import pytest
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  from typing import List, Union
15
16
 
16
17
  import pandas as pd
@@ -100,8 +101,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
100
101
  row_length = 0
101
102
  for series in objs:
102
103
  row_length += series.shape[0]
103
- if self.ignore_index: # pragma: no cover
104
- index_value = parse_index(pd.RangeIndex(row_length))
104
+ if self.ignore_index:
105
+ idx_length = 0 if pd.isna(row_length) else row_length
106
+ index_value = parse_index(pd.RangeIndex(idx_length))
105
107
  else:
106
108
  index = self._concat_index(objs)
107
109
  index_value = parse_index(index, objs)
@@ -159,8 +161,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
159
161
  if self.join == "inner":
160
162
  objs = [o[list(emtpy_result.columns)] for o in objs]
161
163
 
162
- if self.ignore_index: # pragma: no cover
163
- index_value = parse_index(pd.RangeIndex(row_length))
164
+ if self.ignore_index:
165
+ idx_length = 0 if pd.isna(row_length) else row_length
166
+ index_value = parse_index(pd.RangeIndex(idx_length))
164
167
  else:
165
168
  index = self._concat_index(objs)
166
169
  index_value = parse_index(index, objs)
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import logging
15
16
  from abc import abstractmethod
16
17
  from collections import namedtuple
@@ -16,10 +16,10 @@ import numpy as np
16
16
  import pandas as pd
17
17
  import pytest
18
18
 
19
+ from .... import dataframe as md
19
20
  from ....tests.utils import assert_mf_index_dtype
20
21
  from ...core import IndexValue
21
- from ...datasource.dataframe import from_pandas
22
- from .. import DataFrameMerge, concat
22
+ from .. import DataFrameMerge
23
23
  from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
24
24
 
25
25
 
@@ -29,8 +29,8 @@ def test_merge():
29
29
  )
30
30
  df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
31
31
 
32
- mdf1 = from_pandas(df1, chunk_size=2)
33
- mdf2 = from_pandas(df2, chunk_size=3)
32
+ mdf1 = md.DataFrame(df1, chunk_size=2)
33
+ mdf2 = md.DataFrame(df2, chunk_size=3)
34
34
 
35
35
  mapjoin = MapJoinHint()
36
36
  dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
@@ -83,8 +83,8 @@ def test_merge_invalid_parameters():
83
83
  )
84
84
  pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
85
85
 
86
- df1 = from_pandas(pdf1, chunk_size=2)
87
- df2 = from_pandas(pdf2, chunk_size=3)
86
+ df1 = md.DataFrame(pdf1, chunk_size=2)
87
+ df2 = md.DataFrame(pdf2, chunk_size=3)
88
88
 
89
89
  with pytest.raises(ValueError):
90
90
  df1.merge(df2, bloom_filter="wrong")
@@ -104,8 +104,8 @@ def test_join():
104
104
  df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1
105
105
  df2 = pd.concat([df2, df2 + 1])
106
106
 
107
- mdf1 = from_pandas(df1, chunk_size=2)
108
- mdf2 = from_pandas(df2, chunk_size=2)
107
+ mdf1 = md.DataFrame(df1, chunk_size=2)
108
+ mdf2 = md.DataFrame(df2, chunk_size=2)
109
109
 
110
110
  parameters = [
111
111
  {"lsuffix": "l_", "rsuffix": "r_"},
@@ -132,8 +132,8 @@ def test_join_on():
132
132
  )
133
133
  df2 = pd.concat([df2, df2 + 1])
134
134
 
135
- mdf1 = from_pandas(df1, chunk_size=2)
136
- mdf2 = from_pandas(df2, chunk_size=2)
135
+ mdf1 = md.DataFrame(df1, chunk_size=2)
136
+ mdf2 = md.DataFrame(df2, chunk_size=2)
137
137
 
138
138
  parameters = [
139
139
  {"lsuffix": "l_", "rsuffix": "r_"},
@@ -157,15 +157,15 @@ def test_append():
157
157
  df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
158
158
  df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
159
159
 
160
- mdf1 = from_pandas(df1, chunk_size=3)
161
- mdf2 = from_pandas(df2, chunk_size=3)
160
+ mdf1 = md.DataFrame(df1, chunk_size=3)
161
+ mdf2 = md.DataFrame(df2, chunk_size=3)
162
162
  adf = mdf1.append(mdf2)
163
163
 
164
164
  assert adf.shape == (20, 4)
165
165
  assert_mf_index_dtype(adf.index_value.value, np.int64)
166
166
 
167
- mdf1 = from_pandas(df1, chunk_size=3)
168
- mdf2 = from_pandas(df2, chunk_size=3)
167
+ mdf1 = md.DataFrame(df1, chunk_size=3)
168
+ mdf2 = md.DataFrame(df2, chunk_size=3)
169
169
  adf = mdf1.append(mdf2, ignore_index=True)
170
170
 
171
171
  assert adf.shape == (20, 4)
@@ -173,84 +173,135 @@ def test_append():
173
173
  pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20))
174
174
 
175
175
 
176
- def test_concat():
176
+ def test_concat_dataframe():
177
+ # test index concatenate
177
178
  df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
178
179
  df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
179
180
 
180
- mdf1 = from_pandas(df1, chunk_size=4)
181
- mdf2 = from_pandas(df2, chunk_size=4)
182
- r = concat([mdf1, mdf2], axis="index")
181
+ mdf1 = md.DataFrame(df1, chunk_size=4)
182
+ mdf2 = md.DataFrame(df2, chunk_size=4)
183
+ r = md.concat([mdf1, mdf2], axis="index")
183
184
 
184
185
  assert r.shape == (20, 4)
185
186
  assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
186
- pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
187
+ pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
187
188
 
188
- df3 = pd.DataFrame(
189
- np.random.rand(10, 4), columns=list("ABCD"), index=pd.RangeIndex(10, 20)
189
+ # test index concatenate with range index
190
+ mdf3 = md.DataFrame(
191
+ np.random.rand(10, 4),
192
+ columns=list("ABCD"),
193
+ index=pd.RangeIndex(10, 20),
194
+ chunk_size=4,
190
195
  )
191
-
192
- mdf3 = from_pandas(df3, chunk_size=4)
193
- r = concat([mdf1, mdf3], axis="index")
196
+ r = md.concat([mdf1, mdf3], axis="index")
194
197
 
195
198
  assert r.shape == (20, 4)
196
- pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
199
+ pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
197
200
  pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
198
201
 
202
+ # test index concatenate with perm index
199
203
  df4 = pd.DataFrame(
200
204
  np.random.rand(10, 4),
201
205
  columns=list("ABCD"),
202
206
  index=np.random.permutation(np.arange(10)),
203
207
  )
204
208
 
205
- mdf4 = from_pandas(df4, chunk_size=4)
206
- r = concat([mdf1, mdf4], axis="index")
209
+ # test concat with same index with different sources
210
+ mdf4 = md.DataFrame(df4, chunk_size=4)
211
+ r = md.concat([mdf1, mdf4], axis="index")
207
212
 
208
213
  assert r.shape == (20, 4)
209
- pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
214
+ pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
210
215
  pd.testing.assert_index_equal(
211
216
  r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
212
217
  )
213
218
 
214
- r = concat([mdf4, mdf1], axis="index")
219
+ r = md.concat([mdf4, mdf1], axis="index")
215
220
 
216
221
  assert r.shape == (20, 4)
217
- pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
222
+ pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
218
223
  pd.testing.assert_index_equal(
219
224
  r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
220
225
  )
221
226
 
222
- r = concat([mdf4, mdf4], axis="index")
227
+ # test concat with same index with same source
228
+ r = md.concat([mdf4, mdf4], axis="index")
223
229
 
224
230
  assert r.shape == (20, 4)
225
- pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
231
+ pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
226
232
  pd.testing.assert_index_equal(
227
233
  r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
228
234
  )
229
235
 
230
- mdf1 = from_pandas(df1, chunk_size=3)
231
- mdf2 = from_pandas(df2, chunk_size=4)
232
- r = concat([mdf1, mdf2], axis="columns")
236
+ # test concat with column outer join
237
+ mdf1 = md.DataFrame(df1, chunk_size=3)
238
+ mdf2 = md.DataFrame(df2, chunk_size=4)
239
+ r = md.concat([mdf1, mdf2], axis="columns")
233
240
 
234
241
  assert r.shape == (10, 8)
235
242
  expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes
236
243
  pd.testing.assert_series_equal(r.dtypes, expected_dtypes)
237
244
 
238
- df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
239
- df2 = pd.DataFrame(np.random.rand(10, 3), columns=list("ABC"))
240
- mdf1 = from_pandas(df1, chunk_size=3)
241
- mdf2 = from_pandas(df2, chunk_size=3)
242
- r = concat([mdf1, mdf2], join="inner")
245
+ # test concat with column inner join
246
+ mdf1 = md.DataFrame(np.random.rand(10, 4), columns=list("ABCD"), chunk_size=3)
247
+ mdf2 = md.DataFrame(np.random.rand(10, 3), columns=list("ABC"), chunk_size=3)
248
+ r = md.concat([mdf1, mdf2], join="inner")
243
249
  assert r.shape == (20, 3)
244
250
 
251
+ # test concat with ignore index
252
+ r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
253
+ assert r.shape == (20, 3)
254
+ pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
255
+
256
+ # test concat with unknown shapes
257
+ mdf1._shape = (np.nan, 4)
258
+ r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
259
+ np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
260
+ r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
261
+ np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
262
+
263
+ # test concat with empty frames
264
+ r = md.concat([md.DataFrame([]), mdf2], ignore_index=True)
265
+ assert r.shape == (10, 3)
266
+
267
+
268
+ def test_concat_series():
269
+ # test row concat
270
+ ms1 = md.Series(np.random.rand(10))
271
+ ms2 = md.Series(np.random.rand(10))
272
+ r = md.concat([ms1, ms2])
273
+ assert r.shape == (20,)
274
+
275
+ # test row concat with unknown shape
276
+ ms1._shape = (np.nan,)
277
+ r = md.concat([ms1, ms2])
278
+ assert np.isnan(r.shape[0])
279
+ r = md.concat([ms1, ms2], ignore_index=True)
280
+ assert np.isnan(r.shape[0])
281
+
282
+ # test col concat
283
+ ms1 = md.Series(np.random.rand(10))
284
+ ms2 = md.Series(np.random.rand(10))
285
+ r = md.concat([ms1, ms2], axis=1)
286
+ assert r.shape == (10, 2)
287
+
288
+ # test col concat with names
289
+ ms1.name = "col1"
290
+ ms2.name = "col2"
291
+ r = md.concat([ms1, ms2], axis=1)
292
+ assert r.shape == (10, 2)
293
+ assert r.dtypes.index.tolist() == ["col1", "col2"]
294
+
245
295
 
246
296
  def test_invalid_join_hint():
247
- df1 = pd.DataFrame(
248
- np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
297
+ mdf1 = md.DataFrame(
298
+ np.arange(20).reshape((4, 5)) + 1,
299
+ columns=["a", "b", "c", "d", "e"],
300
+ chunk_size=2,
301
+ )
302
+ mdf2 = md.DataFrame(
303
+ np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"], chunk_size=3
249
304
  )
250
- df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
251
-
252
- mdf1 = from_pandas(df1, chunk_size=2)
253
- mdf2 = from_pandas(df2, chunk_size=3)
254
305
 
255
306
  # type error
256
307
  parameters = [
@@ -282,7 +333,6 @@ def test_invalid_join_hint():
282
333
  ]
283
334
 
284
335
  for kw in parameters:
285
- print(kw)
286
336
  with pytest.raises(TypeError):
287
337
  mdf1.merge(mdf2, **kw)
288
338