PyPI - maxframe - Version diff - 1.2.1__cp311-cp311-macosx_10_9_universal2.whl → 1.3.0__cp311-cp311-macosx_10_9_universal2.whl - Mend

maxframe 1.2.1__cp311-cp311-macosx_10_9_universal2.whl → 1.3.0__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (70)

maxframe/_utils.cpython-311-darwin.so +0 -0
maxframe/codegen.py +70 -21
maxframe/config/config.py +6 -0
maxframe/core/accessor.py +1 -0
maxframe/core/graph/core.cpython-311-darwin.so +0 -0
maxframe/dataframe/accessors/__init__.py +1 -1
maxframe/dataframe/accessors/dict_/accessor.py +1 -0
maxframe/dataframe/accessors/dict_/length.py +1 -0
maxframe/dataframe/accessors/dict_/setitem.py +1 -0
maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
maxframe/dataframe/accessors/list_/__init__.py +37 -0
maxframe/dataframe/accessors/list_/accessor.py +39 -0
maxframe/dataframe/accessors/list_/getitem.py +135 -0
maxframe/dataframe/accessors/list_/length.py +73 -0
maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
maxframe/dataframe/accessors/plotting/__init__.py +2 -0
maxframe/dataframe/accessors/string_/__init__.py +1 -0
maxframe/dataframe/datastore/to_odps.py +6 -0
maxframe/dataframe/extensions/accessor.py +1 -0
maxframe/dataframe/extensions/apply_chunk.py +34 -21
maxframe/dataframe/extensions/flatmap.py +8 -1
maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
maxframe/dataframe/merge/concat.py +7 -4
maxframe/dataframe/merge/merge.py +1 -0
maxframe/dataframe/merge/tests/test_merge.py +97 -47
maxframe/dataframe/missing/tests/test_missing.py +1 -0
maxframe/dataframe/tests/test_utils.py +7 -0
maxframe/dataframe/ufunc/ufunc.py +1 -0
maxframe/dataframe/utils.py +3 -0
maxframe/io/odpsio/schema.py +1 -0
maxframe/learn/contrib/__init__.py +2 -4
maxframe/learn/contrib/llm/__init__.py +1 -0
maxframe/learn/contrib/llm/core.py +31 -10
maxframe/learn/contrib/llm/models/__init__.py +1 -0
maxframe/learn/contrib/llm/models/dashscope.py +4 -3
maxframe/learn/contrib/llm/models/managed.py +39 -0
maxframe/learn/contrib/llm/multi_modal.py +1 -0
maxframe/learn/contrib/llm/text.py +252 -8
maxframe/learn/contrib/models.py +77 -0
maxframe/learn/contrib/utils.py +1 -0
maxframe/learn/contrib/xgboost/__init__.py +8 -1
maxframe/learn/contrib/xgboost/classifier.py +15 -4
maxframe/learn/contrib/xgboost/core.py +108 -1
maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
maxframe/learn/contrib/xgboost/predict.py +8 -3
maxframe/learn/contrib/xgboost/regressor.py +15 -1
maxframe/learn/contrib/xgboost/train.py +5 -4
maxframe/lib/dtypes_extension/__init__.py +2 -1
maxframe/lib/dtypes_extension/dtypes.py +21 -0
maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
maxframe/opcodes.py +19 -0
maxframe/serialization/__init__.py +1 -0
maxframe/serialization/core.cpython-311-darwin.so +0 -0
maxframe/serialization/core.pyx +12 -1
maxframe/serialization/numpy.py +12 -4
maxframe/serialization/serializables/tests/test_serializable.py +13 -2
maxframe/serialization/tests/test_serial.py +2 -0
maxframe/tensor/merge/concatenate.py +1 -0
maxframe/tensor/misc/unique.py +11 -10
maxframe/tensor/reshape/reshape.py +4 -1
maxframe/utils.py +4 -0
{maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/METADATA +2 -2
{maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/RECORD +70 -62
{maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/WHEEL +1 -1
maxframe_client/session/odps.py +3 -0
maxframe_client/session/tests/test_task.py +1 -0
{maxframe-1.2.1.dist-info → maxframe-1.3.0.dist-info}/top_level.txt +0 -0

maxframe/dataframe/accessors/list_/tests/test_list_accessor.py ADDED Viewed

@@ -0,0 +1,79 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+from ..... import dataframe as md
+from .....lib.dtypes_extension import list_
+from .....utils import ARROW_DTYPE_NOT_SUPPORTED
+from ..getitem import SeriesListGetItemOperator
+from ..length import SeriesListLengthOperator
+pytestmark = pytest.mark.skipif(
+    ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
+)
+@pytest.fixture
+def df():
+    return md.DataFrame(
+        {
+            "A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
+            "B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
+            "C": pd.Series([1], dtype=np.dtype("int64")),
+        },
+        index=[1],
+    )
+def test_invalid_dtype(df):
+    with pytest.raises(AttributeError):
+        df["C"].list.len()
+def test_getitem(df):
+    s1 = df["A"].list[1]
+    assert isinstance(s1, md.Series)
+    assert s1.dtype == pd.ArrowDtype(pa.int32())
+    assert s1.shape == (1,)
+    assert s1.index_value == df.index_value
+    op = s1.op
+    assert isinstance(op, SeriesListGetItemOperator)
+    assert op.query_index == 1
+    assert op.ignore_index_error is False
+def test_getitem_ignore_index_err(df):
+    s1 = df["B"].list.get(1)
+    assert isinstance(s1, md.Series)
+    assert s1.dtype == pd.ArrowDtype(pa.string())
+    assert s1.shape == (1,)
+    assert s1.index_value == df.index_value
+    op = s1.op
+    assert isinstance(op, SeriesListGetItemOperator)
+    assert op.query_index == 1
+    assert op.ignore_index_error is True
+def test_length(df):
+    s1 = df["A"].list.len()
+    assert isinstance(s1, md.Series)
+    assert s1.dtype == pd.ArrowDtype(pa.int64())
+    assert s1.shape == (1,)
+    assert s1.index_value == df.index_value
+    op = s1.op
+    assert isinstance(op, SeriesListLengthOperator)

maxframe/dataframe/accessors/plotting/__init__.py CHANGED Viewed

@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 def _install():
     import pandas as pd

maxframe/dataframe/accessors/string_/__init__.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .core import SeriesStringMethod

maxframe/dataframe/datastore/to_odps.py CHANGED Viewed

@@ -27,6 +27,7 @@ from ...core import OutputType
 from ...io.odpsio import build_dataframe_table_meta
 from ...serialization.serializables import (
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -55,6 +56,7 @@ class DataFrameToODPSTable(DataFrameDataStore):
     index = BoolField("index", default=True)
     index_label = ListField("index_label", FieldTypes.string, default=None)
     lifecycle = Int64Field("lifecycle", default=None)
+    table_properties = DictField("table_properties", default=None)
     def __init__(self, **kw):
         super().__init__(_output_types=[OutputType.dataframe], **kw)
@@ -84,6 +86,7 @@ def to_odps_table(
     index: bool = True,
     index_label: Union[None, str, List[str]] = None,
     lifecycle: Optional[int] = None,
+    table_properties: Optional[dict] = None,
 ):
     """
     Write DataFrame object into a MaxCompute (ODPS) table.
@@ -122,6 +125,8 @@ def to_odps_table(
         names will be used.
     lifecycle: Optional[int]
         Specify lifecycle of the output table.
+    table_properties: Optional[dict]
+        Specify properties of the output table.
     Returns
     -------
@@ -186,5 +191,6 @@ def to_odps_table(
         index=index,
         index_label=index_label,
         lifecycle=lifecycle or options.session.table_lifecycle,
+        table_properties=table_properties,
     )
     return op(df)

maxframe/dataframe/extensions/accessor.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import TYPE_CHECKING
 from ...core import BaseMaxFrameAccessor

maxframe/dataframe/extensions/apply_chunk.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
 from typing import Any, Callable, Dict, List, Tuple, Union
@@ -19,7 +20,12 @@ import pandas as pd
 from ... import opcodes
 from ...core import OutputType
-from ...serialization.serializables import FunctionField, Int32Field
+from ...serialization.serializables import (
+    DictField,
+    FunctionField,
+    Int32Field,
+    TupleField,
+)
 from ...utils import quiet_stdio
 from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -38,7 +44,9 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.APPLY_CHUNK
     func = FunctionField("func")
-    batch_rows = Int32Field("batch_rows")
+    batch_rows = Int32Field("batch_rows", default=None)
+    args = TupleField("args", default=None)
+    kwargs = DictField("kwargs", default=None)
     def __init__(self, output_type=None, **kw):
         if output_type:
@@ -104,12 +112,11 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
         output_type=None,
         index=None,
-        args=(),
-        **kwargs,
     ):
+        args = self.args or ()
+        kwargs = self.kwargs or {}
         # if not dtypes and not skip_infer:
-        origin_func = self.func
-        self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
+        packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
         # if skip_infer, directly build a frame
         if self.output_types and self.output_types[0] == OutputType.df_or_series:
@@ -118,8 +125,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         # infer return index and dtypes
         dtypes, index_value, elementwise = self._infer_batch_func_returns(
             df_or_series,
-            origin_func=origin_func,
-            packed_func=self.func,
+            origin_func=self.func,
+            packed_func=packed_func,
             given_output_type=output_type,
             given_dtypes=dtypes,
             given_index=index,
@@ -166,6 +173,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
         given_index: Union[pd.Index, IndexValue],
         given_elementwise: bool = False,
+        *args,
+        **kwargs,
     ):
         inferred_output_type = inferred_dtypes = inferred_index_value = None
         inferred_is_elementwise = False
@@ -190,7 +199,7 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         try:
             # execute
             with np.errstate(all="ignore"), quiet_stdio():
-                infer_result = packed_func(empty_data)
+                infer_result = packed_func(empty_data, *args, **kwargs)
             #  if executed successfully, get index and dtypes from returned object
             if inferred_index_value is None:
@@ -258,7 +267,7 @@ def get_packed_func(df, func, *args, **kwargs) -> Any:
 def df_apply_chunk(
     dataframe,
     func: Union[str, Callable],
-    batch_rows,
+    batch_rows=None,
     dtypes=None,
     dtype=None,
     name=None,
@@ -462,11 +471,11 @@ def df_apply_chunk(
     if not isinstance(func, Callable):
         raise TypeError("function must be a callable object")
-    if not isinstance(batch_rows, int):
-        raise TypeError("batch_rows must be an integer")
-    if batch_rows <= 0:
-        raise ValueError("batch_rows must be greater than 0")
+    if batch_rows is not None:
+        if not isinstance(batch_rows, int):
+            raise TypeError("batch_rows must be an integer")
+        elif batch_rows <= 0:
+            raise ValueError("batch_rows must be greater than 0")
     dtypes = (name, dtype) if dtype is not None else dtypes
@@ -481,15 +490,17 @@ def df_apply_chunk(
     # bind args and kwargs
     op = DataFrameApplyChunkOperator(
-        func=func, batch_rows=batch_rows, output_type=output_type
+        func=func,
+        batch_rows=batch_rows,
+        output_type=output_type,
+        args=args,
+        kwargs=kwargs,
     )
     return op(
         dataframe,
         dtypes=dtypes,
         index=index,
-        args=args,
-        **kwargs,
     )
@@ -720,7 +731,11 @@ def series_apply_chunk(
         output_type = OutputType.df_or_series
     op = DataFrameApplyChunkOperator(
-        func=func, batch_rows=batch_rows, output_type=output_type
+        func=func,
+        batch_rows=batch_rows,
+        output_type=output_type,
+        args=args,
+        kwargs=kwargs,
     )
     dtypes = (name, dtype) if dtype is not None else dtypes
@@ -729,6 +744,4 @@ def series_apply_chunk(
         dtypes=dtypes,
         output_type=output_type,
         index=index,
-        args=args,
-        **kwargs,
     )

maxframe/dataframe/extensions/flatmap.py CHANGED Viewed

@@ -27,7 +27,12 @@ from ...serialization.serializables import (
 )
 from ..core import DataFrame
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import gen_unknown_index_value, make_dtypes, parse_index
+from ..utils import (
+    copy_func_scheduling_hints,
+    gen_unknown_index_value,
+    make_dtypes,
+    parse_index,
+)
 class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
@@ -40,6 +45,8 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
     def __init__(self, output_types=None, **kw):
         super().__init__(_output_types=output_types, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
         dtypes = make_dtypes(dtypes)

maxframe/dataframe/extensions/tests/test_apply_chunk.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import pandas as pd
 import pytest
@@ -102,7 +103,7 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
     assert result.index_value is df1.index_value
     assert result.dtypes.equals(df1.dtypes)
     assert isinstance(result.op.func, MarkedFunction)
-    assert result.op.func is not process
+    assert result.op.func is process
     assert result.op.func.resources is process.resources
     assert result.op.func.pythonpacks is process.pythonpacks

maxframe/dataframe/extensions/tests/test_extensions.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import pandas as pd
 import pytest

maxframe/dataframe/merge/concat.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List, Union
 import pandas as pd
@@ -100,8 +101,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             row_length = 0
             for series in objs:
                 row_length += series.shape[0]
-            if self.ignore_index:  # pragma: no cover
-                index_value = parse_index(pd.RangeIndex(row_length))
+            if self.ignore_index:
+                idx_length = 0 if pd.isna(row_length) else row_length
+                index_value = parse_index(pd.RangeIndex(idx_length))
             else:
                 index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
@@ -159,8 +161,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             if self.join == "inner":
                 objs = [o[list(emtpy_result.columns)] for o in objs]
-            if self.ignore_index:  # pragma: no cover
-                index_value = parse_index(pd.RangeIndex(row_length))
+            if self.ignore_index:
+                idx_length = 0 if pd.isna(row_length) else row_length
+                index_value = parse_index(pd.RangeIndex(idx_length))
             else:
                 index = self._concat_index(objs)
                 index_value = parse_index(index, objs)

maxframe/dataframe/merge/merge.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 from abc import abstractmethod
 from collections import namedtuple

maxframe/dataframe/merge/tests/test_merge.py CHANGED Viewed

@@ -16,10 +16,10 @@ import numpy as np
 import pandas as pd
 import pytest
+from .... import dataframe as md
 from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
-from ...datasource.dataframe import from_pandas
-from .. import DataFrameMerge, concat
+from .. import DataFrameMerge
 from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
@@ -29,8 +29,8 @@ def test_merge():
     )
     df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     mapjoin = MapJoinHint()
     dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
@@ -83,8 +83,8 @@ def test_merge_invalid_parameters():
     )
     pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
-    df1 = from_pandas(pdf1, chunk_size=2)
-    df2 = from_pandas(pdf2, chunk_size=3)
+    df1 = md.DataFrame(pdf1, chunk_size=2)
+    df2 = md.DataFrame(pdf2, chunk_size=3)
     with pytest.raises(ValueError):
         df1.merge(df2, bloom_filter="wrong")
@@ -104,8 +104,8 @@ def test_join():
     df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1
     df2 = pd.concat([df2, df2 + 1])
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=2)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=2)
     parameters = [
         {"lsuffix": "l_", "rsuffix": "r_"},
@@ -132,8 +132,8 @@ def test_join_on():
     )
     df2 = pd.concat([df2, df2 + 1])
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=2)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=2)
     parameters = [
         {"lsuffix": "l_", "rsuffix": "r_"},
@@ -157,15 +157,15 @@ def test_append():
     df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
     df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     adf = mdf1.append(mdf2)
     assert adf.shape == (20, 4)
     assert_mf_index_dtype(adf.index_value.value, np.int64)
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     adf = mdf1.append(mdf2, ignore_index=True)
     assert adf.shape == (20, 4)
@@ -173,84 +173,135 @@ def test_append():
     pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20))
-def test_concat():
+def test_concat_dataframe():
+    # test index concatenate
     df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
     df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
-    mdf1 = from_pandas(df1, chunk_size=4)
-    mdf2 = from_pandas(df2, chunk_size=4)
-    r = concat([mdf1, mdf2], axis="index")
+    mdf1 = md.DataFrame(df1, chunk_size=4)
+    mdf2 = md.DataFrame(df2, chunk_size=4)
+    r = md.concat([mdf1, mdf2], axis="index")
     assert r.shape == (20, 4)
     assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
-    df3 = pd.DataFrame(
-        np.random.rand(10, 4), columns=list("ABCD"), index=pd.RangeIndex(10, 20)
+    # test index concatenate with range index
+    mdf3 = md.DataFrame(
+        np.random.rand(10, 4),
+        columns=list("ABCD"),
+        index=pd.RangeIndex(10, 20),
+        chunk_size=4,
     )
-    mdf3 = from_pandas(df3, chunk_size=4)
-    r = concat([mdf1, mdf3], axis="index")
+    r = md.concat([mdf1, mdf3], axis="index")
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
+    # test index concatenate with perm index
     df4 = pd.DataFrame(
         np.random.rand(10, 4),
         columns=list("ABCD"),
         index=np.random.permutation(np.arange(10)),
     )
-    mdf4 = from_pandas(df4, chunk_size=4)
-    r = concat([mdf1, mdf4], axis="index")
+    # test concat with same index with different sources
+    mdf4 = md.DataFrame(df4, chunk_size=4)
+    r = md.concat([mdf1, mdf4], axis="index")
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
-    r = concat([mdf4, mdf1], axis="index")
+    r = md.concat([mdf4, mdf1], axis="index")
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
-    r = concat([mdf4, mdf4], axis="index")
+    # test concat with same index with same source
+    r = md.concat([mdf4, mdf4], axis="index")
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=4)
-    r = concat([mdf1, mdf2], axis="columns")
+    # test concat with column outer join
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=4)
+    r = md.concat([mdf1, mdf2], axis="columns")
     assert r.shape == (10, 8)
     expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes
     pd.testing.assert_series_equal(r.dtypes, expected_dtypes)
-    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
-    df2 = pd.DataFrame(np.random.rand(10, 3), columns=list("ABC"))
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
-    r = concat([mdf1, mdf2], join="inner")
+    # test concat with column inner join
+    mdf1 = md.DataFrame(np.random.rand(10, 4), columns=list("ABCD"), chunk_size=3)
+    mdf2 = md.DataFrame(np.random.rand(10, 3), columns=list("ABC"), chunk_size=3)
+    r = md.concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
+    # test concat with ignore index
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    assert r.shape == (20, 3)
+    pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
+    # test concat with unknown shapes
+    mdf1._shape = (np.nan, 4)
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
+    # test concat with empty frames
+    r = md.concat([md.DataFrame([]), mdf2], ignore_index=True)
+    assert r.shape == (10, 3)
+def test_concat_series():
+    # test row concat
+    ms1 = md.Series(np.random.rand(10))
+    ms2 = md.Series(np.random.rand(10))
+    r = md.concat([ms1, ms2])
+    assert r.shape == (20,)
+    # test row concat with unknown shape
+    ms1._shape = (np.nan,)
+    r = md.concat([ms1, ms2])
+    assert np.isnan(r.shape[0])
+    r = md.concat([ms1, ms2], ignore_index=True)
+    assert np.isnan(r.shape[0])
+    # test col concat
+    ms1 = md.Series(np.random.rand(10))
+    ms2 = md.Series(np.random.rand(10))
+    r = md.concat([ms1, ms2], axis=1)
+    assert r.shape == (10, 2)
+    # test col concat with names
+    ms1.name = "col1"
+    ms2.name = "col2"
+    r = md.concat([ms1, ms2], axis=1)
+    assert r.shape == (10, 2)
+    assert r.dtypes.index.tolist() == ["col1", "col2"]
 def test_invalid_join_hint():
-    df1 = pd.DataFrame(
-        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    mdf1 = md.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1,
+        columns=["a", "b", "c", "d", "e"],
+        chunk_size=2,
+    )
+    mdf2 = md.DataFrame(
+        np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"], chunk_size=3
     )
-    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=3)
     # type error
     parameters = [
@@ -282,7 +333,6 @@ def test_invalid_join_hint():
     ]
     for kw in parameters:
-        print(kw)
         with pytest.raises(TypeError):
             mdf1.merge(mdf2, **kw)

maxframe/dataframe/missing/tests/test_missing.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
 import numpy as np

maxframe/dataframe/tests/test_utils.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -71,6 +72,12 @@ def test_pack_function(df1):
 @pytest.mark.parametrize(
     "dtype, fill_value, expected",
     [
+        (
+            ArrowDtype(pa.list_(pa.string())) if ArrowDtype else None,
+            1,
+            ["1"],
+        ),
+        (pa.list_(pa.string()), 1, ["1"]),
         (
             ArrowDtype(pa.map_(pa.int32(), pa.string())) if ArrowDtype else None,
             1,

maxframe/dataframe/ufunc/ufunc.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from numbers import Number
 from ...tensor import tensor as astensor

maxframe/dataframe/utils.py CHANGED Viewed

@@ -463,6 +463,9 @@ def _generate_value(dtype, fill_value):
     if ArrowDtype and isinstance(dtype, pd.ArrowDtype):
         return _generate_value(dtype.pyarrow_dtype, fill_value)
+    if isinstance(dtype, pa.ListType):
+        return [_generate_value(dtype.value_type, fill_value)]
     if isinstance(dtype, pa.MapType):
         return [
             (