maxframe 0.1.0b5__cp310-cp310-macosx_10_9_universal2.whl → 1.0.0__cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic.

Files changed (203)
  1. maxframe/_utils.cpython-310-darwin.so +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cpython-310-darwin.so +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cpython-310-darwin.so +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
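Note on the moved I/O modules: the odpsio entries in the list above record that the former top-level maxframe/odpsio package now lives under maxframe/io/odpsio, with tableio.py and volumeio.py rewritten rather than moved. A hedged sketch of what the move implies for imports follows; the submodule names are taken from the file list, and whether 1.0.0 keeps a compatibility alias for the old path is not shown in this diff:

    # old layout (0.1.0b5): the package sat at maxframe.odpsio
    # from maxframe.odpsio import arrow, schema
    # new layout (1.0.0): the same submodules sit under maxframe.io
    from maxframe.io.odpsio import arrow, schema  # sketch only; paths inferred from the file list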
maxframe/dataframe/merge/merge.py
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import logging
+from abc import abstractmethod
 from collections import namedtuple
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
+from pandas import Index
 
 from ... import opcodes
 from ...core import OutputType
@@ -28,6 +29,7 @@ from ...serialization.serializables import (
     Int32Field,
     KeyField,
     NamedTupleField,
+    Serializable,
     StringField,
     TupleField,
 )
@@ -73,9 +75,208 @@ class DataFrameMergeAlign(MapReduceOperator, DataFrameOperatorMixin):
 MergeSplitInfo = namedtuple("MergeSplitInfo", "split_side, split_index, nsplits")
 
 
+class JoinHint(Serializable):
+    @abstractmethod
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        pass
+
+    @abstractmethod
+    def verify_can_work_with(self, other: "JoinHint"):
+        pass
+
+
+class MapJoinHint(JoinHint):
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, MapJoinHint is not support in cross and outer join"
+            )
+
+    def verify_can_work_with(self, other: JoinHint):
+        if isinstance(other, SkewJoinHint):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
+            )
+
+
+class DistributedMapJoinHint(JoinHint):
+    shard_count = Int32Field("shard_count")
+    replica_count = Int32Field("replica_count", default=1)
+
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, DistributedMapJoinHint is not support in cross and outer join"
+            )
+        if not hasattr(self, "shard_count"):
+            raise ValueError(
+                "Invalid DistributedMapJoinHint, shard_count must be specified"
+            )
+        if self.shard_count <= 0 or self.replica_count <= 0:
+            raise ValueError(
+                "Invalid DistributedMapJoinHint, shard_count and replica_count must be greater than 0"
+            )
+
+    def verify_can_work_with(self, other: JoinHint):
+        pass
+
+
+class SkewJoinHint(JoinHint):
+    columns = AnyField("columns", default=None)
+
+    @staticmethod
+    def _check_index_levels(index, level_list):
+        selected_levels = set()
+        valid_levels = set(range(index.nlevels))
+        valid_level_names = set(index.names)
+
+        for item in level_list:
+            if isinstance(item, int):
+                if item not in valid_levels:
+                    raise ValueError(f"Level {item} is not a valid index level")
+                if item in selected_levels:
+                    raise ValueError(f"Level {item} is selected multiple times")
+                selected_levels.add(item)
+            elif isinstance(item, str):
+                if item not in valid_level_names:
+                    raise ValueError(f"'{item}' is not a valid index level name")
+                level = index.names.index(item)
+                if level in selected_levels:
+                    raise ValueError(
+                        f"'{item}' (Level {level}) is selected multiple times"
+                    )
+                selected_levels.add(level)
+            else:
+                raise ValueError(f"Invalid input type: {type(item)}")
+
+    @staticmethod
+    def _check_columns(join_on_columns, column_list):
+        selected_columns = set()
+        valid_columns = set(join_on_columns)
+
+        for item in column_list:
+            if isinstance(item, int):
+                if item < 0 or item >= len(join_on_columns):
+                    raise ValueError(f"Column index {item} is out of range")
+                col_name = join_on_columns[item]
+                if col_name in selected_columns:
+                    raise ValueError(
+                        f"Column '{col_name}' (index {item}) is selected multiple times"
+                    )
+                selected_columns.add(col_name)
+            elif isinstance(item, str):
+                if item not in valid_columns:
+                    raise ValueError(f"'{item}' is not a valid column name")
+                if item in selected_columns:
+                    raise ValueError(f"Column '{item}' is selected multiple times")
+                selected_columns.add(item)
+            else:
+                raise ValueError(f"Invalid input type: {type(item)}")
+
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, map join is not support in cross and outer join"
+            )
+        if is_hint_for_left and how == "right":
+            raise ValueError(
+                "Invalid join hint, right join can only use SkewJoinHint on right frame"
+            )
+        elif not is_hint_for_left and how == "left":
+            raise ValueError(
+                "Invalid join hint, left join can only use SkewJoinHint on left frame"
+            )
+
+        # check columns
+        if self.columns is None:
+            return
+
+        if not isinstance(self.columns, list):
+            raise TypeError("Invalid SkewJoinHint, `columns` must be a list")
+
+        if all(isinstance(item, (int, str)) for item in self.columns):
+            # if elements are int (levels) or str (index names or column names)
+            self._verify_valid_index_or_columns(
+                self.columns, hint_on_df.index_value.to_pandas(), on, is_on_index
+            )
+        elif all(isinstance(c, dict) for c in self.columns):
+            # dict with column names and values
+            cols_set = set(self.columns[0].keys())
+            if any(cols_set != set(c.keys()) for c in self.columns):
+                raise ValueError(
+                    "Invalid SkewJoinHint, all values in `columns` need to have same columns"
+                )
+
+            self._verify_valid_index_or_columns(
+                cols_set, hint_on_df.index_value.to_pandas(), on, is_on_index
+            )
+        else:
+            raise TypeError("Invalid SkewJoinHint, annot accept `columns` type")
+
+    def verify_can_work_with(self, other: JoinHint):
+        if isinstance(other, SkewJoinHint):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
+            )
+
+    @staticmethod
+    def _verify_valid_index_or_columns(
+        skew_join_columns: Iterable[Union[int, str]],
+        frame_index: Index,
+        on: Union[str, List[str]],
+        is_on_index: bool,
+    ):
+        if isinstance(on, str):
+            on = [on]
+        on_columns = set(frame_index.names if is_on_index else on)
+        for col in skew_join_columns:
+            if isinstance(col, int):
+                if col < 0 or col >= len(on_columns):
+                    raise ValueError(
+                        f"Invalid, SkeJoinHint, `{col}` is out of join on columns range"
+                    )
+            else:
+                if col not in on_columns:
+                    raise ValueError(
+                        f"Invalid, SkeJoinHint, '{col}' is not a valid column name"
+                    )
+
+
 class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.DATAFRAME_MERGE
 
+    # workaround for new field since v1.0.0rc2
+    # todo remove this when all versions below v1.0.0rc1 is eliminated
+    _legacy_new_non_primitives = ["left_hint", "right_hint"]
+
     how = StringField("how")
     on = AnyField("on")
     left_on = AnyField("left_on")
@@ -95,6 +296,8 @@ class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
 
     # only for broadcast merge
     split_info = NamedTupleField("split_info")
+    left_hint = AnyField("left_hint", default=None)
+    right_hint = AnyField("right_hint", default=None)
 
     def __init__(self, copy=None, **kwargs):
         super().__init__(copy_=copy, **kwargs)
@@ -150,7 +353,7 @@ def merge(
     df: Union[DataFrame, Series],
     right: Union[DataFrame, Series],
     how: str = "inner",
-    on: str = None,
+    on: Union[str, List[str]] = None,
     left_on: str = None,
     right_on: str = None,
     left_index: bool = False,
@@ -165,6 +368,8 @@
     auto_merge_threshold: int = 8,
     bloom_filter: Union[bool, str] = "auto",
     bloom_filter_options: Dict[str, Any] = None,
+    left_hint: JoinHint = None,
+    right_hint: JoinHint = None,
 ) -> DataFrame:
     """
     Merge DataFrame or named Series objects with a database-style join.
@@ -267,6 +472,12 @@ def merge(
          when chunk size of left and right is greater than this threshold, apply bloom filter
        * "filter": "large", "small", "both", default "large"
          decides to filter on large, small or both DataFrames.
+    left_hint: JoinHint, default None
+        Join strategy to use for left frame. When data skew occurs, consider these strategies to avoid long-tail issues,
+        but use them cautiously to prevent OOM and unnecessary overhead.
+    right_hint: JoinHint, default None
+        Join strategy to use for right frame.
+
 
     Returns
     -------
@@ -381,6 +592,18 @@ def merge(
            raise ValueError(
                f"Invalid filter {k}, available: {BLOOM_FILTER_ON_OPTIONS}"
            )
+
+    if left_hint:
+        if not isinstance(left_hint, JoinHint):
+            raise TypeError(f"left_hint must be a JoinHint, got {type(left_hint)}")
+        left_hint.verify_can_work_with(right_hint)
+        left_hint.verify_params(df, on or left_on, left_index, how, True)
+
+    if right_hint:
+        if not isinstance(right_hint, JoinHint):
+            raise TypeError(f"right_hint must be a JoinHint, got {type(right_hint)}")
+        right_hint.verify_params(right, on or right_on, right_index, how, False)
+
     op = DataFrameMerge(
         how=how,
         on=on,
@@ -399,6 +622,8 @@ def merge(
         bloom_filter=bloom_filter,
         bloom_filter_options=bloom_filter_options,
         output_types=[OutputType.dataframe],
+        left_hint=left_hint,
+        right_hint=right_hint,
     )
     return op(df, right)
 
@@ -416,6 +641,8 @@ def join(
     auto_merge_threshold: int = 8,
     bloom_filter: Union[bool, Dict] = True,
     bloom_filter_options: Dict[str, Any] = None,
+    left_hint: JoinHint = None,
+    right_hint: JoinHint = None,
 ) -> DataFrame:
     """
     Join columns of another DataFrame.
@@ -480,6 +707,11 @@ def join(
          when chunk size of left and right is greater than this threshold, apply bloom filter
        * "filter": "large", "small", "both", default "large"
          decides to filter on large, small or both DataFrames.
+    left_hint: JoinHint, default None
+        Join strategy to use for left frame. When data skew occurs, consider these strategies to avoid long-tail issues,
+        but use them cautiously to prevent OOM and unnecessary overhead.
+    right_hint: JoinHint, default None
+        Join strategy to use for right frame.
 
     Returns
     -------
@@ -590,4 +822,6 @@ def join(
         auto_merge_threshold=auto_merge_threshold,
         bloom_filter=bloom_filter,
         bloom_filter_options=bloom_filter_options,
+        left_hint=left_hint,
+        right_hint=right_hint,
     )
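Taken together with the test cases that follow, the new signature suggests usage along the lines of the sketch below. This is a hedged example, not code from the package: the md alias and DataFrame initializer follow MaxFrame's documented conventions, and the data and column names are illustrative only.

    import pandas as pd
    import maxframe.dataframe as md
    from maxframe.dataframe.merge.merge import MapJoinHint, SkewJoinHint

    left = md.DataFrame(pd.DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}))
    right = md.DataFrame(pd.DataFrame({"a": [1, 2, 2], "x": [7, 8, 9]}))

    # broadcast the small right table with a map join
    small_broadcast = left.merge(right, on="a", how="inner", right_hint=MapJoinHint())

    # mark the left side's hot key values for skew handling in a left join
    skew_handled = left.merge(
        right, on="a", how="left", left_hint=SkewJoinHint(columns=[{"a": 1}])
    )

    # results are lazy; an execute() call submits the job when a session is configured
    # small_broadcast.execute()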
maxframe/dataframe/merge/tests/test_merge.py
@@ -16,9 +16,11 @@ import numpy as np
 import pandas as pd
 import pytest
 
+from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
+from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
 
 
 def test_merge():
@@ -30,14 +32,39 @@ def test_merge():
     mdf1 = from_pandas(df1, chunk_size=2)
     mdf2 = from_pandas(df2, chunk_size=3)
 
+    mapjoin = MapJoinHint()
+    dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
+    skew_join1 = SkewJoinHint()
+    skew_join2 = SkewJoinHint(columns=[0])
+    skew_join3 = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])
+    skew_join4 = SkewJoinHint(columns=[{"a": 4, "b": "test"}, {"a": 5, "b": "hello"}])
+
     parameters = [
         {},
         {"how": "left", "right_on": "x", "left_index": True},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": mapjoin,
+            "right_hint": mapjoin,
+        },
         {"how": "right", "left_on": "a", "right_index": True},
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": mapjoin,
+            "right_hint": dist_mapjoin1,
+        },
         {"how": "left", "left_on": "a", "right_on": "x"},
+        {"how": "left", "left_on": "a", "right_on": "x", "left_hint": skew_join1},
         {"how": "right", "left_on": "a", "right_index": True},
+        {"how": "right", "left_on": "a", "right_index": True, "right_hint": skew_join2},
         {"how": "right", "on": "a"},
+        {"how": "right", "on": "a", "right_hint": skew_join3},
         {"how": "inner", "on": ["a", "b"]},
+        {"how": "inner", "on": ["a", "b"], "left_hint": skew_join4},
     ]
 
     for kw in parameters:
@@ -135,7 +162,7 @@ def test_append():
     adf = mdf1.append(mdf2)
 
     assert adf.shape == (20, 4)
-    assert isinstance(adf.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(adf.index_value.value, np.int64)
     mdf1 = from_pandas(df1, chunk_size=3)
     mdf2 = from_pandas(df2, chunk_size=3)
@@ -155,6 +182,7 @@ def test_concat():
     r = concat([mdf1, mdf2], axis="index")
 
     assert r.shape == (20, 4)
+    assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
     pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
 
     df3 = pd.DataFrame(
@@ -213,3 +241,100 @@ def test_concat():
     mdf2 = from_pandas(df2, chunk_size=3)
     r = concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
+
+
+def test_invalid_join_hint():
+    df1 = pd.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    )
+    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
+
+    mdf1 = from_pandas(df1, chunk_size=2)
+    mdf2 = from_pandas(df2, chunk_size=3)
+
+    # type error
+    parameters = [
+        {"how": "left", "right_on": "x", "left_index": True, "left_hint": [1]},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": {"key": "value"},
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=2),
+        },
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns="a"),
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=["0", []]),
+        },
+    ]
+
+    for kw in parameters:
+        print(kw)
+        with pytest.raises(TypeError):
+            mdf1.merge(mdf2, **kw)
+
+    # value error
+    parameters = [
+        # mapjoin can't working with skew join
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": MapJoinHint(),
+            "right_hint": SkewJoinHint(),
+        },
+        # right join can't apply to skew join left frame
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": SkewJoinHint(),
+        },
+        # invalid columns
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns=["b"]),
+        },
+        # invalid index level
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[5]),
+        },
+        # unmatched skew join columns
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[{0: "value1"}, {1: "value2"}]),
+        },
+        # invalid dist_mapjoin shard_count
+        {"how": "right", "on": "a", "right_hint": DistributedMapJoinHint()},
+        # all can't work with outer join
+        {"how": "outer", "on": ["a", "b"], "left_hint": MapJoinHint()},
+        {
+            "how": "outer",
+            "on": ["a", "b"],
+            "left_hint": DistributedMapJoinHint(shard_count=5),
+        },
+        {"how": "outer", "on": ["a", "b"], "left_hint": SkewJoinHint()},
+    ]
+    for kw in parameters:
+        with pytest.raises(ValueError):
+            mdf1.merge(mdf2, **kw)
maxframe/dataframe/misc/apply.py
@@ -170,6 +170,8 @@ class ApplyOperator(
         elif self.output_types[0] == OutputType.dataframe:
             shape = [np.nan, np.nan]
             shape[1 - self.axis] = df.shape[1 - self.axis]
+            if self.axis == 1:
+                shape[1] = len(dtypes)
             shape = tuple(shape)
         else:
             shape = (df.shape[1 - self.axis],)
@@ -317,6 +319,7 @@
     skip_infer=False,
     **kwds,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
     """
     Apply a function along an axis of the DataFrame.
 
@@ -442,20 +445,12 @@
     B 27
     dtype: int64
 
-    >>> df.apply(np.sum, axis=1).execute()
+    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
     0 13
     1 13
     2 13
     dtype: int64
 
-    Returning a list-like will result in a Series
-
-    >>> df.apply(lambda x: [1, 2], axis=1).execute()
-    0 [1, 2]
-    1 [1, 2]
-    2 [1, 2]
-    dtype: object
-
     Passing ``result_type='expand'`` will expand list-like results
     to columns of a Dataframe
 
@@ -469,7 +464,7 @@
     ``result_type='expand'``. The resulting column names
     will be the Series index.
 
-    >>> df.apply(lambda x: md.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
+    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
       foo bar
     0 1 2
     1 1 2
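The corrected doctests above return plain Python and pandas objects from the applied callable (an int for a reduced row, a pd.Series when columns should be expanded) instead of maxframe objects. A small hedged sketch of the same pattern outside the docstring, with illustrative data matching the doctest:

    import numpy as np
    import pandas as pd
    import maxframe.dataframe as md

    df = md.DataFrame(pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]))

    # reduce each row to a scalar
    row_sums = df.apply(lambda row: int(np.sum(row)), axis=1)

    # expand each row into named columns by returning a pandas Series
    expanded = df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)

    # results are lazy; call .execute() to run them, as the doctests do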
maxframe/dataframe/misc/case_when.py
@@ -99,7 +99,7 @@ def case_when(series, caselist):
     >>> b = md.Series([0, 3, 4, 5])
 
     >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement
-    ...                       (b.gt(0), b)])
+    ...                       (b.gt(0), b)]).execute()
     0 6
     1 3
     2 1
maxframe/dataframe/misc/describe.py
@@ -15,7 +15,7 @@
 import numpy as np
 import pandas as pd
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
 
 
 class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.DESCRIBE
+    _op_type_ = opcodes.DESCRIBE
 
     input = KeyField("input", default=None)
     percentiles = ListField("percentiles", FieldTypes.float64, default=None)
maxframe/dataframe/misc/drop_duplicates.py
@@ -37,16 +37,19 @@ class DataFrameDropDuplicates(DuplicateOperand):
             shape += (3,)
         return shape
 
-    @classmethod
-    def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
+    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
         params = input_params.copy()
-        if op.ignore_index:
+        if op.ignore_index and self._output_types[0] != OutputType.index:
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
-                input_params["index_value"], op.keep, op.subset, type(op).__name__
+                input_params["index_value"],
+                op.keep,
+                op.subset,
+                type(op).__name__,
+                normalize_range_index=True,
             )
-        params["shape"] = cls._get_shape(input_params["shape"], op)
+        params["shape"] = self._get_shape(input_params["shape"], op)
         return params
 
     def __call__(self, inp, inplace=False):
@@ -151,17 +154,14 @@ def series_drop_duplicates(
     With the 'keep' parameter, the selection behaviour of duplicated values
     can be changed. The value 'first' keeps the first occurrence for each
     set of duplicated entries. The default value of keep is 'first'.
-
     >>> s.drop_duplicates().execute()
     0 lame
     1 cow
     3 beetle
     5 hippo
     Name: animal, dtype: object
-
     The value 'last' for parameter 'keep' keeps the last occurrence for
     each set of duplicated entries.
-
     >>> s.drop_duplicates(keep='last').execute()
     1 cow
     3 beetle
maxframe/dataframe/misc/eval.py
@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
         if obj_name in self.env:
             self.referenced_vars.add(obj_name)
             return self.env[obj_name]
+        try:
+            return self.target[obj_name]
+        except KeyError:
+            pass
         raise KeyError(f"name {obj_name} is not defined")
 
     def visit(self, node):
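The added fallback changes the name-resolution order inside CollectionVisitor: a name is first looked up in the local environment, then as an item of the target collection, and only then reported as undefined. A standalone illustration of that order (plain Python, not maxframe code):

    def resolve(obj_name, env, target):
        if obj_name in env:
            return env[obj_name]
        try:
            return target[obj_name]  # new fallback introduced by the hunk above
        except KeyError:
            pass
        raise KeyError(f"name {obj_name} is not defined")

    print(resolve("x", {"x": 1}, {}))            # 1, found in env
    print(resolve("col", {}, {"col": [1, 2]}))   # [1, 2], found on the target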
maxframe/dataframe/misc/memory_usage.py
@@ -58,7 +58,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
         """
         if df_or_series.ndim == 1:
             # the input data is a series, a Scalar will be returned
-            return self.new_scalar([df_or_series], dtype=np.dtype(np.int_))
+            return self.new_scalar([df_or_series], dtype=np.dtype(int))
         else:
             # the input data is a DataFrame, a Scalar will be returned
             # calculate shape of returning series given ``op.index``
@@ -71,7 +71,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
             [df_or_series],
             index_value=self._adapt_index(df_or_series.columns_value),
             shape=new_shape,
-            dtype=np.dtype(np.int_),
+            dtype=np.dtype(int),
         )
 