PyPI - maxframe - Versions diffs - 0.1.0b3__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.1.0b4__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - Mend

maxframe 0.1.0b3__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.1.0b4__cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (21) hide show

maxframe/config/config.py +3 -0
maxframe/dataframe/__init__.py +5 -0
maxframe/dataframe/core.py +4 -2
maxframe/dataframe/datasource/read_odps_query.py +3 -1
maxframe/dataframe/datasource/read_odps_table.py +2 -0
maxframe/dataframe/datastore/core.py +19 -0
maxframe/dataframe/datastore/to_csv.py +2 -2
maxframe/dataframe/datastore/to_odps.py +2 -2
maxframe/dataframe/indexing/reset_index.py +1 -17
maxframe/odpsio/arrow.py +8 -3
maxframe/odpsio/schema.py +18 -5
maxframe/odpsio/tests/test_schema.py +25 -0
maxframe/opcodes.py +5 -0
maxframe/session.py +4 -2
maxframe/utils.py +5 -0
{maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/METADATA +1 -1
{maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/RECORD +590 -589
maxframe_client/session/odps.py +11 -10
maxframe_client/tests/test_session.py +21 -0
{maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/WHEEL +0 -0
{maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/top_level.txt +0 -0

maxframe/config/config.py CHANGED Viewed

@@ -358,6 +358,9 @@ default_options.register_option(
 default_options.register_option(
     "show_progress", "auto", validator=any_validator(is_bool, is_string)
 )
+default_options.register_option(
+    "dag.settings", value=dict(), validator=is_dict, remote=True
+)
 ################
 # SPE Settings #

maxframe/dataframe/__init__.py CHANGED Viewed

@@ -57,6 +57,11 @@ try:
 except ImportError:  # pragma: no cover
     pass
+try:
+    from . import _internal
+except ImportError:  # pragma: no cover
+    pass
 del (
     arithmetic,
     datasource,

maxframe/dataframe/core.py CHANGED Viewed

@@ -960,7 +960,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
             buf = StringIO()
             max_rows = pd.get_option("display.max_rows")
             corner_max_rows = (
-                max_rows if self.shape[0] <= max_rows else corner_data.shape[0] - 1
+                max_rows
+                if self.shape[0] <= max_rows or corner_data.shape[0] == 0
+                else corner_data.shape[0] - 1
             )  # make sure max_rows < corner_data
             with pd.option_context("display.max_rows", corner_max_rows):
@@ -1605,7 +1607,7 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
             buf = StringIO()
             max_rows = pd.get_option("display.max_rows")
-            if self.shape[0] <= max_rows:
+            if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
                 buf.write(repr(corner_data) if representation else str(corner_data))
             else:
                 # remember we cannot directly call repr(df),

maxframe/dataframe/datasource/read_odps_query.py CHANGED Viewed

@@ -263,7 +263,9 @@ def read_odps_query(
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
-    odps_entry = odps_entry or ODPS.from_environments()
+    odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     inst = odps_entry.execute_sql(f"EXPLAIN {query}")
     explain_str = list(inst.get_task_results().values())[0]

maxframe/dataframe/datasource/read_odps_table.py CHANGED Viewed

@@ -164,6 +164,8 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:

maxframe/dataframe/datastore/core.py ADDED Viewed

@@ -0,0 +1,19 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+class DataFrameDataStore(DataFrameOperator, DataFrameOperatorMixin):
+    pass

maxframe/dataframe/datastore/to_csv.py CHANGED Viewed

@@ -23,11 +23,11 @@ from ...serialization.serializables import (
     ListField,
     StringField,
 )
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore
-class DataFrameToCSV(DataFrameOperator, DataFrameOperatorMixin):
+class DataFrameToCSV(DataFrameDataStore):
     _op_type_ = opcodes.TO_CSV
     input = KeyField("input")

maxframe/dataframe/datastore/to_odps.py CHANGED Viewed

@@ -32,13 +32,13 @@ from ...serialization.serializables import (
 )
 from ...typing_ import TileableType
 from ..core import DataFrame  # noqa: F401
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore
 logger = logging.getLogger(__name__)
-class DataFrameToODPSTable(DataFrameOperator, DataFrameOperatorMixin):
+class DataFrameToODPSTable(DataFrameDataStore):
     _op_type_ = opcodes.TO_ODPS_TABLE
     dtypes = SeriesField("dtypes")

maxframe/dataframe/indexing/reset_index.py CHANGED Viewed

@@ -107,7 +107,6 @@ def df_reset_index(
     inplace=False,
     col_level=0,
     col_fill="",
-    incremental_index=False,
 ):
     """
     Reset the index, or a level of it.
@@ -133,12 +132,6 @@ def df_reset_index(
     col_fill : object, default ''
         If the columns have multiple levels, determines how the other
         levels are named. If None then the index name is repeated.
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output DataFrame has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True，reset_index will guarantee that
-        output DataFrame's index is from 0 to n - 1.
     Returns
     -------
@@ -264,7 +257,6 @@ def df_reset_index(
         drop=drop,
         col_level=col_level,
         col_fill=col_fill,
-        incremental_index=incremental_index,
         output_types=[OutputType.dataframe],
     )
     ret = op(df)
@@ -280,7 +272,6 @@ def series_reset_index(
     drop=False,
     name=no_default,
     inplace=False,
-    incremental_index=False,
 ):
     """
     Generate a new DataFrame or Series with the index reset.
@@ -303,12 +294,6 @@ def series_reset_index(
         when `drop` is True.
     inplace : bool, default False
         Modify the Series in place (do not create a new object).
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output Series has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True，reset_index will guarantee that
-        output Series's index is from 0 to n - 1.
     Returns
     -------
@@ -406,8 +391,7 @@ def series_reset_index(
         level=level,
         drop=drop,
         name=name,
-        incremental_index=incremental_index,
-        output_types=[OutputType.series],
+        output_types=[OutputType.series if drop else OutputType.dataframe],
     )
     ret = op(series)
     if not inplace:

maxframe/odpsio/arrow.py CHANGED Viewed

@@ -65,14 +65,19 @@ def arrow_to_pandas(
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
-def pandas_to_arrow(df: Any, nthreads=1) -> Tuple[ArrowTableType, DataFrameTableMeta]:
-    table_meta = build_dataframe_table_meta(df)
+def pandas_to_arrow(
+    df: Any, nthreads=1, ignore_index=False
+) -> Tuple[ArrowTableType, DataFrameTableMeta]:
+    table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
         df.columns = pd.Index(table_meta.table_column_names)
-        df = df.rename_axis(table_meta.table_index_column_names).reset_index()
+        if not ignore_index:
+            df = df.rename_axis(table_meta.table_index_column_names).reset_index()
+    elif ignore_index:
+        df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
         df = df.to_frame(name=names[0] if len(names) == 1 else names)

maxframe/odpsio/schema.py CHANGED Viewed

@@ -175,7 +175,9 @@ def _scalar_as_index(df_obj: Any) -> pd.Index:
 def pandas_to_odps_schema(
-    df_obj: Any, unknown_as_string: bool = False
+    df_obj: Any,
+    unknown_as_string: bool = False,
+    ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
     from .. import dataframe as md
     from .arrow import pandas_to_arrow
@@ -209,7 +211,7 @@ def pandas_to_odps_schema(
     else:
         empty_df_obj = df_obj
-    arrow_data, table_meta = pandas_to_arrow(empty_df_obj)
+    arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -268,7 +270,9 @@ def build_table_column_name(
     return col_name
-def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
+def build_dataframe_table_meta(
+    df_obj: Any, ignore_index: bool = False
+) -> DataFrameTableMeta:
     from .. import dataframe as md
     col_to_count = defaultdict(lambda: 0)
@@ -285,6 +289,8 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
+    assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
     if obj_type == OutputType.scalar:
         pd_dtypes = pd.Series([])
         column_index_names = []
@@ -340,12 +346,19 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:
         index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
+    if ignore_index:
+        table_index_column_names = []
+        pd_index_dtypes = pd.Series([], index=[])
+    else:
+        table_index_column_names = [f"_idx_{i}" for i in range(len(index_obj.names))]
+        pd_index_dtypes = index_dtypes
     return DataFrameTableMeta(
         table_name=table_name,
         type=obj_type,
         table_column_names=final_sql_columns,
-        table_index_column_names=[f"_idx_{i}" for i in range(len(index_obj.names))],
+        table_index_column_names=table_index_column_names,
         pd_column_dtypes=pd_dtypes,
         pd_column_level_names=column_index_names,
-        pd_index_dtypes=index_dtypes,
+        pd_index_dtypes=pd_index_dtypes,
     )

maxframe/odpsio/tests/test_schema.py CHANGED Viewed

@@ -61,6 +61,16 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
     assert meta.pd_column_level_names == [None]
     assert meta.pd_index_level_names == [None]
+    test_df = _wrap_maxframe_obj(data, wrap=wrap_obj)
+    schema, meta = pandas_to_odps_schema(test_df, ignore_index=True)
+    assert [c.name for c in schema.columns] == list(test_df.dtypes.index.str.lower())
+    assert [c.type.name for c in schema.columns] == ["double"] * len(test_df.columns)
+    assert meta.type == OutputType.dataframe
+    assert meta.table_column_names == list(test_df.dtypes.index.str.lower())
+    assert meta.table_index_column_names == []
+    assert meta.pd_column_level_names == [None]
+    assert meta.pd_index_level_names == []
     data.columns = pd.MultiIndex.from_tuples(
         [("A", "A"), ("A", "B"), ("A", "C"), ("B", "A"), ("B", "B")], names=["c1", "c2"]
     )
@@ -99,6 +109,15 @@ def test_pandas_to_odps_schema_series(wrap_obj):
     assert meta.pd_column_level_names == [None]
     assert meta.pd_index_level_names == [None]
+    schema, meta = pandas_to_odps_schema(test_s, ignore_index=True)
+    assert [c.name for c in schema.columns] == ["_data"]
+    assert [c.type.name for c in schema.columns] == ["double"]
+    assert meta.type == OutputType.series
+    assert meta.table_column_names == ["_data"]
+    assert meta.table_index_column_names == []
+    assert meta.pd_column_level_names == [None]
+    assert meta.pd_index_level_names == []
     data.index = pd.MultiIndex.from_arrays(
         [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
         names=["c1", "c2"],
@@ -130,6 +149,9 @@ def test_pandas_to_odps_schema_index(wrap_obj):
     assert meta.pd_column_level_names == []
     assert meta.pd_index_level_names == [None]
+    with pytest.raises(AssertionError):
+        pandas_to_odps_schema(test_idx, unknown_as_string=True, ignore_index=True)
     data = pd.MultiIndex.from_arrays(
         [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
         names=["c1", "c2"],
@@ -159,6 +181,9 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     assert meta.pd_column_level_names == []
     assert meta.pd_index_level_names == [None]
+    with pytest.raises(AssertionError):
+        pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
 def test_odps_arrow_schema_conversion():
     odps_schema = odps_types.OdpsSchema(

maxframe/opcodes.py CHANGED Viewed

@@ -564,6 +564,11 @@ CHOLESKY_FUSE = 999988
 # MaxFrame-dedicated functions
 DATAFRAME_RESHUFFLE = 10001
+# MaxFrame internal operators
+DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
+GROUPBY_AGGR_SAME_INDEX_MERGE = 100002
+DATAFRAME_ILOC_GET_AND_RENAME_ITEM = 100003
 # fetches
 FETCH_SHUFFLE = 999998
 FETCH = 999999

maxframe/session.py CHANGED Viewed

@@ -1211,7 +1211,7 @@ def new_session(
     # load third party extensions.
     ensure_isolation_created(kwargs)
-    odps_entry = odps_entry or ODPS.from_environments()
+    odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
     if address is None:
         from maxframe_client.session.consts import ODPS_SESSION_INSECURE_SCHEME
@@ -1255,7 +1255,9 @@ def get_default_or_create(**kwargs):
         if session is None:
             # no session attached, try to create one
             warnings.warn(warning_msg)
-            session = new_session(ODPS.from_environments(), **kwargs)
+            session = new_session(
+                ODPS.from_global() or ODPS.from_environments(), **kwargs
+            )
             session.as_default()
     if isinstance(session, IsolatedAsyncSession):
         session = SyncSession.from_isolated_session(session)

maxframe/utils.py CHANGED Viewed

@@ -381,6 +381,11 @@ def build_temp_table_name(session_id: str, tileable_key: str) -> str:
     return f"tmp_mf_{session_id}_{tileable_key}"
+def build_temp_intermediate_table_name(session_id: str, tileable_key: str) -> str:
+    temp_table = build_temp_table_name(session_id, tileable_key)
+    return f"{temp_table}_intermediate"
 def build_session_volume_name(session_id: str) -> str:
     return f"mf_vol_{session_id}"

{maxframe-0.1.0b3.dist-info → maxframe-0.1.0b4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: maxframe
-Version: 0.1.0b3
+Version: 0.1.0b4
 Summary: MaxFrame operator-based data analyze framework
 Requires-Dist: numpy >=1.19.0
 Requires-Dist: pandas >=1.0.0