PyPI - maxframe - Versions diffs - 1.0.0rc1__cp311-cp311-win_amd64.whl → 1.0.0rc2__cp311-cp311-win_amd64.whl - Mend

maxframe 1.0.0rc1__cp311-cp311-win_amd64.whl → 1.0.0rc2__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (64)

maxframe/_utils.cp311-win_amd64.pyd +0 -0
maxframe/codegen.py +0 -4
maxframe/config/config.py +34 -2
maxframe/config/validators.py +1 -0
maxframe/conftest.py +2 -0
maxframe/core/entity/objects.py +1 -1
maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
maxframe/dataframe/__init__.py +1 -1
maxframe/dataframe/arithmetic/around.py +5 -17
maxframe/dataframe/arithmetic/core.py +15 -7
maxframe/dataframe/arithmetic/docstring.py +5 -55
maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
maxframe/dataframe/core.py +5 -5
maxframe/dataframe/datasource/date_range.py +2 -2
maxframe/dataframe/datasource/read_odps_query.py +6 -0
maxframe/dataframe/datasource/read_odps_table.py +2 -1
maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
maxframe/dataframe/groupby/cum.py +0 -1
maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
maxframe/dataframe/indexing/rename.py +3 -37
maxframe/dataframe/indexing/sample.py +0 -1
maxframe/dataframe/indexing/set_index.py +68 -1
maxframe/dataframe/merge/merge.py +236 -2
maxframe/dataframe/merge/tests/test_merge.py +123 -0
maxframe/dataframe/misc/apply.py +3 -10
maxframe/dataframe/misc/case_when.py +1 -1
maxframe/dataframe/misc/describe.py +2 -2
maxframe/dataframe/misc/drop_duplicates.py +4 -25
maxframe/dataframe/misc/eval.py +4 -0
maxframe/dataframe/misc/pct_change.py +1 -83
maxframe/dataframe/misc/transform.py +1 -30
maxframe/dataframe/misc/value_counts.py +4 -17
maxframe/dataframe/missing/dropna.py +1 -1
maxframe/dataframe/missing/fillna.py +5 -5
maxframe/dataframe/sort/sort_values.py +1 -11
maxframe/dataframe/statistics/quantile.py +5 -17
maxframe/dataframe/utils.py +4 -7
maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
maxframe/learn/contrib/xgboost/predict.py +2 -2
maxframe/learn/contrib/xgboost/train.py +2 -2
maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
maxframe/odpsio/__init__.py +1 -1
maxframe/odpsio/arrow.py +8 -4
maxframe/odpsio/schema.py +10 -7
maxframe/odpsio/tableio.py +388 -14
maxframe/odpsio/tests/test_schema.py +16 -15
maxframe/odpsio/tests/test_tableio.py +48 -21
maxframe/protocol.py +40 -2
maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
maxframe/serialization/serializables/core.py +48 -9
maxframe/tensor/__init__.py +59 -0
maxframe/tensor/base/unique.py +2 -2
maxframe/tensor/statistics/quantile.py +2 -2
maxframe/tests/utils.py +11 -2
maxframe/utils.py +17 -9
{maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +74 -1
{maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +64 -64
{maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
maxframe_client/fetcher.py +38 -27
maxframe_client/session/odps.py +5 -5
maxframe_client/tests/test_fetcher.py +21 -3
maxframe_client/tests/test_session.py +13 -2
{maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0

maxframe/dataframe/misc/pct_change.py CHANGED Viewed

@@ -18,6 +18,7 @@ from ..utils import validate_axis
 def pct_change(
     df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
     """
     Percentage change between the current and a prior element.
@@ -50,89 +51,6 @@ def pct_change(
     DataFrame.diff : Compute the difference of two elements in a DataFrame.
     Series.shift : Shift the index by some number of periods.
     DataFrame.shift : Shift the index by some number of periods.
-    Examples
-    --------
-    **Series**
-    >>> import maxframe.dataframe as md
-    >>> s = md.Series([90, 91, 85])
-    >>> s.execute()
-    0    90
-    1    91
-    2    85
-    dtype: int64
-    >>> s.pct_change().execute()
-    0         NaN
-    1    0.011111
-    2   -0.065934
-    dtype: float64
-    >>> s.pct_change(periods=2).execute()
-    0         NaN
-    1         NaN
-    2   -0.055556
-    dtype: float64
-    See the percentage change in a Series where filling NAs with last
-    valid observation forward to next valid.
-    >>> s = md.Series([90, 91, None, 85])
-    >>> s.execute()
-    0    90.0
-    1    91.0
-    2     NaN
-    3    85.0
-    dtype: float64
-    >>> s.pct_change(fill_method='ffill').execute()
-    0         NaN
-    1    0.011111
-    2    0.000000
-    3   -0.065934
-    dtype: float64
-    **DataFrame**
-    Percentage change in French franc, Deutsche Mark, and Italian lira from
-    1980-01-01 to 1980-03-01.
-    >>> df = md.DataFrame({
-    ...     'FR': [4.0405, 4.0963, 4.3149],
-    ...     'GR': [1.7246, 1.7482, 1.8519],
-    ...     'IT': [804.74, 810.01, 860.13]},
-    ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
-    >>> df.execute()
-                    FR      GR      IT
-    1980-01-01  4.0405  1.7246  804.74
-    1980-02-01  4.0963  1.7482  810.01
-    1980-03-01  4.3149  1.8519  860.13
-    >>> df.pct_change().execute()
-                      FR        GR        IT
-    1980-01-01       NaN       NaN       NaN
-    1980-02-01  0.013810  0.013684  0.006549
-    1980-03-01  0.053365  0.059318  0.061876
-    Percentage of change in GOOG and APPL stock volume. Shows computing
-    the percentage change between columns.
-    >>> df = md.DataFrame({
-    ...     '2016': [1769950, 30586265],
-    ...     '2015': [1500923, 40912316],
-    ...     '2014': [1371819, 41403351]},
-    ...     index=['GOOG', 'APPL'])
-    >>> df.execute()
-              2016      2015      2014
-    GOOG   1769950   1500923   1371819
-    APPL  30586265  40912316  41403351
-    >>> df.pct_change(axis='columns').execute()
-          2016      2015      2014
-    GOOG   NaN -0.151997 -0.086016
-    APPL   NaN  0.337604  0.012002
     """
     axis = validate_axis(kwargs.pop("axis", 0))

maxframe/dataframe/misc/transform.py CHANGED Viewed

@@ -228,21 +228,6 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     0  1  2
     1  2  3
     2  3  4
-    Even though the resulting DataFrame must have the same length as the
-    input DataFrame, it is possible to provide several input functions:
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-           sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,
@@ -265,6 +250,7 @@ def series_transform(
     dtype=None,
     **kwargs
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/10
     """
     Call ``func`` on self producing a Series with transformed values.
@@ -332,21 +318,6 @@ def series_transform(
     0  1  2
     1  2  3
     2  3  4
-    Even though the resulting Series must have the same length as the
-    input Series, it is possible to provide several input functions:
-    >>> s = md.Series(range(3))
-    >>> s.execute()
-    0    0
-    1    1
-    2    2
-    dtype: int64
-    >>> s.transform([mt.sqrt, mt.exp]).execute()
-       sqrt        exp
-    0  0.000000   1.000000
-    1  1.000000   2.718282
-    2  1.414214   7.389056
     """
     op = TransformOperator(
         func=func,

maxframe/dataframe/misc/value_counts.py CHANGED Viewed

@@ -85,6 +85,7 @@ def value_counts(
     dropna=True,
     method="auto",
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/33
     """
     Return a Series containing counts of unique values.
@@ -125,9 +126,8 @@ def value_counts(
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> import maxframe.tensor as mt
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> import numpy as np
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts().execute()
     3.0    2
     4.0    1
@@ -138,7 +138,7 @@ def value_counts(
     With `normalize` set to `True`, returns the relative frequency by
     dividing all values by the sum of values.
-    >>> s = md.Series([3, 1, 2, 3, 4, mt.nan])
+    >>> s = md.Series([3, 1, 2, 3, 4, np.nan])
     >>> s.value_counts(normalize=True).execute()
     3.0    0.4
     4.0    0.2
@@ -146,19 +146,6 @@ def value_counts(
     1.0    0.2
     dtype: float64
-    **bins**
-    Bins can be useful for going from a continuous variable to a
-    categorical variable; instead of counting unique
-    apparitions of values, divide the index in the specified
-    number of half-open bins.
-    >>> s.value_counts(bins=3).execute()
-    (2.0, 3.0]      2
-    (0.996, 2.0]    2
-    (3.0, 4.0]      1
-    dtype: int64
     **dropna**
     With `dropna` set to `False` we can also see NaN index values.

maxframe/dataframe/missing/dropna.py CHANGED Viewed

@@ -234,7 +234,7 @@ def series_dropna(series, axis=0, inplace=False, how=None):
     Empty strings are not considered NA values. ``None`` is considered an
     NA value.
-    >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay'])
+    >>> ser = md.Series([np.NaN, '2', md.NaT, '', None, 'I stay'])
     >>> ser.execute()
     0       NaN
     1         2

maxframe/dataframe/missing/fillna.py CHANGED Viewed

@@ -132,11 +132,11 @@ def fillna(
     --------
     >>> import maxframe.tensor as mt
     >>> import maxframe.dataframe as md
-    >>> df = md.DataFrame([[mt.nan, 2, mt.nan, 0],
-    ...                    [3, 4, mt.nan, 1],
-    ...                    [mt.nan, mt.nan, mt.nan, 5],
-    ...                    [mt.nan, 3, mt.nan, 4]],
-    ...                   columns=list('ABCD'))
+    >>> df = md.DataFrame([[np.nan, 2, np.nan, 0],
+                           [3, 4, np.nan, 1],
+                           [np.nan, np.nan, np.nan, 5],
+                           [np.nan, 3, np.nan, 4]],
+                          columns=list('ABCD'))
     >>> df.execute()
          A    B   C  D
     0  NaN  2.0 NaN  0

maxframe/dataframe/sort/sort_values.py CHANGED Viewed

@@ -67,6 +67,7 @@ def dataframe_sort_values(
     parallel_kind="PSRS",
     psrs_kinds=None,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/15
     """
     Sort by the values along either axis.
@@ -152,17 +153,6 @@ def dataframe_sort_values(
     0   A    2    0
     1   A    1    1
     3   NaN  8    4
-    Putting NAs first
-    >>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
-        col1 col2 col3
-    3   NaN  8    4
-    4   D    7    2
-    5   C    4    3
-    2   B    9    9
-    0   A    2    0
-    1   A    1    1
     """
     if na_position not in ["last", "first"]:  # pragma: no cover

maxframe/dataframe/statistics/quantile.py CHANGED Viewed

@@ -14,8 +14,9 @@
 import numpy as np
 import pandas as pd
+from pandas.core.dtypes.cast import find_common_type
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...core import ENTITY_TYPE
 from ...serialization.serializables import (
     AnyField,
@@ -32,11 +33,11 @@ from ...tensor.datasource import tensor as astensor
 from ...tensor.statistics.quantile import quantile as tensor_quantile
 from ..core import DATAFRAME_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_empty_df, find_common_type, parse_index, validate_axis
+from ..utils import build_empty_df, parse_index, validate_axis
 class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.QUANTILE
+    _op_type_ = opcodes.QUANTILE
     input = KeyField("input", default=None)
     q = AnyField("q", default=None)
@@ -259,6 +260,7 @@ def quantile_series(series, q=0.5, interpolation="linear"):
 def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
+    # FIXME: Timedelta not support. Data invalid: ODPS-0010000:InvalidArgument:duration[ns] is not equal to string
     """
     Return values at the given quantile over requested axis.
@@ -309,20 +311,6 @@ def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="line
            a     b
     0.1  1.3   3.7
     0.5  2.5  55.0
-    Specifying `numeric_only=False` will also compute the quantile of
-    datetime and timedelta data.
-    >>> df = md.DataFrame({'A': [1, 2],
-    ...                    'B': [md.Timestamp('2010'),
-    ...                          md.Timestamp('2011')],
-    ...                    'C': [md.Timedelta('1 days'),
-    ...                          md.Timedelta('2 days')]})
-    >>> df.quantile(0.5, numeric_only=False).execute()
-    A                    1.5
-    B    2010-07-02 12:00:00
-    C        1 days 12:00:00
-    Name: 0.5, dtype: object
     """
     if isinstance(q, ENTITY_TYPE):
         q = astensor(q)

maxframe/dataframe/utils.py CHANGED Viewed

@@ -26,7 +26,6 @@ import numpy as np
 import pandas as pd
 from pandas.api.extensions import ExtensionDtype
 from pandas.api.types import is_string_dtype
-from pandas.core.dtypes.cast import find_common_type
 from pandas.core.dtypes.inference import is_dict_like, is_list_like
 from ..core import Entity, ExecutableTuple
@@ -477,11 +476,11 @@ def build_df(df_obj, fill_value=1, size=1, ensure_string=False):
     else:
         fill_values = fill_value
-    from .core import SERIES_TYPE
+    from .core import INDEX_TYPE, SERIES_TYPE
     dtypes = (
         pd.Series([df_obj.dtype], index=[df_obj.name])
-        if isinstance(df_obj, SERIES_TYPE)
+        if isinstance(df_obj, (INDEX_TYPE, SERIES_TYPE))
         else df_obj.dtypes
     )
     for size, fill_value in zip(sizes, fill_values):
@@ -593,7 +592,7 @@ def build_series(
     return ret_series
-def infer_index_value(left_index_value, right_index_value):
+def infer_index_value(left_index_value, right_index_value, level=None):
     from .core import IndexValue
     if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance(
@@ -616,9 +615,7 @@ def infer_index_value(left_index_value, right_index_value):
     left_index = left_index_value.to_pandas()
     right_index = right_index_value.to_pandas()
-    out_index = pd.Index(
-        [], dtype=find_common_type([left_index.dtype, right_index.dtype])
-    )
+    out_index = left_index.join(right_index, level=level)[:0]
     return parse_index(out_index, left_index_value, right_index_value)

maxframe/learn/contrib/xgboost/dmatrix.py CHANGED Viewed

@@ -13,7 +13,7 @@
 # limitations under the License.
-from .... import opcodes as OperandDef
+from .... import opcodes
 from ....core.entity.output_types import get_output_types
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
@@ -27,7 +27,7 @@ from ...utils import convert_to_tensor_or_dataframe
 class ToDMatrix(Operator, TileableOperatorMixin):
-    _op_type_ = OperandDef.TO_DMATRIX
+    _op_type_ = opcodes.TO_DMATRIX
     data = KeyField("data", default=None)
     label = KeyField("label", default=None)

maxframe/learn/contrib/xgboost/predict.py CHANGED Viewed

@@ -17,7 +17,7 @@ import pickle
 import numpy as np
 import pandas as pd
-from .... import opcodes as OperandDef
+from .... import opcodes
 from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
@@ -28,7 +28,7 @@ from .dmatrix import check_data
 class XGBPredict(Operator, TileableOperatorMixin):
-    _op_type_ = OperandDef.XGBOOST_PREDICT
+    _op_type_ = opcodes.XGBOOST_PREDICT
     output_dtype = np.dtype(np.float32)
     data = KeyField("data", default=None)

maxframe/learn/contrib/xgboost/train.py CHANGED Viewed

@@ -15,7 +15,7 @@
 import logging
 from collections import OrderedDict
-from .... import opcodes as OperandDef
+from .... import opcodes
 from ....core import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
@@ -41,7 +41,7 @@ def _on_serialize_evals(evals_val):
 class XGBTrain(Operator, TileableOperatorMixin):
-    _op_type_ = OperandDef.XGBOOST_TRAIN
+    _op_type_ = opcodes.XGBOOST_TRAIN
     params = DictField("params", key_type=FieldTypes.string, default=None)
     dtrain = KeyField("dtrain", default=None)

maxframe/lib/mmh3.cp311-win_amd64.pyd CHANGED Viewed

Binary file

maxframe/odpsio/__init__.py CHANGED Viewed

@@ -18,4 +18,4 @@ from .schema import (
     odps_schema_to_pandas_dtypes,
     pandas_to_odps_schema,
 )
-from .tableio import HaloTableIO
+from .tableio import HaloTableIO, ODPSTableIO

maxframe/odpsio/arrow.py CHANGED Viewed

@@ -45,9 +45,13 @@ def _rebuild_dataframe(
 def _rebuild_index(df: pd.DataFrame, table_meta: DataFrameTableMeta) -> pd.Index:
     if df.shape[1] > 1:
-        df.columns = pd.Index(table_meta.pd_index_level_names)
-        return pd.MultiIndex.from_frame(df)
-    return pd.Index(df.iloc[:, 0], name=table_meta.pd_index_level_names[0])
+        idx = pd.MultiIndex.from_frame(df)
+        idx.names = table_meta.pd_index_level_names
+    else:
+        # make sure even if None names are updated properly
+        idx = pd.Index(df.iloc[:, 0])
+        idx.name = table_meta.pd_index_level_names[0]
+    return idx
 def arrow_to_pandas(
@@ -75,7 +79,7 @@ def pandas_to_arrow(
         df.columns = pd.Index(table_meta.table_column_names)
         if not ignore_index:
             df = df.rename_axis(table_meta.table_index_column_names).reset_index()
-    elif ignore_index:
+    elif ignore_index and table_meta.type != OutputType.index:
         df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]

maxframe/odpsio/schema.py CHANGED Viewed

@@ -126,10 +126,15 @@ def odps_type_to_arrow_type(
             ]
             col_type = pa.struct(fields)
         elif isinstance(odps_type, odps_types.Decimal):
-            col_type = pa.decimal128(
-                odps_type.precision or odps_types.Decimal._max_precision,
-                odps_type.scale or odps_types.Decimal._max_scale,
-            )
+            if odps_type.name == "decimal":
+                # legacy decimal data without precision or scale
+                # precision data from internal compat mode
+                col_type = pa.decimal128(38, 18)
+            else:
+                col_type = pa.decimal128(
+                    odps_type.precision or odps_types.Decimal._max_precision,
+                    odps_type.scale or odps_types.Decimal._max_scale,
+                )
         elif isinstance(odps_type, (odps_types.Varchar, odps_types.Char)):
             col_type = pa.string()
         else:
@@ -289,8 +294,6 @@ def build_dataframe_table_meta(
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
-    assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
     if obj_type == OutputType.scalar:
         pd_dtypes = pd.Series([])
         column_index_names = []
@@ -346,7 +349,7 @@ def build_dataframe_table_meta(
     else:
         index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
-    if ignore_index:
+    if ignore_index and obj_type != OutputType.index:
         table_index_column_names = []
         pd_index_dtypes = pd.Series([], index=[])
     else: