maxframe 0.1.0b5-cp311-cp311-win32.whl → 1.0.0rc2-cp311-cp311-win32.whl
This diff covers the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
Potentially problematic release.
- maxframe/_utils.cp311-win32.pyd +0 -0
- maxframe/codegen.py +6 -2
- maxframe/config/config.py +38 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/__init__.py +0 -3
- maxframe/core/entity/__init__.py +1 -8
- maxframe/core/entity/objects.py +3 -45
- maxframe/core/graph/core.cp311-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +6 -0
- maxframe/dataframe/datasource/read_odps_table.py +2 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/tests/__init__.py +13 -0
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +21 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +23 -0
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
- maxframe/learn/contrib/xgboost/predict.py +2 -2
- maxframe/learn/contrib/xgboost/train.py +2 -2
- maxframe/lib/mmh3.cp311-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +8 -4
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +388 -14
- maxframe/odpsio/tests/test_schema.py +16 -15
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/protocol.py +148 -12
- maxframe/serialization/core.cp311-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +54 -25
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +7 -2
- maxframe/serialization/serializables/core.py +158 -12
- maxframe/serialization/serializables/tests/test_serializable.py +46 -4
- maxframe/tensor/__init__.py +59 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
- maxframe/tensor/base/atleast_1d.py +1 -1
- maxframe/tensor/base/unique.py +3 -3
- maxframe/tensor/reduction/count_nonzero.py +1 -1
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +11 -2
- maxframe/utils.py +24 -13
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/fetcher.py +38 -27
- maxframe_client/session/odps.py +50 -10
- maxframe_client/session/task.py +41 -20
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +49 -2
- maxframe_client/clients/spe.py +0 -104
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp311-win32.pyd
CHANGED
Binary file
maxframe/codegen.py
CHANGED
@@ -86,6 +86,8 @@ class AbstractUDF(Serializable):
 
 
 class UserCodeMixin:
+    __slots__ = ()
+
     @classmethod
     def obj_to_python_expr(cls, obj: Any = None) -> str:
         """
@@ -344,6 +346,8 @@ def register_engine_codegen(type_: Type["BigDagCodeGenerator"]):
 BUILTIN_ENGINE_SPE = "SPE"
 BUILTIN_ENGINE_MCSQL = "MCSQL"
 
+FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
+
 
 class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     _context: BigDagCodeContext
@@ -516,12 +520,12 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
 
     def register_udfs(self, odps_ctx: "ODPSSessionContext"):
         for udf in self._context.get_udfs():
-            logger.info("[Session
+            logger.info("[Session=%s] Registering UDF %s", self._session_id, udf.name)
             udf.register(odps_ctx, True)
 
     def unregister_udfs(self, odps_ctx: "ODPSSessionContext"):
         for udf in self._context.get_udfs():
-            logger.info("[Session
+            logger.info("[Session=%s] Unregistering UDF %s", self._session_id, udf.name)
             udf.unregister(odps_ctx)
 
     def get_udfs(self) -> List[AbstractUDF]:
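The logging calls above switch to lazy %-style formatting, passing the session id and UDF name as arguments rather than pre-building the message. A minimal sketch of the pattern (logger name and values are illustrative, not taken from the package):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("maxframe.codegen")

session_id, udf_name = "sess_0001", "my_udf"  # illustrative values
# Arguments are only interpolated when the INFO record is actually emitted.
logger.info("[Session=%s] Registering UDF %s", session_id, udf_name)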
maxframe/config/config.py
CHANGED
@@ -19,6 +19,15 @@ import warnings
 from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
+from odps.lib import tzlocal
+
+try:
+    from zoneinfo import available_timezones
+except ImportError:
+    from pytz import all_timezones
+
+    available_timezones = lambda: all_timezones
+
 from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
@@ -28,6 +37,7 @@ from .validators import (
     is_dict,
     is_in,
     is_integer,
+    is_non_negative_integer,
     is_null,
     is_numeric,
     is_string,
@@ -37,10 +47,12 @@ _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and migh
 _DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
 _DEFAULT_MAX_IDLE_SECONDS = 3600
 _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
+_DEFAULT_SPE_FAILURE_RETRY_TIMES = 5
 _DEFAULT_UPLOAD_BATCH_SIZE = 4096
 _DEFAULT_TEMP_LIFECYCLE = 1
 _DEFAULT_TASK_START_TIMEOUT = 60
-
+_DEFAULT_TASK_RESTART_TIMEOUT = 300
+_DEFAULT_LOGVIEW_HOURS = 24 * 30
 
 
 class OptionError(Exception):
@@ -296,19 +308,37 @@ class Config:
         return {k: v for k, v in res.items() if k in self._remote_options}
 
 
+def _get_legal_local_tz_name() -> Optional[str]:
+    """Sometimes we may get illegal tz name from tzlocal.get_localzone()"""
+    tz_name = str(tzlocal.get_localzone())
+    if tz_name not in available_timezones():
+        return None
+    return tz_name
+
+
 default_options = Config()
 default_options.register_option(
     "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
 )
+default_options.register_option("use_common_table", False, validator=is_bool)
 default_options.register_option(
     "python_tag", get_python_tag(), validator=is_string, remote=True
 )
+default_options.register_option(
+    "local_timezone",
+    _get_legal_local_tz_name(),
+    validator=any_validator(is_null, is_in(set(available_timezones()))),
+    remote=True,
+)
 default_options.register_option(
     "session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
 )
 default_options.register_option(
     "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
 )
+default_options.register_option(
+    "client.task_restart_timeout", _DEFAULT_TASK_RESTART_TIMEOUT, validator=is_integer
+)
 default_options.register_option("sql.enable_mcqa", True, validator=is_bool, remote=True)
 default_options.register_option(
     "sql.generate_comments", True, validator=is_bool, remote=True
@@ -374,7 +404,13 @@ default_options.register_option(
 default_options.register_option(
     "spe.operation_timeout_seconds",
     _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS,
-    validator=
+    validator=is_non_negative_integer,
+    remote=True,
+)
+default_options.register_option(
+    "spe.failure_retry_times",
+    _DEFAULT_SPE_FAILURE_RETRY_TIMES,
+    validator=is_non_negative_integer,
     remote=True,
 )
 
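The new `local_timezone` option defaults to `_get_legal_local_tz_name()`, which discards timezone names reported by `tzlocal` that the IANA database does not recognize. A standalone sketch of that check, reusing the same zoneinfo/pytz fallback as the diff (the helper below takes the name as a parameter purely for illustration):

try:
    from zoneinfo import available_timezones
except ImportError:  # older Pythons fall back to pytz
    from pytz import all_timezones

    available_timezones = lambda: all_timezones

def legal_tz_name(tz_name):
    """Return tz_name only if it is a recognized IANA zone, otherwise None."""
    return tz_name if tz_name in available_timezones() else None

print(legal_tz_name("Asia/Shanghai"))        # Asia/Shanghai
print(legal_tz_name("Local Standard Time"))  # None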
maxframe/config/validators.py
CHANGED
@@ -40,6 +40,7 @@ is_numeric = lambda x: isinstance(x, (int, float))
 is_string = lambda x: isinstance(x, str)
 is_dict = lambda x: isinstance(x, dict)
 is_positive_integer = lambda x: is_integer(x) and x > 0
+is_non_negative_integer = lambda x: is_integer(x) and x >= 0
 
 
 def is_in(vals):
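A quick check of how the new validator differs from the existing `is_positive_integer` (the `is_integer` stand-in below is simplified; its real definition is not shown in this diff):

is_integer = lambda x: isinstance(x, int)
is_positive_integer = lambda x: is_integer(x) and x > 0
is_non_negative_integer = lambda x: is_integer(x) and x >= 0

# Zero is now a legal value, e.g. for the spe.* options registered in config.py above.
assert is_non_negative_integer(0) and not is_positive_integer(0)
assert not is_non_negative_integer(-1)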
maxframe/conftest.py
CHANGED
@@ -87,6 +87,7 @@ def oss_config():
         oss_secret_access_key = config.get("oss", "secret_access_key")
         oss_bucket_name = config.get("oss", "bucket_name")
         oss_endpoint = config.get("oss", "endpoint")
+        oss_rolearn = config.get("oss", "rolearn")
 
         config.oss_config = (
             oss_access_id,
@@ -99,6 +100,7 @@ def oss_config():
 
         auth = oss2.Auth(oss_access_id, oss_secret_access_key)
         config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
+        config.oss_rolearn = oss_rolearn
         return config
     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError):
         return None
maxframe/core/__init__.py
CHANGED
@@ -19,7 +19,6 @@ from .entity import (
     CHUNK_TYPE,
     ENTITY_TYPE,
     FUSE_CHUNK_TYPE,
-    OBJECT_CHUNK_TYPE,
     OBJECT_TYPE,
     TILEABLE_TYPE,
     Chunk,
@@ -33,8 +32,6 @@ from .entity import (
     HasShapeTileableData,
     NotSupportTile,
     Object,
-    ObjectChunk,
-    ObjectChunkData,
     ObjectData,
     OutputType,
     Tileable,
maxframe/core/entity/__init__.py
CHANGED
@@ -16,14 +16,7 @@ from .chunks import CHUNK_TYPE, Chunk, ChunkData
 from .core import ENTITY_TYPE, Entity, EntityData
 from .executable import ExecutableTuple, _ExecuteAndFetchMixin
 from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData
-from .objects import (
-    OBJECT_CHUNK_TYPE,
-    OBJECT_TYPE,
-    Object,
-    ObjectChunk,
-    ObjectChunkData,
-    ObjectData,
-)
+from .objects import OBJECT_TYPE, Object, ObjectData
 from .output_types import (
     OutputType,
     get_fetch_class,
maxframe/core/entity/objects.py
CHANGED
@@ -14,58 +14,17 @@
 
 from typing import Any, Dict
 
-from ...serialization.serializables import FieldTypes, ListField
-from ...utils import skip_na_call
-from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
 from .tileables import TileableData
 
 
-class ObjectChunkData(ChunkData):
-    # chunk whose data could be any serializable
-    __slots__ = ()
-    type_name = "Object"
-
-    def __init__(self, op=None, index=None, **kw):
-        super().__init__(_op=op, _index=index, **kw)
-
-    @property
-    def params(self) -> Dict[str, Any]:
-        # params return the properties which useful to rebuild a new chunk
-        return {
-            "index": self.index,
-        }
-
-    @params.setter
-    def params(self, new_params: Dict[str, Any]):
-        params = new_params.copy()
-        params.pop("index", None)  # index not needed to update
-        if params:  # pragma: no cover
-            raise TypeError(f"Unknown params: {list(params)}")
-
-    @classmethod
-    def get_params_from_data(cls, data: Any) -> Dict[str, Any]:
-        return dict()
-
-
-class ObjectChunk(Chunk):
-    __slots__ = ()
-    _allow_data_type_ = (ObjectChunkData,)
-    type_name = "Object"
-
-
 class ObjectData(TileableData, _ToObjectMixin):
     __slots__ = ()
     type_name = "Object"
-
-    #
-
-        "chunks",
-        FieldTypes.reference(ObjectChunkData),
-        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
-        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
-    )
+    # workaround for removed field since v0.1.0b5
+    # todo remove this when all versions below v1.0.0rc1 is eliminated
+    _legacy_deprecated_non_primitives = ["_chunks"]
 
     def __init__(self, op=None, nsplits=None, **kw):
         super().__init__(_op=op, _nsplits=nsplits, **kw)
@@ -97,4 +56,3 @@ class Object(Entity, _ToObjectMixin):
 
 
 OBJECT_TYPE = (Object, ObjectData)
-OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData)
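`ObjectData` drops its chunk fields entirely, but keeps `_chunks` listed in `_legacy_deprecated_non_primitives` so payloads written by pre-1.0 clients still deserialize. The real handling lives in maxframe/serialization/serializables/core.py (also changed in this release); the snippet below is only a hypothetical illustration of the idea, not the actual mechanism:

# Hypothetical sketch: ignore serialized fields the current class no longer defines.
LEGACY_FIELDS = {"_chunks"}

def strip_legacy_fields(payload: dict) -> dict:
    return {k: v for k, v in payload.items() if k not in LEGACY_FIELDS}

old_payload = {"_chunks": [], "_key": "ab12", "_nsplits": None}
print(strip_legacy_fields(old_payload))  # {'_key': 'ab12', '_nsplits': None}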
maxframe/core/graph/core.cp311-win32.pyd
CHANGED
Binary file
maxframe/core/graph/core.pyx
CHANGED
@@ -354,10 +354,10 @@ cdef class DirectedGraph:
             sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n')
             visited.add(input_chunk.key)
         if op.key not in visited:
-            sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
+            sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
             visited.add(op.key)
         sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" -> '
-                  f'"{op_name}:{op.key[:trunc_key]}"\n')
+                  f'"{op_name}:{op.key[:trunc_key]}_{id(op)}"\n')
 
         for output_chunk in (op.outputs or []):
             if output_chunk.key not in visited:
@@ -367,9 +367,9 @@ cdef class DirectedGraph:
                 sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n')
                 visited.add(output_chunk.key)
             if op.key not in visited:
-                sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operator_style}\n')
+                sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" {operator_style}\n')
                 visited.add(op.key)
-            sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> '
+            sio.write(f'"{op_name}:{op.key[:trunc_key]}_{id(op)}" -> '
                       f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"')
             if show_columns:
                 sio.write(f' [ label={get_col_names(output_chunk)} ]')
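Appending `id(op)` keeps two operators that happen to share the same (truncated) key from collapsing into a single node in the generated GraphViz DOT output. A small Python illustration of the naming scheme (the operator class and node style here are made up):

class FakeOp:
    def __init__(self, key):
        self.key = key

# Same key, distinct objects: without id(op) both would map to one DOT node.
op_a, op_b = FakeOp("9f86d081884c"), FakeOp("9f86d081884c")
trunc_key = 5
for op in (op_a, op_b):
    # Mirrors the node-name format written by DirectedGraph above.
    print(f'"FakeOp:{op.key[:trunc_key]}_{id(op)}" [shape=circle]')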
maxframe/dataframe/__init__.py
CHANGED
maxframe/dataframe/arithmetic/around.py
CHANGED
@@ -43,20 +43,20 @@ def around(df, decimals=0, *args, **kwargs):
     return op(df)
 
 
+# FIXME Series input of decimals not supported yet
 around.__frame_doc__ = """
 Round a DataFrame to a variable number of decimal places.
 
 Parameters
 ----------
-decimals : int, dict
+decimals : int, dict
     Number of decimal places to round each column to. If an int is
     given, round each column to the same number of places.
     Otherwise dict and Series round to variable numbers of places.
     Column names should be in the keys if `decimals` is a
-    dict-like
-
-
-    ignored.
+    dict-like. Any columns not included in `decimals` will be left
+    as is. Elements of `decimals` which are not columns of the
+    input will be ignored.
 *args
     Additional keywords have no effect but might be accepted for
     compatibility with numpy.
@@ -107,18 +107,6 @@ places as value
 1   0.0   1.0
 2   0.7   0.0
 3   0.2   0.0
-
-Using a Series, the number of places for specific columns can be
-specified with the column names as index and the number of
-decimal places as value
-
->>> decimals = md.Series([0, 1], index=['cats', 'dogs'])
->>> df.round(decimals).execute()
-   dogs  cats
-0   0.2   0.0
-1   0.0   1.0
-2   0.7   0.0
-3   0.2   0.0
 """
 around.__series_doc__ = """
 Round each value in a Series to the given number of decimals.
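The Series form of `decimals` is dropped from the docstring because it is not supported yet (hence the FIXME added above); the int and dict forms remain. A usage sketch consistent with the retained docstring data:

import maxframe.dataframe as md

df = md.DataFrame(
    [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)],
    columns=["dogs", "cats"],
)
# Round per column with a dict; columns missing from the dict are left as is.
print(df.round({"dogs": 1, "cats": 0}).execute())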
maxframe/dataframe/arithmetic/core.py
CHANGED
@@ -39,7 +39,7 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
         raise NotImplementedError
 
     @classmethod
-    def _calc_properties(cls, x1, x2=None, axis="columns"):
+    def _calc_properties(cls, x1, x2=None, axis="columns", level=None):
         if isinstance(x1, DATAFRAME_TYPE) and (
             x2 is None or pd.api.types.is_scalar(x2) or isinstance(x2, TENSOR_TYPE)
         ):
@@ -108,7 +108,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
                 index = copy.copy(x1.index_value)
                 index_shape = x1.shape[0]
             else:
-                index = infer_index_value(
+                index = infer_index_value(
+                    x1.index_value, x2.index_value, level=level
+                )
                 if index.key == x1.index_value.key == x2.index_value.key and (
                     not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
                 ):
@@ -141,7 +143,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
                 column_shape = len(dtypes)
             else:  # pragma: no cover
                 dtypes = x1.dtypes  # FIXME
-                columns = infer_index_value(
+                columns = infer_index_value(
+                    x1.columns_value, x2.index_value, level=level
+                )
                 column_shape = np.nan
         else:
             assert axis == "index" or axis == 0
@@ -169,7 +173,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
                     ],
                     index=x1.dtypes.index,
                 )
-                index = infer_index_value(
+                index = infer_index_value(
+                    x1.index_value, x2.index_value, level=level
+                )
                 index_shape = np.nan
         return {
             "shape": (index_shape, column_shape),
@@ -187,7 +193,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
             index = copy.copy(x1.index_value)
             index_shape = x1.shape[0]
         else:
-            index = infer_index_value(
+            index = infer_index_value(
+                x1.index_value, x2.index_value, level=level
+            )
             if index.key == x1.index_value.key == x2.index_value.key and (
                 not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
             ):
@@ -237,14 +245,14 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
         self._check_inputs(x1, x2)
         if isinstance(x1, DATAFRAME_TYPE) or isinstance(x2, DATAFRAME_TYPE):
             df1, df2 = (x1, x2) if isinstance(x1, DATAFRAME_TYPE) else (x2, x1)
-            kw = self._calc_properties(df1, df2, axis=self.axis)
+            kw = self._calc_properties(df1, df2, axis=self.axis, level=self.level)
             if not pd.api.types.is_scalar(df2):
                 return self.new_dataframe([x1, x2], **kw)
             else:
                 return self.new_dataframe([df1], **kw)
         if isinstance(x1, SERIES_TYPE) or isinstance(x2, SERIES_TYPE):
             s1, s2 = (x1, x2) if isinstance(x1, SERIES_TYPE) else (x2, x1)
-            kw = self._calc_properties(s1, s2)
+            kw = self._calc_properties(s1, s2, level=self.level)
             if not pd.api.types.is_scalar(s2):
                 return self.new_series([x1, x2], **kw)
             else:
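With `level` now forwarded into `infer_index_value`, aligning a DataFrame against a Series on one level of a MultiIndex can be expressed at graph-construction time (full support may still be limited; note the FIXME issue links added in docstring.py below). A sketch mirroring the new test, with arbitrary data:

import numpy as np
import pandas as pd
import maxframe.dataframe as md

raw = pd.DataFrame(
    np.random.rand(6, 2),
    index=pd.MultiIndex.from_arrays([list("AAABBB"), [1, 2, 3, 1, 2, 3]]),
    columns=["x", "y"],
)
df = md.DataFrame(raw)
s = md.Series(pd.Series([1.0, 2.0, 3.0], index=[1, 2, 3]))

# Align on the second index level; the row count stays unknown (NaN) until executed.
out = df.add(s, level=1, axis=0)
print(out.shape)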
maxframe/dataframe/arithmetic/docstring.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/17
 _flex_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Equivalent to ``{equiv}``, but with support to substitute a fill_value
@@ -127,44 +128,15 @@ circle 0
 triangle 3
 rectangle 4
 
->>> (df * other).execute()
-           angles  degrees
-circle          0      NaN
-triangle        9      NaN
-rectangle      16      NaN
-
 >>> df.mul(other, fill_value=0).execute()
            angles  degrees
 circle          0      0.0
 triangle        9      0.0
 rectangle      16      0.0
 
-Divide by a MultiIndex by level.
-
->>> df_multindex = md.DataFrame({{'angles': [0, 3, 4, 4, 5, 6],
-...                              'degrees': [360, 180, 360, 360, 540, 720]}},
-...                             index=[['A', 'A', 'A', 'B', 'B', 'B'],
-...                                    ['circle', 'triangle', 'rectangle',
-...                                     'square', 'pentagon', 'hexagon']])
->>> df_multindex.execute()
-             angles  degrees
-A circle          0      360
-  triangle        3      180
-  rectangle       4      360
-B square          4      360
-  pentagon        5      540
-  hexagon         6      720
-
->>> df.div(df_multindex, level=1, fill_value=0).execute()
-             angles  degrees
-A circle        NaN      1.0
-  triangle      1.0      1.0
-  rectangle     1.0      1.0
-B square        0.0      0.0
-  pentagon      0.0      0.0
-  hexagon       0.0      0.0
 """
 
+# FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/28
 _flex_doc_SERIES = """
 Return {desc} of series and other, element-wise (binary operator `{op_name}`).
 
@@ -213,6 +185,7 @@ e NaN
 dtype: float64
 """
 
+# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
 _flex_comp_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
@@ -257,7 +230,8 @@ Mismatched indices will be unioned together.
 
 Examples
 --------
->>>
+>>> import maxframe.dataframe as md
+>>> df = md.DataFrame({{'cost': [250, 150, 100],
 ...                     'revenue': [100, 250, 300]}},
 ...                    index=['A', 'B', 'C'])
 >>> df.execute()
@@ -332,30 +306,6 @@ A False False
 B  False    False
 C  False     True
 D  False    False
-
-Compare to a MultiIndex by level.
-
->>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
-...                               'revenue': [100, 250, 300, 200, 175, 225]}},
-...                              index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
-...                                     ['A', 'B', 'C', 'A', 'B', 'C']])
->>> df_multindex.execute()
-       cost  revenue
-Q1 A    250      100
-   B    150      250
-   C    100      300
-Q2 A    150      200
-   B    300      175
-   C    220      225
-
->>> df.le(df_multindex, level=1).execute()
-       cost  revenue
-Q1 A   True     True
-   B   True     True
-   C   True     True
-Q2 A  False     True
-   B   True    False
-   C   True    False
 """
 
 
maxframe/dataframe/arithmetic/tests/test_arithmetic.py
CHANGED
@@ -239,6 +239,28 @@ def test_dataframe_and_series_with_shuffle(func_name, func_opts):
     assert df2.columns_value.key != df1.columns_value.key
 
 
+@pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
+def test_dataframe_and_series_with_multiindex(func_name, func_opts):
+    data1 = pd.DataFrame(
+        np.random.rand(10, 10),
+        index=pd.MultiIndex.from_arrays(
+            [list("AAAAABBBBB"), [4, 9, 3, 2, 1, 5, 8, 6, 7, 10]]
+        ),
+        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
+    )
+    data1 = to_boolean_if_needed(func_opts.func_name, data1)
+    df1 = from_pandas(data1, chunk_size=5)
+    s1 = from_pandas_series(data1[10].reset_index(level=0, drop=True), chunk_size=6)
+
+    df2 = getattr(df1, func_opts.func_name)(s1, level=1, axis=0)
+
+    # test df2's index and columns
+    assert df2.shape == (np.nan, df1.shape[1])
+    assert df2.index_value.key != df1.index_value.key
+    assert df2.index_value.names == df1.index_value.names
+    assert df2.columns_value.key == df1.columns_value.key
+
+
 @pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
 def test_series_and_series_with_align_map(func_name, func_opts):
     data1 = pd.DataFrame(
maxframe/dataframe/core.py
CHANGED
@@ -1086,11 +1086,11 @@ class Series(HasShapeTileable, _ToPandasMixin):
         --------
         >>> import maxframe.dataframe as md
         >>> s = md.Series({'a': 1, 'b': 2, 'c': 3})
-        >>> s.ndim
+        >>> s.ndim
         1
 
         >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
-        >>> df.ndim
+        >>> df.ndim
         2
         """
         return super().ndim
@@ -1520,7 +1520,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
         self._columns_value = parse_index(dtypes.index, store_data=True)
         self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
         new_shape = list(self._shape)
-        new_shape[
+        new_shape[-1] = len(dtypes)
         self._shape = tuple(new_shape)
 
     @property
@@ -1761,11 +1761,11 @@ class DataFrame(HasShapeTileable, _ToPandasMixin):
         --------
         >>> import maxframe.dataframe as md
         >>> s = md.Series({'a': 1, 'b': 2, 'c': 3})
-        >>> s.ndim
+        >>> s.ndim
         1
 
         >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
-        >>> df.ndim
+        >>> df.ndim
         2
         """
         return super().ndim
maxframe/dataframe/datasource/date_range.py
CHANGED
@@ -22,7 +22,7 @@ from pandas._libs.tslibs import timezones
 from pandas.tseries.frequencies import to_offset
 from pandas.tseries.offsets import Tick
 
-from ... import opcodes
+from ... import opcodes
 from ...core import OutputType
 from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField
 from ...utils import no_default, pd_release_version
@@ -117,7 +117,7 @@ def generate_range_count(
 
 
 class DataFrameDateRange(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ =
+    _op_type_ = opcodes.DATE_RANGE
 
     start = AnyField("start")
     end = AnyField("end")
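`DataFrameDateRange` now carries an explicit opcode, `opcodes.DATE_RANGE`. For context, a minimal usage sketch of the operator's public entry point, assuming the pandas-compatible `date_range` signature is exposed on `maxframe.dataframe`:

import maxframe.dataframe as md

# Lazily builds a three-day DatetimeIndex; .execute() materializes it.
idx = md.date_range("2024-01-01", periods=3, freq="D")
print(idx.execute())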
maxframe/dataframe/datasource/read_odps_query.py
CHANGED
@@ -47,6 +47,7 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     re.MULTILINE,
 )
 _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")
 
 
 @dataclasses.dataclass
@@ -272,6 +273,11 @@ def read_odps_query(
     explain_str = list(inst.get_task_results().values())[0]
 
     odps_schema = _parse_explained_schema(explain_str)
+
+    for col in odps_schema.columns:
+        if _ANONYMOUS_COL_REGEX.match(col.name) and col.name not in query:
+            raise ValueError("Need to specify names for all columns in SELECT clause.")
+
     dtypes = odps_schema_to_pandas_dtypes(odps_schema)
 
     if not index_col:
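The new check rejects result schemas that still contain compiler-generated column names (`_c0`, `_c1`, ...) not present in the query text, which usually means an expression in the SELECT list has no alias. A standalone re-creation of the check (the helper function is only for illustration):

import re

_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")

def check_aliases(col_names, query):
    for name in col_names:
        if _ANONYMOUS_COL_REGEX.match(name) and name not in query:
            raise ValueError("Need to specify names for all columns in SELECT clause.")

# "col2 + col3" compiles to an anonymous column such as _c1, so this raises:
try:
    check_aliases(["col1", "_c1"], "SELECT col1, col2 + col3 FROM my_table")
except ValueError as exc:
    print(exc)

# An explicit alias gives the column a real name, so the check passes.
check_aliases(["col1", "col_sum"], "SELECT col1, col2 + col3 AS col_sum FROM my_table")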
maxframe/dataframe/datasource/read_odps_table.py
CHANGED
@@ -119,9 +119,10 @@ class DataFrameReadODPSTable(
         return self.new_tileable(
             [],
             None,
-            shape=shape,
+            shape=shape[:1],
             name=getattr(index_value, "name", None),
             names=getattr(index_value, "names", None),
+            dtype=self.index_dtypes.iloc[0],
             index_value=index_value,
             chunk_bytes=chunk_bytes,
             chunk_size=chunk_size,
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED
@@ -21,6 +21,7 @@ import pytest
 from odps import ODPS
 
 from .... import tensor as mt
+from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
@@ -295,6 +296,15 @@ def test_from_odps_table():
         ),
     )
 
+    out_idx = read_odps_table(
+        test_table,
+        columns=[],
+        index_col=["col1", "col2"],
+        output_type=OutputType.index,
+    )
+    assert out_idx.names == ["col1", "col2"]
+    assert out_idx.shape == (np.nan,)
+
     test_table.drop()
     test_parted_table.drop()
 
@@ -319,6 +329,10 @@ def test_from_odps_query():
         read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
     assert "instant query" in err_info.value.args[0]
 
+    with pytest.raises(ValueError) as err_info:
+        read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
+    assert "names" in err_info.value.args[0]
+
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
     df = read_odps_query(query1)
     assert df.op.query == query1