maxframe-1.0.0rc4-cp38-cp38-macosx_10_9_universal2.whl → maxframe-1.1.1-cp38-cp38-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (88)
  1. maxframe/_utils.cpython-38-darwin.so +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cpython-38-darwin.so +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cpython-38-darwin.so +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +2 -2
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/io/odpsio/tableio.py

@@ -15,20 +15,20 @@
 import os
 import time
 from abc import ABC, abstractmethod
+from collections import OrderedDict
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Union
 
-import numpy as np
 import pyarrow as pa
 from odps import ODPS
-from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.tunnel import TableTunnel
+from odps.tunnel import TableDownloadSession, TableDownloadStatus, TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
+from odps.utils import call_with_retry
 
 try:
     import pyarrow.compute as pac
@@ -37,20 +37,19 @@ except ImportError:
 
 from ...config import options
 from ...env import ODPS_STORAGE_API_ENDPOINT
-from ...lib.version import Version
-from ...utils import sync_pyodps_options
+from ...utils import is_empty, sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
-_need_patch_batch = Version(pyodps_version) < Version("0.12.0")
+_DOWNLOAD_ID_CACHE_SIZE = 100
 
 
 class ODPSTableIO(ABC):
     def __new__(cls, odps: ODPS):
         if cls is ODPSTableIO:
-            if options.use_common_table:
+            if options.use_common_table or ODPS_STORAGE_API_ENDPOINT in os.environ:
                 return HaloTableIO(odps)
             else:
                 return TunnelTableIO(odps)
@@ -68,7 +67,11 @@ class ODPSTableIO(ABC):
     ) -> OdpsSchema:
         final_cols = []
 
-        columns = columns or [col.name for col in table_schema.simple_columns]
+        columns = (
+            columns
+            if not is_empty(columns)
+            else [col.name for col in table_schema.simple_columns]
+        )
         if partition_columns is True:
             partition_columns = [c.name for c in table_schema.partitions]
         else:
@@ -132,7 +135,12 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id = -1
         self._reader_start_pos = 0
 
-        if partitions is None or isinstance(partitions, str):
+        if partitions is None:
+            if not self._table.table_schema.partitions:
+                self._partitions = [None]
+            else:
+                self._partitions = [str(pt) for pt in self._table.partitions]
+        elif isinstance(partitions, str):
             self._partitions = [partitions]
         else:
             self._partitions = partitions
@@ -160,17 +168,14 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id += 1
 
         part_str = self._partitions[self._cur_partition_id]
-
-        # todo make this more formal when PyODPS 0.12.0 is released
-        req_columns = self._columns
-        if not _need_patch_batch:
-            req_columns = self._schema.names
+        req_columns = self._schema.names
         with sync_pyodps_options():
             self._cur_reader = self._table.open_reader(
                 part_str,
                 columns=req_columns,
                 arrow=True,
                 download_id=self._partition_to_download_ids.get(part_str),
+                append_partitions=True,
             )
             if self._cur_reader.count + self._reader_start_pos > self._start:
                 start = self._start - self._reader_start_pos
@@ -186,35 +191,6 @@ class TunnelMultiPartitionReader:
             else:
                 self._cur_reader = None
 
-    def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
-        pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
-
-        names = list(batch.schema.names)
-        arrays = []
-        for idx in range(batch.num_columns):
-            col = batch.column(idx)
-            if isinstance(col.type, pa.TimestampType):
-                if col.type.tz is not None:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, col.type.tz
-                    )
-                    arrays.append(col.cast(target_type))
-                else:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, options.local_timezone
-                    )
-                    pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
-                    arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
-            else:
-                arrays.append(batch.column(idx))
-
-        for part_col in self._partition_cols or []:
-            names.append(part_col)
-            col_type = self._schema.field_by_name(part_col).type
-            pt_col = np.repeat([pt_spec[part_col]], batch.num_rows)
-            arrays.append(pa.array(pt_col).cast(col_type))
-        return pa.RecordBatch.from_arrays(arrays, names)
-
     def read(self):
         with sync_pyodps_options():
             if self._cur_reader is None:
@@ -227,10 +203,7 @@ class TunnelMultiPartitionReader:
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-                    if _need_patch_batch:
-                        return self._fill_batch_partition(batch)
-                    else:
-                        return batch
+                    return batch
             except StopIteration:
                 self._open_next_reader()
                 return None
@@ -247,35 +220,47 @@ class TunnelMultiPartitionReader:
         return pa.Table.from_batches(batches)
 
 
-class TunnelWrappedWriter:
-    def __init__(self, nested_writer):
-        self._writer = nested_writer
+class TunnelTableIO(ODPSTableIO):
+    _down_session_ids = OrderedDict()
 
-    def write(self, data: Union[pa.RecordBatch, pa.Table]):
-        if not any(isinstance(tp, pa.TimestampType) for tp in data.schema.types):
-            self._writer.write(data)
-            return
-        pa_type = type(data)
-        arrays = []
-        for idx in range(data.num_columns):
-            name = data.schema.names[idx]
-            col = data.column(idx)
-            if not isinstance(col.type, pa.TimestampType):
-                arrays.append(col)
-                continue
-            if self._writer.schema[name].type == timestamp_ntz:
-                col = HaloTableArrowWriter._localize_timezone(col, "UTC")
-            else:
-                col = HaloTableArrowWriter._localize_timezone(col)
-            arrays.append(col)
-        data = pa_type.from_arrays(arrays, names=data.schema.names)
-        self._writer.write(data)
+    @classmethod
+    def create_download_sessions(
+        cls,
+        odps_entry: ODPS,
+        full_table_name: str,
+        partitions: List[Optional[str]] = None,
+    ) -> Dict[Optional[str], TableDownloadSession]:
+        table = odps_entry.get_table(full_table_name)
+        tunnel = TableTunnel(odps_entry)
+        parts = (
+            [partitions]
+            if partitions is None or isinstance(partitions, str)
+            else partitions
+        )
+        part_to_session = dict()
+        for part in parts:
+            part_key = (full_table_name, part)
+            down_session = None
+
+            if part_key in cls._down_session_ids:
+                down_id = cls._down_session_ids[part_key]
+                down_session = tunnel.create_download_session(
+                    table, async_mode=True, partition_spec=part, download_id=down_id
+                )
+                if down_session.status != TableDownloadStatus.Normal:
+                    down_session = None
 
-    def __getattr__(self, item):
-        return getattr(self._writer, item)
+            if down_session is None:
+                down_session = tunnel.create_download_session(
+                    table, async_mode=True, partition_spec=part
+                )
 
+            while len(cls._down_session_ids) >= _DOWNLOAD_ID_CACHE_SIZE:
+                cls._down_session_ids.popitem(False)
+            cls._down_session_ids[part_key] = down_session.id
+            part_to_session[part] = down_session
+        return part_to_session
 
-class TunnelTableIO(ODPSTableIO):
     @contextmanager
     def open_reader(
         self,
@@ -302,21 +287,15 @@ class TunnelTableIO(ODPSTableIO):
             or (reverse_range and start is None)
         ):
            with sync_pyodps_options():
-                table = self._odps.get_table(full_table_name)
-                tunnel = TableTunnel(self._odps)
-                parts = (
-                    [partitions]
-                    if partitions is None or isinstance(partitions, str)
-                    else partitions
+                tunnel_sessions = self.create_download_sessions(
+                    self._odps, full_table_name, partitions
+                )
+                part_to_down_id = {
+                    pt: session.id for (pt, session) in tunnel_sessions.items()
+                }
+                total_records = sum(
+                    session.count for session in tunnel_sessions.values()
                 )
-                part_to_down_id = dict()
-                total_records = 0
-                for part in parts:
-                    down_session = tunnel.create_download_session(
-                        table, async_mode=True, partition_spec=part
-                    )
-                    part_to_down_id[part] = down_session.id
-                    total_records += down_session.count
 
         count = None
         if start is not None or stop is not None:
@@ -360,13 +339,7 @@ class TunnelTableIO(ODPSTableIO):
            create_partition=partition is not None,
            overwrite=overwrite,
        ) as writer:
-            # fixme should yield writer directly once pyodps fixes
-            # related arrow timestamp bug when provided schema and
-            # table schema is identical.
-            if _need_patch_batch:
-                yield TunnelWrappedWriter(writer)
-            else:
-                yield writer
+            yield writer
 
 
 class HaloTableArrowReader:
  class HaloTableArrowReader:
@@ -422,7 +395,7 @@ class HaloTableArrowReader:
422
395
  split_index=self._cur_split_id + 1,
423
396
  **read_rows_kw,
424
397
  )
425
- self._cur_reader = self._client.read_rows_arrow(req)
398
+ self._cur_reader = call_with_retry(self._client.read_rows_arrow, req)
426
399
  self._cur_split_id += 1
427
400
 
428
401
  def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
@@ -494,8 +467,9 @@ class HaloTableArrowWriter:
494
467
  def open(self):
495
468
  from odps.apis.storage_api import WriteRowsRequest
496
469
 
497
- self._writer = self._client.write_rows_arrow(
498
- WriteRowsRequest(self._write_info.session_id)
470
+ self._writer = call_with_retry(
471
+ self._client.write_rows_arrow,
472
+ WriteRowsRequest(self._write_info.session_id),
499
473
  )
500
474
 
501
475
  @classmethod
@@ -566,28 +540,6 @@ class HaloTableIO(ODPSTableIO):
566
540
  for pt in partitions
567
541
  ]
568
542
 
569
- def get_table_record_count(
570
- self, full_table_name: str, partitions: PartitionsType = None
571
- ):
572
- from odps.apis.storage_api import SplitOptions, TableBatchScanRequest
573
-
574
- table = self._odps.get_table(full_table_name)
575
- client = StorageApiArrowClient(
576
- self._odps, table, rest_endpoint=self._storage_api_endpoint
577
- )
578
-
579
- split_option = SplitOptions.SplitMode.SIZE
580
-
581
- scan_kw = {
582
- "required_partitions": self._convert_partitions(partitions),
583
- "split_options": SplitOptions.get_default_options(split_option),
584
- }
585
-
586
- # todo add more options for partition column handling
587
- req = TableBatchScanRequest(**scan_kw)
588
- resp = client.create_read_session(req)
589
- return resp.record_count
590
-
591
543
  @contextmanager
592
544
  def open_reader(
593
545
  self,
@@ -631,12 +583,12 @@ class HaloTableIO(ODPSTableIO):
631
583
 
632
584
  # todo add more options for partition column handling
633
585
  req = TableBatchScanRequest(**scan_kw)
634
- resp = client.create_read_session(req)
586
+ resp = call_with_retry(client.create_read_session, req)
635
587
 
636
588
  session_id = resp.session_id
637
589
  status = resp.session_status
638
590
  while status == SessionStatus.INIT:
639
- resp = client.get_read_session(SessionRequest(session_id))
591
+ resp = call_with_retry(client.get_read_session, SessionRequest(session_id))
640
592
  status = resp.session_status
641
593
  time.sleep(1.0)
642
594
 
@@ -691,7 +643,7 @@ class HaloTableIO(ODPSTableIO):
691
643
  part_strs = self._convert_partitions(partition)
692
644
  part_str = part_strs[0] if part_strs else None
693
645
  req = TableBatchWriteRequest(partition_spec=part_str, overwrite=overwrite)
694
- resp = client.create_write_session(req)
646
+ resp = call_with_retry(client.create_write_session, req)
695
647
 
696
648
  session_id = resp.session_id
697
649
  writer = HaloTableArrowWriter(client, resp, table.table_schema)
@@ -700,9 +652,13 @@ class HaloTableIO(ODPSTableIO):
700
652
  yield writer
701
653
 
702
654
  commit_msg = writer.close()
703
- resp = client.commit_write_session(
704
- SessionRequest(session_id=session_id), [commit_msg]
655
+ resp = call_with_retry(
656
+ client.commit_write_session,
657
+ SessionRequest(session_id=session_id),
658
+ [commit_msg],
705
659
  )
706
660
  while resp.session_status == SessionStatus.COMMITTING:
707
- resp = client.get_write_session(SessionRequest(session_id=session_id))
661
+ resp = call_with_retry(
662
+ client.get_write_session, SessionRequest(session_id=session_id)
663
+ )
708
664
  assert resp.session_status == SessionStatus.COMMITTED
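
Note on the tableio.py changes above: TunnelTableIO now keeps tunnel download-session IDs in a class-level OrderedDict capped at _DOWNLOAD_ID_CACHE_SIZE entries, so repeated reads of the same table/partition can reuse an existing session instead of opening a new one each time. Below is a minimal, self-contained sketch of that bounded-cache pattern only; the helper names are illustrative and are not part of maxframe.

from collections import OrderedDict

_CACHE_SIZE = 100  # mirrors _DOWNLOAD_ID_CACHE_SIZE in the hunk above
_session_ids = OrderedDict()  # maps (table, partition) -> download session id


def remember_session_id(key, session_id):
    # Evict the oldest cached entries once the cap is reached (FIFO order),
    # then record the id for the given table/partition key.
    while len(_session_ids) >= _CACHE_SIZE:
        _session_ids.popitem(last=False)
    _session_ids[key] = session_id


def cached_session_id(key):
    # Return a previously cached id, or None if it was never stored or evicted.
    return _session_ids.get(key)

In the real code above, a cached id whose session is no longer in Normal status is discarded and a fresh download session is created for that partition.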
maxframe/io/odpsio/tests/test_schema.py

@@ -21,6 +21,7 @@ from odps import types as odps_types
 from .... import dataframe as md
 from .... import tensor as mt
 from ....core import OutputType
+from ....utils import pd_release_version
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -292,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
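
The new test above is gated to pandas 2.0 or later because that is where pandas added first-class support for non-nanosecond datetime64 resolutions such as datetime64[ms], which the test relies on. A tiny standalone illustration, assuming pandas >= 2.0 is installed:

import pandas as pd

# Under pandas 2.x a millisecond-resolution datetime series keeps its unit.
s = pd.Series(["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]")
print(s.dtype)  # datetime64[ms]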
maxframe/io/odpsio/tests/test_tableio.py

@@ -31,7 +31,7 @@ def switch_table_io(request):
     old_use_common_table = options.use_common_table
     try:
         options.use_common_table = request.param
-        yield
+        yield request.param
     finally:
         options.use_common_table = old_use_common_table
 
@@ -45,7 +45,7 @@ def test_empty_table_io(switch_table_io):
     table_io = ODPSTableIO(o)
 
     # test read from empty table
-    empty_table_name = tn("test_empty_table_halo_read")
+    empty_table_name = tn("test_empty_table_halo_read_" + str(switch_table_io).lower())
     o.delete_table(empty_table_name, if_exists=True)
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
 
@@ -65,7 +65,7 @@ def test_table_io_without_parts(switch_table_io):
     table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
-    no_part_table_name = tn("test_no_part_halo_write")
+    no_part_table_name = tn("test_no_part_halo_write_" + str(switch_table_io).lower())
     o.delete_table(no_part_table_name, if_exists=True)
     col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
     tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
@@ -99,7 +99,7 @@ def test_table_io_with_range_reader(switch_table_io):
     table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
-    no_part_table_name = tn("test_no_part_halo_write")
+    no_part_table_name = tn("test_halo_write_range_" + str(switch_table_io).lower())
     o.delete_table(no_part_table_name, if_exists=True)
     tb = o.create_table(
         no_part_table_name, ",".join(f"{c} double" for c in "abcde"), lifecycle=1
@@ -139,7 +139,7 @@ def test_table_io_with_parts(switch_table_io):
     table_io = ODPSTableIO(o)
 
     # test read and write tables with partition
-    parted_table_name = tn("test_parted_halo_write")
+    parted_table_name = tn("test_parted_halo_write_" + str(switch_table_io).lower())
     o.delete_table(parted_table_name, if_exists=True)
     tb = o.create_table(
         parted_table_name,
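
The fixture change above (yield request.param instead of a bare yield) lets each test see which backend it is running against and fold that value into its table names, so the tunnel and common-table parametrizations stop reusing the same tables. A generic pytest sketch of the pattern, with illustrative names rather than the actual fixtures:

import pytest


@pytest.fixture(params=[False, True])
def backend_flag(request):
    # Yield the parameter so tests can embed it in resource names.
    yield request.param


def test_uses_distinct_names(backend_flag):
    table_name = "test_table_" + str(backend_flag).lower()
    assert table_name in ("test_table_false", "test_table_true")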
maxframe/io/odpsio/tests/test_volumeio.py

@@ -42,15 +42,33 @@ def create_volume(request, oss_config):
        oss_bucket_name,
        oss_endpoint,
    ) = oss_config.oss_config
-    test_location = "oss://%s:%s@%s/%s/%s" % (
-        oss_access_id,
-        oss_secret_access_key,
-        oss_endpoint,
-        oss_bucket_name,
-        oss_test_dir_name,
-    )
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn
+
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(test_vol_name, location=test_location)
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )
     try:
         yield test_vol_name
     finally:
@@ -75,13 +93,19 @@ def test_read_write_volume(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, test_vol_dir, replace_internal_host=True
+    )
 
-    writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, test_vol_dir, replace_internal_host=True
+    )
     writer.write_file("file1", b"content1")
     writer.write_file("file2", b"content2")
 
-    reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, test_vol_dir, replace_internal_host=True
+    )
     assert reader.read_file("file1") == b"content1"
     assert reader.read_file("file2") == b"content2"
 
maxframe/io/odpsio/volumeio.py

@@ -16,13 +16,25 @@ import inspect
 from typing import Iterator, List, Optional, Union
 
 from odps import ODPS
+from odps import __version__ as pyodps_version
+
+from ...lib.version import Version
+
+_has_replace_internal_host = Version(pyodps_version) >= Version("0.12.0")
 
 
 class ODPSVolumeReader:
-    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+    def __init__(
+        self,
+        odps_entry: ODPS,
+        volume_name: str,
+        volume_dir: str,
+        replace_internal_host: bool = False,
+    ):
         self._odps_entry = odps_entry
         self._volume = odps_entry.get_volume(volume_name)
         self._volume_dir = volume_dir
+        self._replace_internal_host = replace_internal_host
 
     def list_files(self) -> List[str]:
         def _get_file_name(vol_file):
@@ -38,7 +50,12 @@ class ODPSVolumeReader:
        ]
 
     def read_file(self, file_name: str) -> bytes:
-        with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
+        kw = {}
+        if _has_replace_internal_host and self._replace_internal_host:
+            kw = {"replace_internal_host": self._replace_internal_host}
+        with self._volume.open_reader(
+            self._volume_dir + "/" + file_name, **kw
+        ) as reader:
             return reader.read()
 
 
@@ -49,13 +66,20 @@ class ODPSVolumeWriter:
         volume_name: str,
         volume_dir: str,
         schema_name: Optional[str] = None,
+        replace_internal_host: bool = False,
     ):
         self._odps_entry = odps_entry
         self._volume = odps_entry.get_volume(volume_name, schema=schema_name)
         self._volume_dir = volume_dir
+        self._replace_internal_host = replace_internal_host
 
     def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
-        with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
+        kw = {}
+        if _has_replace_internal_host and self._replace_internal_host:
+            kw = {"replace_internal_host": self._replace_internal_host}
+        with self._volume.open_writer(
+            self._volume_dir + "/" + file_name, **kw
+        ) as writer:
             if not inspect.isgenerator(data):
                 writer.write(data)
             else:
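
The reader and writer above only forward replace_internal_host when the installed PyODPS is at least 0.12.0, so older clients keep the previous open_reader/open_writer call signature. A minimal sketch of that version-gating pattern under the same assumption; the helper function below is illustrative and not maxframe code:

from odps import __version__ as pyodps_version

from maxframe.lib.version import Version

_HAS_REPLACE_INTERNAL_HOST = Version(pyodps_version) >= Version("0.12.0")


def open_volume_file(volume, path, replace_internal_host=False):
    # Pass the keyword only when the caller asked for it and the installed
    # PyODPS release is known to accept it; otherwise keep the old call shape.
    kw = {}
    if _HAS_REPLACE_INTERNAL_HOST and replace_internal_host:
        kw["replace_internal_host"] = True
    return volume.open_reader(path, **kw)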
maxframe/learn/contrib/__init__.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import graph, pytorch
+from . import graph, llm, pytorch
 
-del pytorch
 del graph
+del llm
+del pytorch
maxframe/learn/contrib/llm/__init__.py (new file)

@@ -0,0 +1,16 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import models, multi_modal, text
+
+del models
maxframe/learn/contrib/llm/core.py (new file)

@@ -0,0 +1,54 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict
+
+import numpy as np
+import pandas as pd
+
+from ....core.entity.output_types import OutputType
+from ....core.operator.base import Operator
+from ....core.operator.core import TileableOperatorMixin
+from ....dataframe.utils import parse_index
+from ....serialization.serializables.core import Serializable
+from ....serialization.serializables.field import AnyField, DictField, StringField
+
+
+class LLM(Serializable):
+    name = StringField("name", default=None)
+
+    def validate_params(self, params: Dict[str, Any]):
+        pass
+
+
+class LLMOperator(Operator, TileableOperatorMixin):
+    model = AnyField("model", default=None)
+    prompt_template = AnyField("prompt_template", default=None)
+    params = DictField("params", default=None)
+
+    def __init__(self, output_types=None, **kw):
+        if output_types is None:
+            output_types = [OutputType.dataframe]
+        super().__init__(_output_types=output_types, **kw)
+
+    def __call__(self, data):
+        col_names = ["response", "success"]
+        columns = parse_index(pd.Index(col_names), store_data=True)
+        out_dtypes = pd.Series([np.dtype("O"), np.dtype("bool")], index=col_names)
+        return self.new_tileable(
+            inputs=[data],
+            dtypes=out_dtypes,
+            shape=(data.shape[0], len(col_names)),
+            index_value=data.index_value,
+            columns_value=columns,
+        )
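
For orientation, LLMOperator.__call__ above declares a DataFrame result with two columns, "response" (object dtype) and "success" (bool dtype), indexed like the input data. A plain-pandas sketch of the frame shape such an operator advertises, built here with dummy values purely for illustration:

import numpy as np
import pandas as pd

expected = pd.DataFrame(
    {
        "response": pd.Series(["some model output"], dtype=np.dtype("O")),
        "success": pd.Series([True], dtype=np.dtype("bool")),
    }
)
# Column order and dtypes match what the operator declares via new_tileable().
assert list(expected.columns) == ["response", "success"]
assert expected.dtypes["success"] == np.dtype("bool")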
maxframe/learn/contrib/llm/models/__init__.py (new file)

@@ -0,0 +1,14 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .dashscope import DashScopeMultiModalLLM, DashScopeTextLLM