maxframe 0.1.0b4__cp311-cp311-macosx_10_9_universal2.whl → 1.0.0__cp311-cp311-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe has been flagged as possibly problematic by the registry.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
maxframe/tensor/utils.py
CHANGED
@@ -19,18 +19,13 @@ import itertools
 import operator
 from collections import OrderedDict
 from collections.abc import Iterable
-from functools import
+from functools import wraps
 from math import ceil
 from numbers import Integral
 from typing import Dict, List, Union
 
 import numpy as np
 
-try:
-    import tiledb
-except (ImportError, OSError):  # pragma: no cover
-    tildb = None
-
 from ..core import ExecutableTuple
 from ..lib.mmh3 import hash_from_buffer
 from ..utils import lazy_import
@@ -508,7 +503,7 @@ def decide_unify_split(*splits):
 
 
 def check_out_param(out, t, casting):
-    from .
+    from .misc import broadcast_to
 
     if not hasattr(out, "shape"):
         raise TypeError("return arrays must be a tensor")
@@ -563,21 +558,6 @@ def filter_inputs(inputs):
     return [inp for inp in inputs if isinstance(inp, ENTITY_TYPE)]
 
 
-# As TileDB Ctx's creation is a bit time-consuming,
-# we just cache the Ctx
-# also remember the arguments should be hashable
-@lru_cache(10)
-def _create_tiledb_ctx(conf_tuple):
-    if conf_tuple is not None:
-        return tiledb.Ctx(dict(conf_tuple))
-    return tiledb.Ctx()
-
-
-def get_tiledb_ctx(conf):
-    key = tuple(conf.items()) if conf is not None else None
-    return _create_tiledb_ctx(key)
-
-
 # this function is only used for pandas' compatibility
 def to_numpy(pdf):
     try:
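Note: the `check_out_param` change above follows the relocation of the former `maxframe.tensor.base` helpers into `maxframe.tensor.misc` (see the renamed files in the listing). A minimal sketch of the corresponding import change for downstream code, assuming the public module paths mirror the file moves:

    # before 1.0.0 the helpers lived under maxframe.tensor.base
    # from maxframe.tensor.base import broadcast_to, transpose

    # from 1.0.0 on they are imported from maxframe.tensor.misc
    from maxframe.tensor.misc import broadcast_to, transpose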
maxframe/tests/test_protocol.py
CHANGED
@@ -85,6 +85,40 @@ def test_error_info_json_serialize():
     deserial_err_info.reraise()
 
 
+class CannotPickleException(Exception):
+    def __reduce__(self):
+        raise ValueError
+
+
+class CannotUnpickleException(Exception):
+    @classmethod
+    def load_from_pk(cls, _):
+        raise ValueError
+
+    def __reduce__(self):
+        return type(self).load_from_pk, (0,)
+
+
+def test_error_info_fallback_json_serialize():
+    try:
+        raise CannotPickleException
+    except CannotPickleException as ex:
+        err_info1 = ErrorInfo.from_exception(ex)
+
+    try:
+        raise CannotUnpickleException
+    except CannotUnpickleException as ex:
+        err_info2 = ErrorInfo.from_exception(ex)
+
+    for err_info in (err_info1, err_info2):
+        deserial_err_info = ErrorInfo.from_json(err_info.to_json())
+        assert deserial_err_info.raw_error_source is None
+        assert deserial_err_info.raw_error_data is None
+
+        with pytest.raises(RemoteException):
+            deserial_err_info.reraise()
+
+
 def test_dag_info_json_serialize():
     try:
         raise ValueError("ERR_DATA")
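The new test above exercises the fallback path of `ErrorInfo` serialization: when the original exception cannot be pickled or unpickled, the raw error payload is dropped and re-raising yields a `RemoteException` instead of the original error. For comparison, a minimal sketch of the normal round trip, assuming `ErrorInfo` is importable from `maxframe.protocol`:

    from maxframe.protocol import ErrorInfo

    try:
        raise ValueError("boom")  # an ordinary, picklable exception
    except ValueError as ex:
        info = ErrorInfo.from_exception(ex)

    # serialize to JSON and back, as the tests above do
    restored = ErrorInfo.from_json(info.to_json())
    # for a picklable error, the original exception should be re-raised here
    restored.reraise()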
maxframe/tests/test_utils.py
CHANGED
@@ -288,15 +288,6 @@ def test_estimate_pandas_size():
     df2 = pd.DataFrame(np.random.rand(1000, 10))
     assert utils.estimate_pandas_size(df2) == sys.getsizeof(df2)
 
-    df3 = pd.DataFrame(
-        {
-            "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)),
-            "B": np.random.rand(1000),
-            "C": np.random.rand(1000),
-        }
-    )
-    assert utils.estimate_pandas_size(df3) != sys.getsizeof(df3)
-
     s1 = pd.Series(np.random.rand(1000))
     assert utils.estimate_pandas_size(s1) == sys.getsizeof(s1)
 
@@ -307,7 +298,6 @@ def test_estimate_pandas_size():
     assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2)
 
     s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
-    assert utils.estimate_pandas_size(s3) != sys.getsizeof(s3)
     assert (
         pytest.approx(utils.estimate_pandas_size(s3) / sys.getsizeof(s3), abs=0.5) == 1
     )
@@ -318,7 +308,6 @@ def test_estimate_pandas_size():
     assert utils.estimate_pandas_size(idx1) == sys.getsizeof(idx1)
 
     string_idx = pd.Index(np.random.choice(["a", "bb", "cc"], size=(1000,)))
-    assert utils.estimate_pandas_size(string_idx) != sys.getsizeof(string_idx)
     assert (
         pytest.approx(
             utils.estimate_pandas_size(string_idx) / sys.getsizeof(string_idx), abs=0.5
@@ -338,7 +327,6 @@ def test_estimate_pandas_size():
         },
         index=idx2,
     )
-    assert utils.estimate_pandas_size(df4) != sys.getsizeof(df4)
    assert (
         pytest.approx(utils.estimate_pandas_size(df4) / sys.getsizeof(df4), abs=0.5)
         == 1
maxframe/tests/utils.py
CHANGED
@@ -14,6 +14,7 @@
 
 import asyncio
 import functools
+import hashlib
 import os
 import queue
 import socket
@@ -25,7 +26,7 @@ import pytest
 from tornado import netutil
 
 from ..core import Tileable, TileableGraph
-from ..utils import lazy_import
+from ..utils import create_sync_primitive, lazy_import, to_binary
 
 try:
     from flaky import flaky
@@ -102,7 +103,7 @@ def run_app_in_thread(app_func):
     def fixture_func(*args, **kwargs):
         app_loop = asyncio.new_event_loop()
         q = queue.Queue()
-        exit_event = asyncio.Event
+        exit_event = create_sync_primitive(asyncio.Event, app_loop)
         app_thread = Thread(
             name="TestAppThread",
             target=app_thread_func,
@@ -162,3 +163,17 @@ def require_hadoop(func):
         not os.environ.get("WITH_HADOOP"), reason="Only run when hadoop is installed"
     )(func)
     return func
+
+
+def get_test_unique_name(size=None):
+    test_name = os.getenv("PYTEST_CURRENT_TEST", "pyodps_test")
+    digest = hashlib.md5(to_binary(test_name)).hexdigest()
+    if size:
+        digest = digest[:size]
+    return digest + "_" + str(os.getpid())
+
+
+def assert_mf_index_dtype(idx_obj, dtype):
+    from ..dataframe.core import IndexValue
+
+    assert isinstance(idx_obj, IndexValue.IndexBase) and idx_obj.dtype == dtype
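For context, a short sketch of how the new test helper could be used from a test case; the test name and table prefix are illustrative only:

    from maxframe.tests.utils import get_test_unique_name

    def test_store_results():  # hypothetical test using the helper
        # get_test_unique_name hashes PYTEST_CURRENT_TEST and appends the pid,
        # giving a suffix that is unique per test and per worker process
        table_name = "tmp_mf_" + get_test_unique_name(size=8)
        # ... create and later drop a temporary table under this name ...

`assert_mf_index_dtype` complements this by asserting that an index value object is a MaxFrame `IndexValue.IndexBase` instance with the expected dtype.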
maxframe/typing_.py
CHANGED
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from
+from numbers import Integral
+from typing import List, TypeVar, Union
 
 import pandas as pd
 import pyarrow as pa
 
+SlicesType = List[Union[None, Integral, slice]]
+
 TimeoutType = Union[int, float, None]
 
 
maxframe/udf.py
CHANGED
@@ -12,21 +12,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import shlex
 from typing import Callable, List, Optional, Union
 
 from odps.models import Resource
 
 from .serialization.serializables import (
+    BoolField,
     FieldTypes,
     FunctionField,
     ListField,
     Serializable,
+    StringField,
 )
+from .utils import tokenize
+
+
+class PythonPackOptions(Serializable):
+    _key_args = ("force_rebuild", "prefer_binary", "pre_release", "no_audit_wheel")
+
+    key = StringField("key")
+    requirements = ListField("requirements", FieldTypes.string, default_factory=list)
+    force_rebuild = BoolField("force_rebuild", default=False)
+    prefer_binary = BoolField("prefer_binary", default=False)
+    pre_release = BoolField("pre_release", default=False)
+    pack_instance_id = StringField("pack_instance_id", default=None)
+    no_audit_wheel = BoolField("no_audit_wheel", default=False)
+
+    def __init__(self, key: str = None, **kw):
+        super().__init__(key=key, **kw)
+        if self.key is None:
+            args = {k: getattr(self, k) for k in self._key_args}
+            self.key = tokenize(set(self.requirements), args)
+
+    def __repr__(self):
+        args_str = " ".join(f"{k}={getattr(self, k)}" for k in self._key_args)
+        return f"<PythonPackOptions {self.requirements} {args_str}>"
 
 
 class MarkedFunction(Serializable):
     func = FunctionField("func")
     resources = ListField("resources", FieldTypes.string, default_factory=list)
+    pythonpacks = ListField("pythonpacks", FieldTypes.reference, default_factory=list)
 
     def __init__(self, func: Optional[Callable] = None, **kw):
         super().__init__(func=func, **kw)
@@ -54,13 +81,41 @@ def with_resources(*resources: Union[str, Resource], use_wrapper_class: bool = T
     def func_wrapper(func):
         str_resources = [res_to_str(r) for r in resources]
         if not use_wrapper_class:
-            func
+            existing = getattr(func, "resources") or []
+            func.resources = existing + str_resources
+            return func
+
+        if isinstance(func, MarkedFunction):
+            func.resources = func.resources + str_resources
             return func
+        return MarkedFunction(func, resources=str_resources)
+
+    return func_wrapper
+
 
+def with_python_requirements(
+    *requirements: str,
+    force_rebuild: bool = False,
+    prefer_binary: bool = False,
+    pre_release: bool = False,
+    no_audit_wheel: bool = False,
+):
+    result_req = []
+    for req in requirements:
+        result_req.extend(shlex.split(req))
+
+    def func_wrapper(func):
+        pack_item = PythonPackOptions(
+            requirements=requirements,
+            force_rebuild=force_rebuild,
+            prefer_binary=prefer_binary,
+            pre_release=pre_release,
+            no_audit_wheel=no_audit_wheel,
+        )
         if isinstance(func, MarkedFunction):
-            func.
+            func.pythonpacks.append(pack_item)
             return func
-        return MarkedFunction(func,
+        return MarkedFunction(func, pythonpacks=[pack_item])
 
     return func_wrapper
 
@@ -72,3 +127,7 @@ def get_udf_resources(
     func: Callable,
 ) -> List[Union[Resource, str]]:
     return getattr(func, "resources", None) or []
+
+
+def get_udf_pythonpacks(func: Callable) -> List[PythonPackOptions]:
+    return getattr(func, "pythonpacks", None) or []
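The decorators above can be stacked on a user-defined function: `with_python_requirements` records a `PythonPackOptions` item that can later drive package building, while `with_resources` keeps attaching ODPS resources. A minimal usage sketch, assuming the helpers are imported from `maxframe.udf` and that the default wrapper behavior applies; the requirement string, resource name, and function body are illustrative:

    from maxframe import udf

    @udf.with_python_requirements("scikit-learn>=1.0", prefer_binary=True)
    @udf.with_resources("my_project_resource.zip")  # hypothetical ODPS resource name
    def score(value):
        # illustrative body; the real UDF logic would go here
        return value

    udf.get_udf_resources(score)    # -> ["my_project_resource.zip"]
    udf.get_udf_pythonpacks(score)  # -> [<PythonPackOptions ['scikit-learn>=1.0'] ...>]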
maxframe/utils.py
CHANGED
@@ -19,7 +19,6 @@ import dataclasses
 import datetime
 import enum
 import functools
-import hashlib
 import importlib
 import inspect
 import io
@@ -33,7 +32,6 @@ import sys
 import threading
 import time
 import tokenize as pytokenize
-import traceback
 import types
 import weakref
 import zlib
@@ -76,7 +74,7 @@ from ._utils import (  # noqa: F401  # pylint: disable=unused-import
     tokenize_int,
 )
 from .lib.version import parse as parse_version
-from .typing_ import
+from .typing_ import TileableType, TimeoutType
 
 # make flake8 happy by referencing these imports
 NamedType = NamedType
@@ -246,58 +244,6 @@ def copy_tileables(tileables: List[TileableType], **kwargs):
     return op.new_tileables(inputs, kws=kws, output_limit=len(kws))
 
 
-def build_fetch_chunk(chunk: ChunkType, **kwargs) -> ChunkType:
-    from .core.operator import ShuffleProxy
-
-    chunk_op = chunk.op
-    params = chunk.params.copy()
-    assert not isinstance(chunk_op, ShuffleProxy)
-    # for non-shuffle nodes, we build Fetch chunks
-    # to replace original chunk
-    op = chunk_op.get_fetch_op_cls(chunk)(sparse=chunk.op.sparse, gpu=chunk.op.gpu)
-    return op.new_chunk(
-        None,
-        is_broadcaster=chunk.is_broadcaster,
-        kws=[params],
-        _key=chunk.key,
-        **kwargs,
-    )
-
-
-def build_fetch_tileable(tileable: TileableType) -> TileableType:
-    if tileable.is_coarse():
-        chunks = None
-    else:
-        chunks = []
-        for c in tileable.chunks:
-            fetch_chunk = build_fetch_chunk(c, index=c.index)
-            chunks.append(fetch_chunk)
-
-    tileable_op = tileable.op
-    params = tileable.params.copy()
-
-    new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id)
-    return new_op.new_tileables(
-        None,
-        chunks=chunks,
-        nsplits=tileable.nsplits,
-        _key=tileable.key,
-        _id=tileable.id,
-        **params,
-    )[0]
-
-
-def build_fetch(entity: EntityType) -> EntityType:
-    from .core import CHUNK_TYPE, ENTITY_TYPE
-
-    if isinstance(entity, CHUNK_TYPE):
-        return build_fetch_chunk(entity)
-    elif isinstance(entity, ENTITY_TYPE):
-        return build_fetch_tileable(entity)
-    else:
-        raise TypeError(f"Type {type(entity)} not supported")
-
-
 def get_dtype(dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]):
     if pd.api.types.is_extension_array_dtype(dtype):
         return dtype
@@ -387,25 +333,7 @@ def build_temp_intermediate_table_name(session_id: str, tileable_key: str) -> st
 
 
 def build_session_volume_name(session_id: str) -> str:
-    return f"mf_vol_{session_id}"
-
-
-def build_tileable_dir_name(tileable_key: str) -> str:
-    m = hashlib.md5()
-    m.update(f"mf_dir_{tileable_key}".encode())
-    return m.hexdigest()
-
-
-def extract_messages_and_stacks(exc: Exception) -> Tuple[List[str], List[str]]:
-    cur_exc = exc
-    messages, stacks = [], []
-    while True:
-        messages.append(str(cur_exc))
-        stacks.append("".join(traceback.format_tb(cur_exc.__traceback__)))
-        if exc.__cause__ is None:
-            break
-        cur_exc = exc.__cause__
-    return messages, stacks
+    return f"mf_vol_{session_id.replace('-', '_')}"
 
 
 async def wait_http_response(
@@ -442,11 +370,27 @@ def format_timeout_params(timeout: TimeoutType) -> str:
     return f"?wait=1&timeout={timeout}"
 
 
-
-
-
-
-
+_PrimitiveType = TypeVar("_PrimitiveType")
+
+
+def create_sync_primitive(
+    cls: Type[_PrimitiveType], loop: asyncio.AbstractEventLoop
+) -> _PrimitiveType:
+    """
+    Create an asyncio sync primitive (locks, events, etc.)
+    in a certain event loop.
+    """
+    if sys.version_info[1] < 10:
+        return cls(loop=loop)
+
+    # From Python3.10 the loop parameter has been removed. We should work around here.
+    old_loop = asyncio.get_event_loop()
+    try:
+        asyncio.set_event_loop(loop)
+        primitive = cls()
+    finally:
+        asyncio.set_event_loop(old_loop)
+    return primitive
 
 
 class ToThreadCancelledError(asyncio.CancelledError):
@@ -487,15 +431,22 @@ class ToThreadMixin:
             thread_name_prefix=f"{type(self).__name__}Pool-{self._counter()}",
         )
 
-
-
-        )
+        loop = asyncio.events.get_running_loop()
+        ctx = contextvars.copy_context()
+        func_call = functools.partial(ctx.run, func, *args, **kwargs)
+        fut = loop.run_in_executor(self._pool, func_call)
+
         try:
-
+            coro = fut
+            if wait_on_cancel:
+                coro = asyncio.shield(coro)
+            if timeout is not None:
+                coro = asyncio.wait_for(coro, timeout)
+            return await coro
         except (asyncio.CancelledError, asyncio.TimeoutError) as ex:
             if not wait_on_cancel:
                 raise
-            result = await
+            result = await fut
             raise ToThreadCancelledError(*ex.args, result=result)
 
     def ensure_async_call(
@@ -519,6 +470,7 @@ def config_odps_default_options():
         "metaservice.client.cache.enable": "false",
         "odps.sql.session.result.cache.enable": "false",
         "odps.sql.submit.mode": "script",
+        "odps.sql.job.max.time.hours": 72,
     }
 
 
@@ -883,8 +835,41 @@ def parse_readable_size(value: Union[str, int, float]) -> Tuple[float, bool]:
     raise ValueError(f"Unknown limitation value: {value}")
 
 
-def remove_suffix(value: str, suffix: str) -> str:
-
+def remove_suffix(value: str, suffix: str) -> Tuple[str, bool]:
+    """
+    Remove a suffix from a given string if it exists.
+
+    Parameters
+    ----------
+    value : str
+        The original string.
+    suffix : str
+        The suffix to be removed.
+
+    Returns
+    -------
+    Tuple[str, bool]
+        A tuple containing the modified string and a boolean indicating whether the suffix was found.
+    """
+
+    # Check if the suffix is an empty string
+    if len(suffix) == 0:
+        # If the suffix is empty, return the original string with True
+        return value, True
+
+    # Check if the length of the value is less than the length of the suffix
+    if len(value) < len(suffix):
+        # If the value is shorter than the suffix, it cannot have the suffix
+        return value, False
+
+    # Check if the suffix matches the end of the value
+    match = value.endswith(suffix)
+
+    # If the suffix is found, remove it; otherwise, return the original string
+    if match:
+        return value[: -len(suffix)], match
+    else:
+        return value, match
 
 
 def find_objects(nested: Union[List, Dict], types: Union[Type, Tuple[Type]]) -> List:
@@ -1106,3 +1091,44 @@ def get_python_tag():
     # todo add implementation suffix for non-GIL tags when PEP703 is ready
     version_info = sys.version_info
     return f"cp{version_info[0]}{version_info[1]}"
+
+
+def get_item_if_scalar(val: Any) -> Any:
+    if isinstance(val, np.ndarray) and val.shape == ():
+        return val.item()
+    return val
+
+
+def collect_leaf_operators(root) -> List[Type]:
+    result = []
+
+    def _collect(op_type):
+        if len(op_type.__subclasses__()) == 0:
+            result.append(op_type)
+        for subclass in op_type.__subclasses__():
+            _collect(subclass)
+
+    _collect(root)
+    return result
+
+
+@contextmanager
+def sync_pyodps_options():
+    from odps.config import OptionError
+    from odps.config import option_context as pyodps_option_context
+
+    from .config import options
+
+    with pyodps_option_context() as cfg:
+        cfg.local_timezone = options.local_timezone
+        if options.session.enable_schema:
+            try:
+                cfg.enable_schema = options.session.enable_schema
+            except (AttributeError, OptionError):
+                # fixme enable_schema only supported in PyODPS 0.12.0 or later
+                cfg.always_enable_schema = options.session.enable_schema
+        yield
+
+
+def str_to_bool(s: Optional[str]) -> Optional[bool]:
+    return s.lower().strip() in ("true", "1") if s is not None else None
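Two of the reworked helpers above are easiest to see in isolation. A small sketch, assuming both are importable from `maxframe.utils`:

    import asyncio
    from maxframe.utils import create_sync_primitive, remove_suffix

    # remove_suffix now returns the trimmed string together with a found flag
    assert remove_suffix("result_tmp", "_tmp") == ("result", True)
    assert remove_suffix("result", "_tmp") == ("result", False)

    # create_sync_primitive binds an asyncio primitive to a given loop,
    # papering over the removal of the loop= argument since Python 3.10
    loop = asyncio.new_event_loop()
    event = create_sync_primitive(asyncio.Event, loop)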
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA
CHANGED
@@ -1,33 +1,33 @@
 Metadata-Version: 2.1
 Name: maxframe
-Version: 0.1.0b4
+Version: 1.0.0
 Summary: MaxFrame operator-based data analyze framework
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: pyodps
-Requires-Dist: scipy
-Requires-Dist: pyarrow
-Requires-Dist: msgpack
-Requires-Dist: traitlets
-Requires-Dist: cloudpickle
-Requires-Dist: pyyaml
-Requires-Dist: tornado
-Requires-Dist: defusedxml
-Requires-Dist: tqdm
-Requires-Dist: importlib-metadata
-Requires-Dist: pickle5
+Requires-Dist: numpy<2.0.0,>=1.19.0
+Requires-Dist: pandas>=1.0.0
+Requires-Dist: pyodps>=0.11.6.1
+Requires-Dist: scipy>=1.0
+Requires-Dist: pyarrow>=1.0.0
+Requires-Dist: msgpack>=1.0.0
+Requires-Dist: traitlets>=5.0
+Requires-Dist: cloudpickle<3.0.0,>=1.5.0
+Requires-Dist: pyyaml>=5.1
+Requires-Dist: tornado>=6.0
+Requires-Dist: defusedxml>=0.5.0
+Requires-Dist: tqdm>=4.1.0
+Requires-Dist: importlib-metadata>=1.4
+Requires-Dist: pickle5; python_version < "3.8"
 Provides-Extra: dev
-Requires-Dist: black
-Requires-Dist: flake8
-Requires-Dist: pre-commit
-Requires-Dist: graphviz
+Requires-Dist: black>=22.3.0; extra == "dev"
+Requires-Dist: flake8>=5.0.4; extra == "dev"
+Requires-Dist: pre-commit>=2.15.0; extra == "dev"
+Requires-Dist: graphviz>=0.20.1; extra == "dev"
 Provides-Extra: test
-Requires-Dist: mock
-Requires-Dist: pytest
-Requires-Dist: pytest-cov
-Requires-Dist: pytest-asyncio
-Requires-Dist: pytest-timeout
-Requires-Dist: matplotlib
+Requires-Dist: mock; extra == "test"
+Requires-Dist: pytest>=7.3.1; extra == "test"
+Requires-Dist: pytest-cov>=4.1.0; extra == "test"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
+Requires-Dist: pytest-timeout>=2.1.0; extra == "test"
+Requires-Dist: matplotlib>=2.0.0; extra == "test"
 
 MaxCompute MaxFrame Client
 ==========================