maxframe 1.0.0rc4__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 1.1.1__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/config/__init__.py +1 -1
- maxframe/config/config.py +26 -0
- maxframe/config/tests/test_config.py +20 -1
- maxframe/conftest.py +17 -4
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +65 -35
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/__init__.py +1 -1
- maxframe/io/odpsio/arrow.py +51 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +80 -124
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cpython-311-aarch64-linux-gnu.so +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +70 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +12 -2
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/merge/vstack.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +42 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +573 -562
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +23 -8
- maxframe_client/session/odps.py +40 -11
- maxframe_client/session/task.py +6 -25
- maxframe_client/session/tests/test_task.py +35 -6
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/config/__init__.py
CHANGED
maxframe/config/config.py
CHANGED
@@ -28,6 +28,8 @@ except ImportError:
 
     available_timezones = lambda: all_timezones
 
+import logging
+
 from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
@@ -43,6 +45,8 @@ from .validators import (
     is_valid_cache_path,
 )
 
+logger = logging.getLogger(__name__)
+
 _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and might be removed in a future release."
 _DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
 _DEFAULT_MAX_IDLE_SECONDS = 3600
@@ -380,6 +384,9 @@ default_options.register_option(
 default_options.register_option(
     "session.enable_schema", None, validator=is_null | is_bool, remote=True
 )
+default_options.register_option(
+    "session.enable_high_availability", None, validator=is_null | is_bool, remote=True
+)
 default_options.register_option(
     "session.default_schema", None, validator=is_null | is_string, remote=True
 )
@@ -496,3 +503,22 @@ class OptionsProxy:
 
 
 options = OptionsProxy()
+
+
+def update_wlm_quota_settings(session_id: str, engine_settings: Dict[str, Any]):
+    engine_quota = engine_settings.get("odps.task.wlm.quota", None)
+    session_quota = options.session.quota_name or None
+    if engine_quota != session_quota and engine_quota:
+        logger.warning(
+            "[Session=%s] Session quota (%s) is different to SubDag engine quota (%s)",
+            session_id,
+            session_quota,
+            engine_quota,
+        )
+        # TODO(renxiang): overwrite or not overwrite
+        return
+
+    if session_quota:
+        engine_settings["odps.task.wlm.quota"] = session_quota
+    elif "odps.task.wlm.quota" in engine_settings:
+        engine_settings.pop("odps.task.wlm.quota")
maxframe/config/tests/test_config.py
CHANGED

@@ -18,7 +18,14 @@ import threading
 
 import pytest
 
-from ..config import Config, is_integer, is_string, option_context, options
+from ..config import (
+    Config,
+    is_integer,
+    is_string,
+    option_context,
+    options,
+    update_wlm_quota_settings,
+)
 
 
 def test_config_context():
@@ -101,3 +108,15 @@ def test_config_copy():
 
     target_cfg.update(src_cfg_dict)
     assert target_cfg.a.b.c == 1
+
+
+def test_update_wlm_quota_settings():
+    with option_context({}):
+        options.session.quota_name = "quota1"
+        engine_settings = {}
+        update_wlm_quota_settings("session_id", engine_settings)
+        assert engine_settings["odps.task.wlm.quota"] == "quota1"
+        options.session.quota_name = None
+        update_wlm_quota_settings("session_id", engine_settings)
+        # TODO(renxiang): overwrite or not overwrite
+        assert "odps.task.wlm.quota" in engine_settings
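Note: the new update_wlm_quota_settings helper prefers the session-level quota when there is no conflict, and backs off with a warning when the engine already carries a different quota. A minimal standalone sketch of that logic (plain arguments instead of maxframe's options object; reconcile_quota is a hypothetical name used only for illustration):

    import logging

    logger = logging.getLogger(__name__)

    def reconcile_quota(session_id, session_quota, engine_settings):
        # Conflicting non-empty engine quota: warn and leave settings untouched.
        engine_quota = engine_settings.get("odps.task.wlm.quota")
        if engine_quota and engine_quota != session_quota:
            logger.warning(
                "[Session=%s] session quota (%s) differs from engine quota (%s)",
                session_id, session_quota, engine_quota,
            )
            return
        # Otherwise the session quota wins; an empty session quota clears the key.
        if session_quota:
            engine_settings["odps.task.wlm.quota"] = session_quota
        else:
            engine_settings.pop("odps.task.wlm.quota", None)

    settings = {}
    reconcile_quota("s1", "quota1", settings)
    assert settings == {"odps.task.wlm.quota": "quota1"}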
maxframe/conftest.py
CHANGED
@@ -40,10 +40,14 @@ def _get_odps_env(test_config: ConfigParser, section_name: str) -> ODPS:
         access_id = test_config.get(section_name, "access_id")
     except NoOptionError:
         access_id = test_config.get("odps", "access_id")
+    if not access_id:
+        access_id = os.getenv("ACCESS_ID")
     try:
         secret_access_key = test_config.get(section_name, "secret_access_key")
     except NoOptionError:
         secret_access_key = test_config.get("odps", "secret_access_key")
+    if not secret_access_key:
+        secret_access_key = os.getenv("SECRET_ACCESS_KEY")
     try:
         project = test_config.get(section_name, "project")
     except NoOptionError:
@@ -119,14 +123,23 @@ def oss_config():
     old_cache_url = options.object_cache_url
 
     try:
-        oss_access_id = config.get("oss", "access_id")
-        oss_secret_access_key = config.get("oss", "secret_access_key")
+        oss_access_id = config.get("oss", "access_id") or os.getenv("ACCESS_ID")
+        oss_secret_access_key = config.get("oss", "secret_access_key") or os.getenv(
+            "SECRET_ACCESS_KEY"
+        )
         oss_bucket_name = config.get("oss", "bucket_name")
        oss_endpoint = config.get("oss", "endpoint")
         oss_rolearn = config.get("oss", "rolearn")
 
         options.service_role_arn = oss_rolearn
-
+        if "test" in oss_endpoint:
+            oss_svc_endpoint = oss_endpoint
+        else:
+            endpoint_parts = oss_endpoint.split(".", 1)
+            if "-internal" not in endpoint_parts[0]:
+                endpoint_parts[0] += "-internal"
+            oss_svc_endpoint = ".".join(endpoint_parts)
+        options.object_cache_url = f"oss://{oss_svc_endpoint}/{oss_bucket_name}"
 
         config.oss_config = (
             oss_access_id,
@@ -141,7 +154,7 @@ def oss_config():
         config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
         config.oss_rolearn = oss_rolearn
         yield config
-    except (
+    except (NoSectionError, NoOptionError, ImportError):
         return None
     finally:
         options.service_role_arn = old_role_arn
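Note: the oss_config fixture above now derives the object-cache URL from an internal variant of the configured OSS endpoint. A small sketch of that rewrite (the helper name to_internal_endpoint is illustrative only):

    def to_internal_endpoint(oss_endpoint: str) -> str:
        # Test endpoints are used verbatim; real endpoints get "-internal"
        # appended to the first host label for in-network access.
        if "test" in oss_endpoint:
            return oss_endpoint
        parts = oss_endpoint.split(".", 1)
        if "-internal" not in parts[0]:
            parts[0] += "-internal"
        return ".".join(parts)

    assert (
        to_internal_endpoint("oss-cn-hangzhou.aliyuncs.com")
        == "oss-cn-hangzhou-internal.aliyuncs.com"
    )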
maxframe/core/operator/base.py
CHANGED
@@ -86,6 +86,8 @@ class SchedulingHint(Serializable):
     # `gpu` indicates that if the operator should be executed on the GPU.
     gpu = BoolField("gpu", default=None)
     priority = Int32Field("priority", default=None)
+    expect_engine = StringField("expect_engine", default=None)
+    expect_resources = DictField("expect_resources", FieldTypes.string, default=None)
 
     @classproperty
     @lru_cache(1)
maxframe/dataframe/arithmetic/tests/test_arithmetic.py
CHANGED

@@ -22,6 +22,7 @@ import pandas as pd
 import pytest
 
 from ....core import OperatorType
+from ....tests.utils import assert_mf_index_dtype
 from ....utils import dataslots
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
@@ -164,7 +165,7 @@ def test_without_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -176,7 +177,7 @@ def test_without_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -370,7 +371,7 @@ def test_with_one_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -403,7 +404,7 @@ def test_with_all_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -433,7 +434,7 @@ def test_with_all_shuffle(func_name, func_opts):
     pd.testing.assert_index_equal(
         df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns
     )
-
+    assert_mf_index_dtype(df6.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df6.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -468,7 +469,7 @@ def test_without_shuffle_and_with_one_chunk(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -501,7 +502,7 @@ def test_both_one_chunk(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -534,7 +535,7 @@ def test_with_shuffle_and_one_chunk(func_name, func_opts):
     pd.testing.assert_index_equal(
         df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
     )
-
+    assert_mf_index_dtype(df3.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -558,7 +559,7 @@ def test_on_same_dataframe(func_name, func_opts):
     pd.testing.assert_index_equal(
         df2.columns_value.to_pandas(), func_opts.func(data, data).columns
     )
-
+    assert_mf_index_dtype(df2.index_value.value, np.int64)
     pd.testing.assert_index_equal(
         df2.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
@@ -590,19 +591,19 @@ def test_dataframe_and_scalar(func_name, func_opts):
     pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
 
     pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result2.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result3.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result4.index_value.value, np.int64)
 
     pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns)
-
+    assert_mf_index_dtype(result5.index_value.value, np.int64)
 
     if "builtin_function_or_method" not in str(type(func_opts.func)):
         # skip NotImplemented test for comparison function
@@ -679,7 +680,7 @@ def test_abs():
     pd.testing.assert_index_equal(
         df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
     )
-
+    assert_mf_index_dtype(df2.index_value.value, np.int64)
     assert df2.shape == (10, 10)
 
 
@@ -697,7 +698,7 @@ def test_not():
     pd.testing.assert_index_equal(
         df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
     )
-
+    assert_mf_index_dtype(df2.index_value.value, np.int64)
     assert df2.shape == (10, 10)
 
maxframe/dataframe/core.py
CHANGED
@@ -142,6 +142,14 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
+        @property
+        def inferred_type(self):
+            return "floating" if self.dtype.kind == "f" else "integer"
+
     class RangeIndex(IndexBase):
         _name = AnyField("name")
         _slice = SliceField("slice")
@@ -243,6 +251,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "integer"
@@ -254,6 +266,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "integer"
@@ -265,6 +281,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "floating"
@@ -1514,8 +1534,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
         refresh_index_value(self)
         refresh_dtypes(self)
 
-    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
-        dtypes = table_meta.pd_column_dtypes
+    def refresh_from_dtypes(self, dtypes: pd.Series) -> None:
         self._dtypes = dtypes
         self._columns_value = parse_index(dtypes.index, store_data=True)
         self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
@@ -1523,6 +1542,9 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
         new_shape[-1] = len(dtypes)
         self._shape = tuple(new_shape)
 
+    def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+        self.refresh_from_dtypes(table_meta.pd_column_dtypes)
+
     @property
     def dtypes(self):
         dt = getattr(self, "_dtypes", None)
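Note: the repeated dtype property added to the index value classes above returns None instead of raising when the _dtype field was never set (for example on partially deserialized metadata). The pattern in isolation, as a minimal sketch:

    class _Example:
        @property
        def dtype(self):
            # getattr with a default swallows the missing-attribute case
            return getattr(self, "_dtype", None)

    e = _Example()
    assert e.dtype is None  # no AttributeError when _dtype is absent
    e._dtype = "int64"
    assert e.dtype == "int64"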
maxframe/dataframe/datasource/read_odps_query.py
CHANGED

@@ -37,6 +37,7 @@ from ...serialization.serializables import (
     SeriesField,
     StringField,
 )
+from ...utils import is_empty
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
 
@@ -57,7 +58,7 @@ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|
 _ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
 
 _SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
-_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
 
 
 @dataclasses.dataclass
@@ -180,23 +181,30 @@ def _parse_full_explain(explain_string: str) -> OdpsSchema:
 
     job_dag = jobs_sector.build_dag()
     indep_job_names = list(job_dag.iter_indep(reverse=True))
-
-
-
-
-
-
-
+    schema_signatures = dict()
+    for job_name in indep_job_names:
+        tasks_sector = jobs_sector.jobs[job_name]
+        task_dag = tasks_sector.build_dag()
+        indep_task_names = list(task_dag.iter_indep(reverse=True))
+        for task_name in indep_task_names:
+            task_sector = tasks_sector.tasks[task_name]
+            if not task_sector.schema:  # pragma: no cover
+                raise ValueError("Cannot detect output schema")
+            if task_sector.output_target != "Screen":
+                raise ValueError("The SQL statement should be an instant query")
+            sig_tuples = sorted(
+                [
+                    (c.column_alias or c.column_name, c.column_type)
+                    for c in task_sector.schema
+                ]
+            )
+            schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
+    if len(schema_signatures) != 1:
         raise ValueError("Only one final task is allowed in SQL statement")
-
-    task_sector = tasks_sector.tasks[indep_task_names[0]]
-    if not task_sector.schema:  # pragma: no cover
-        raise ValueError("Cannot detect output schema")
-    if task_sector.output_target != "Screen":
-        raise ValueError("The SQL statement should be an instant query")
+    schema = list(schema_signatures.values())[0]
     cols = [
         Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
-        for c in
+        for c in schema
     ]
     return OdpsSchema(cols)
 
@@ -209,7 +217,7 @@ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
     fields_str = fields_match.group(1)
     cols = []
     for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
-        cols.append(Column(field, validate_data_type(type_name)))
+        cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
     return OdpsSchema(cols)
 
 
@@ -243,7 +251,7 @@ class DataFrameReadODPSQuery(
         self.columns = columns
 
     def __call__(self, chunk_bytes=None, chunk_size=None):
-        if
+        if is_empty(self.index_columns):
             index_value = parse_index(pd.RangeIndex(0))
         elif len(self.index_columns) == 1:
             index_value = parse_index(
@@ -257,12 +265,18 @@ class DataFrameReadODPSQuery(
             )
             index_value = parse_index(idx)
 
-
+        if self.dtypes is not None:
+            columns_value = parse_index(self.dtypes.index, store_data=True)
+            shape = (np.nan, len(self.dtypes))
+        else:
+            columns_value = None
+            shape = (np.nan, np.nan)
+
         self.output_types = [OutputType.dataframe]
         return self.new_tileable(
             [],
             None,
-            shape=
+            shape=shape,
             dtypes=self.dtypes,
             index_value=index_value,
             columns_value=columns_value,
@@ -278,6 +292,7 @@ def read_odps_query(
     string_as_binary: bool = None,
     sql_hints: Dict[str, str] = None,
     anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
+    skip_schema: bool = False,
     **kw,
 ):
     """
@@ -298,6 +313,10 @@ def read_odps_query(
         User specified SQL hints.
     anonymous_col_prefix: str, optional
         Prefix for anonymous columns, '_anon_col_' by default.
+    skip_schema: bool, optional
+        Skip resolving output schema before execution. Once this is configured,
+        the output DataFrame cannot be inputs of other DataFrame operators
+        before execution.
 
     Returns
     -------
@@ -319,28 +338,39 @@ def read_odps_query(
 
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
-    logger.debug("Explain instance ID: %s", inst.id)
-    explain_str = list(inst.get_task_results().values())[0]
 
-    odps_schema = _parse_explained_schema(explain_str)
-
-    new_columns = []
     col_renames = {}
-
-
-
-
-
-
-
-
-
-
+    if not skip_schema:
+        inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+        logger.debug("Explain instance ID: %s", inst.id)
+        explain_str = list(inst.get_task_results().values())[0]
+
+        try:
+            odps_schema = _parse_explained_schema(explain_str)
+        except ValueError as ex:
+            exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
+            raise exc.with_traceback(ex.__traceback__) from None
+
+        new_columns = []
+        for col in odps_schema.columns:
+            anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+            if anon_match and col.name not in query:
+                new_name = anonymous_col_prefix + anon_match.group(1)
+                col_renames[col.name] = new_name
+                new_columns.append(Column(new_name, col.type))
+            else:
+                new_columns.append(col)
+
+        dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+    else:
+        dtypes = None
 
     if not index_col:
         index_dtypes = None
     else:
+        if dtypes is None:
+            raise ValueError("Cannot configure index_col when skip_schema is True")
+
         if isinstance(index_col, str):
             index_col = [index_col]
         index_col_set = set(index_col)
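Note: a usage sketch of the new skip_schema flag (the table name is a placeholder and a configured ODPS entry is assumed; the asserted behavior matches the tests further below):

    import maxframe.dataframe as md

    # Schema resolution via EXPLAIN is skipped, so dtypes/columns stay
    # unknown until the query actually runs:
    df = md.read_odps_query("SELECT * FROM my_table", skip_schema=True)
    assert df.dtypes is None

    # index_col needs a resolved schema, so combining it with
    # skip_schema=True raises ValueError.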
maxframe/dataframe/datasource/read_odps_table.py
CHANGED

@@ -34,6 +34,7 @@ from ...serialization.serializables import (
     SeriesField,
     StringField,
 )
+from ...utils import is_empty
 from ..core import DataFrame  # noqa: F401
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
@@ -76,7 +77,7 @@ class DataFrameReadODPSTable(
         self.columns = columns
 
     def __call__(self, shape, chunk_bytes=None, chunk_size=None):
-        if
+        if is_empty(self.index_columns):
             if np.isnan(shape[0]):
                 index_value = parse_index(pd.RangeIndex(0))
             else:
@@ -238,7 +239,8 @@ def read_odps_table(
         partitions = [partitions]
 
     append_partitions = append_partitions or any(
-        pt.name in (columns
+        pt.name in (columns if not is_empty(columns) else ())
+        for pt in (table.table_schema.partitions or ())
     )
     op = DataFrameReadODPSTable(
         table_name=table.full_table_name,
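Note: both datasource modules above now route their emptiness checks through the new is_empty helper (maxframe/utils.py is also touched in this release). Its actual implementation is not shown in this diff; the assumed semantics, roughly:

    def is_empty(val) -> bool:
        # Assumed behavior: no value at all, or a container with no elements.
        return val is None or len(val) == 0

    assert is_empty(None) and is_empty([]) and not is_empty(["col1"])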
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
@@ -26,7 +27,14 @@ from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
-from ...core import
+from ...core import (
+    DatetimeIndex,
+    Float64Index,
+    Index,
+    IndexValue,
+    Int64Index,
+    MultiIndex,
+)
 from ..dataframe import from_pandas as from_pandas_df
 from ..date_range import date_range
 from ..from_tensor import (
@@ -36,7 +44,12 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
-from ..read_odps_query import
+from ..read_odps_query import (
+    ColumnSchema,
+    _parse_full_explain,
+    _parse_simple_explain,
+    _resolve_task_sector,
+)
 from ..series import from_pandas as from_pandas_series
 
 ray = lazy_import("ray")
@@ -114,18 +127,22 @@ def test_from_tileable_index():
 
     for o in [df, df[0]]:
         index = o.index
-        assert isinstance(index, Int64Index)
+        assert isinstance(index, (Index, Int64Index))
         assert index.dtype == np.int64
         assert index.name == pd_df.index.name
-        assert isinstance(
+        assert isinstance(
+            index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
+        )
 
     t = mt.random.rand(10, chunk_size=6)
     index = from_tileable(t, name="new_name")
 
-    assert isinstance(index, Float64Index)
+    assert isinstance(index, (Index, Float64Index))
     assert index.dtype == np.float64
     assert index.name == "new_name"
-    assert isinstance(
+    assert isinstance(
+        index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
+    )
 
 
 def test_from_tensor():
@@ -327,7 +344,10 @@ def test_from_odps_query():
     odps_entry.write_table(test_table2, [["A", 10, 4.5]])
 
     with pytest.raises(ValueError) as err_info:
-        read_odps_query(
+        read_odps_query(
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"AS SELECT * FROM {table1_name}"
+        )
     assert "instant query" in err_info.value.args[0]
 
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
@@ -343,6 +363,10 @@ def test_from_odps_query():
         ),
     )
 
+    df = read_odps_query(query1, skip_schema=True)
+    assert df.dtypes is None
+    assert df.columns_value is None
+
     df = read_odps_query(query1, index_col="col1")
     assert df.op.query == query1
     assert df.index_value.name == "col1"
@@ -442,3 +466,31 @@ def test_resolve_simple_explain():
     assert schema.columns[0].type == odps_types.string
     assert schema.columns[1].name == "createdate"
     assert schema.columns[1].type == odps_types.bigint
+
+
+def test_resolve_conditional():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    expected_col_types = {
+        "cs1": "string",
+        "cs2": "string",
+        "ci1": "bigint",
+        "cs3": "string",
+        "cs4": "string",
+        "cs5": "string",
+        "cs6": "string",
+        "cs7": "string",
+        "cs8": "string",
+        "ci2": "int",
+        "ci3": "bigint",
+        "cs9": "string",
+    }
+
+    schema = _parse_full_explain(sector)
+    for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
+        assert col.name == exp_nm
+        assert col.type == odps_types.validate_data_type(exp_tp)