maxframe 1.0.0rc4__cp38-cp38-macosx_10_9_universal2.whl → 1.1.0__cp38-cp38-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +9 -2
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +63 -34
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +26 -110
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +36 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/RECORD +83 -72
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +18 -2
- maxframe_client/session/odps.py +23 -10
- maxframe_client/session/task.py +2 -24
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
maxframe/config/config.py
CHANGED
|
@@ -380,6 +380,9 @@ default_options.register_option(
|
|
|
380
380
|
default_options.register_option(
|
|
381
381
|
"session.enable_schema", None, validator=is_null | is_bool, remote=True
|
|
382
382
|
)
|
|
383
|
+
default_options.register_option(
|
|
384
|
+
"session.enable_high_availability", None, validator=is_null | is_bool, remote=True
|
|
385
|
+
)
|
|
383
386
|
default_options.register_option(
|
|
384
387
|
"session.default_schema", None, validator=is_null | is_string, remote=True
|
|
385
388
|
)
|
maxframe/conftest.py
CHANGED
|
@@ -126,7 +126,14 @@ def oss_config():
|
|
|
126
126
|
oss_rolearn = config.get("oss", "rolearn")
|
|
127
127
|
|
|
128
128
|
options.service_role_arn = oss_rolearn
|
|
129
|
-
|
|
129
|
+
if "test" in oss_endpoint:
|
|
130
|
+
oss_svc_endpoint = oss_endpoint
|
|
131
|
+
else:
|
|
132
|
+
endpoint_parts = oss_endpoint.split(".", 1)
|
|
133
|
+
if "-internal" not in endpoint_parts[0]:
|
|
134
|
+
endpoint_parts[0] += "-internal"
|
|
135
|
+
oss_svc_endpoint = ".".join(endpoint_parts)
|
|
136
|
+
options.object_cache_url = f"oss://{oss_svc_endpoint}/{oss_bucket_name}"
|
|
130
137
|
|
|
131
138
|
config.oss_config = (
|
|
132
139
|
oss_access_id,
|
|
@@ -141,7 +148,7 @@ def oss_config():
|
|
|
141
148
|
config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
|
|
142
149
|
config.oss_rolearn = oss_rolearn
|
|
143
150
|
yield config
|
|
144
|
-
except (
|
|
151
|
+
except (NoSectionError, NoOptionError, ImportError):
|
|
145
152
|
return None
|
|
146
153
|
finally:
|
|
147
154
|
options.service_role_arn = old_role_arn
|
|
Binary file
|
maxframe/core/operator/base.py
CHANGED
|
@@ -86,6 +86,8 @@ class SchedulingHint(Serializable):
|
|
|
86
86
|
# `gpu` indicates whether the operator should be executed on the GPU.
|
|
87
87
|
gpu = BoolField("gpu", default=None)
|
|
88
88
|
priority = Int32Field("priority", default=None)
|
|
89
|
+
expect_engine = StringField("expect_engine", default=None)
|
|
90
|
+
expect_resources = DictField("expect_resources", FieldTypes.string, default=None)
|
|
89
91
|
|
|
90
92
|
@classproperty
|
|
91
93
|
@lru_cache(1)
|
|
@@ -22,6 +22,7 @@ import pandas as pd
|
|
|
22
22
|
import pytest
|
|
23
23
|
|
|
24
24
|
from ....core import OperatorType
|
|
25
|
+
from ....tests.utils import assert_mf_index_dtype
|
|
25
26
|
from ....utils import dataslots
|
|
26
27
|
from ...core import IndexValue
|
|
27
28
|
from ...datasource.dataframe import from_pandas
|
|
@@ -164,7 +165,7 @@ def test_without_shuffle(func_name, func_opts):
|
|
|
164
165
|
pd.testing.assert_index_equal(
|
|
165
166
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
166
167
|
)
|
|
167
|
-
|
|
168
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
168
169
|
pd.testing.assert_index_equal(
|
|
169
170
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
170
171
|
)
|
|
@@ -176,7 +177,7 @@ def test_without_shuffle(func_name, func_opts):
|
|
|
176
177
|
pd.testing.assert_index_equal(
|
|
177
178
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
178
179
|
)
|
|
179
|
-
|
|
180
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
180
181
|
pd.testing.assert_index_equal(
|
|
181
182
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
182
183
|
)
|
|
@@ -370,7 +371,7 @@ def test_with_one_shuffle(func_name, func_opts):
|
|
|
370
371
|
pd.testing.assert_index_equal(
|
|
371
372
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
372
373
|
)
|
|
373
|
-
|
|
374
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
374
375
|
pd.testing.assert_index_equal(
|
|
375
376
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
376
377
|
)
|
|
@@ -403,7 +404,7 @@ def test_with_all_shuffle(func_name, func_opts):
|
|
|
403
404
|
pd.testing.assert_index_equal(
|
|
404
405
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
405
406
|
)
|
|
406
|
-
|
|
407
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
407
408
|
pd.testing.assert_index_equal(
|
|
408
409
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
409
410
|
)
|
|
@@ -433,7 +434,7 @@ def test_with_all_shuffle(func_name, func_opts):
|
|
|
433
434
|
pd.testing.assert_index_equal(
|
|
434
435
|
df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns
|
|
435
436
|
)
|
|
436
|
-
|
|
437
|
+
assert_mf_index_dtype(df6.index_value.value, np.int64)
|
|
437
438
|
pd.testing.assert_index_equal(
|
|
438
439
|
df6.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
439
440
|
)
|
|
@@ -468,7 +469,7 @@ def test_without_shuffle_and_with_one_chunk(func_name, func_opts):
|
|
|
468
469
|
pd.testing.assert_index_equal(
|
|
469
470
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
470
471
|
)
|
|
471
|
-
|
|
472
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
472
473
|
pd.testing.assert_index_equal(
|
|
473
474
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
474
475
|
)
|
|
@@ -501,7 +502,7 @@ def test_both_one_chunk(func_name, func_opts):
|
|
|
501
502
|
pd.testing.assert_index_equal(
|
|
502
503
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
503
504
|
)
|
|
504
|
-
|
|
505
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
505
506
|
pd.testing.assert_index_equal(
|
|
506
507
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
507
508
|
)
|
|
@@ -534,7 +535,7 @@ def test_with_shuffle_and_one_chunk(func_name, func_opts):
|
|
|
534
535
|
pd.testing.assert_index_equal(
|
|
535
536
|
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
|
|
536
537
|
)
|
|
537
|
-
|
|
538
|
+
assert_mf_index_dtype(df3.index_value.value, np.int64)
|
|
538
539
|
pd.testing.assert_index_equal(
|
|
539
540
|
df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
540
541
|
)
|
|
@@ -558,7 +559,7 @@ def test_on_same_dataframe(func_name, func_opts):
|
|
|
558
559
|
pd.testing.assert_index_equal(
|
|
559
560
|
df2.columns_value.to_pandas(), func_opts.func(data, data).columns
|
|
560
561
|
)
|
|
561
|
-
|
|
562
|
+
assert_mf_index_dtype(df2.index_value.value, np.int64)
|
|
562
563
|
pd.testing.assert_index_equal(
|
|
563
564
|
df2.index_value.to_pandas(), pd.Index([], dtype=np.int64)
|
|
564
565
|
)
|
|
@@ -590,19 +591,19 @@ def test_dataframe_and_scalar(func_name, func_opts):
|
|
|
590
591
|
pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
|
|
591
592
|
|
|
592
593
|
pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns)
|
|
593
|
-
|
|
594
|
+
assert_mf_index_dtype(result.index_value.value, np.int64)
|
|
594
595
|
|
|
595
596
|
pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns)
|
|
596
|
-
|
|
597
|
+
assert_mf_index_dtype(result2.index_value.value, np.int64)
|
|
597
598
|
|
|
598
599
|
pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns)
|
|
599
|
-
|
|
600
|
+
assert_mf_index_dtype(result3.index_value.value, np.int64)
|
|
600
601
|
|
|
601
602
|
pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns)
|
|
602
|
-
|
|
603
|
+
assert_mf_index_dtype(result4.index_value.value, np.int64)
|
|
603
604
|
|
|
604
605
|
pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns)
|
|
605
|
-
|
|
606
|
+
assert_mf_index_dtype(result5.index_value.value, np.int64)
|
|
606
607
|
|
|
607
608
|
if "builtin_function_or_method" not in str(type(func_opts.func)):
|
|
608
609
|
# skip NotImplemented test for comparison function
|
|
@@ -679,7 +680,7 @@ def test_abs():
|
|
|
679
680
|
pd.testing.assert_index_equal(
|
|
680
681
|
df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
|
|
681
682
|
)
|
|
682
|
-
|
|
683
|
+
assert_mf_index_dtype(df2.index_value.value, np.int64)
|
|
683
684
|
assert df2.shape == (10, 10)
|
|
684
685
|
|
|
685
686
|
|
|
@@ -697,7 +698,7 @@ def test_not():
|
|
|
697
698
|
pd.testing.assert_index_equal(
|
|
698
699
|
df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
|
|
699
700
|
)
|
|
700
|
-
|
|
701
|
+
assert_mf_index_dtype(df2.index_value.value, np.int64)
|
|
701
702
|
assert df2.shape == (10, 10)
|
|
702
703
|
|
|
703
704
|
|
maxframe/dataframe/core.py
CHANGED
|
@@ -142,6 +142,14 @@ class IndexValue(Serializable):
|
|
|
142
142
|
_data = NDArrayField("data")
|
|
143
143
|
_dtype = DataTypeField("dtype")
|
|
144
144
|
|
|
145
|
+
@property
|
|
146
|
+
def dtype(self):
|
|
147
|
+
return getattr(self, "_dtype", None)
|
|
148
|
+
|
|
149
|
+
@property
|
|
150
|
+
def inferred_type(self):
|
|
151
|
+
return "floating" if self.dtype.kind == "f" else "integer"
|
|
152
|
+
|
|
145
153
|
class RangeIndex(IndexBase):
|
|
146
154
|
_name = AnyField("name")
|
|
147
155
|
_slice = SliceField("slice")
|
|
@@ -243,6 +251,10 @@ class IndexValue(Serializable):
|
|
|
243
251
|
_data = NDArrayField("data")
|
|
244
252
|
_dtype = DataTypeField("dtype")
|
|
245
253
|
|
|
254
|
+
@property
|
|
255
|
+
def dtype(self):
|
|
256
|
+
return getattr(self, "_dtype", None)
|
|
257
|
+
|
|
246
258
|
@property
|
|
247
259
|
def inferred_type(self):
|
|
248
260
|
return "integer"
|
|
@@ -254,6 +266,10 @@ class IndexValue(Serializable):
|
|
|
254
266
|
_data = NDArrayField("data")
|
|
255
267
|
_dtype = DataTypeField("dtype")
|
|
256
268
|
|
|
269
|
+
@property
|
|
270
|
+
def dtype(self):
|
|
271
|
+
return getattr(self, "_dtype", None)
|
|
272
|
+
|
|
257
273
|
@property
|
|
258
274
|
def inferred_type(self):
|
|
259
275
|
return "integer"
|
|
@@ -265,6 +281,10 @@ class IndexValue(Serializable):
|
|
|
265
281
|
_data = NDArrayField("data")
|
|
266
282
|
_dtype = DataTypeField("dtype")
|
|
267
283
|
|
|
284
|
+
@property
|
|
285
|
+
def dtype(self):
|
|
286
|
+
return getattr(self, "_dtype", None)
|
|
287
|
+
|
|
268
288
|
@property
|
|
269
289
|
def inferred_type(self):
|
|
270
290
|
return "floating"
|
|
@@ -1514,8 +1534,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
|
|
|
1514
1534
|
refresh_index_value(self)
|
|
1515
1535
|
refresh_dtypes(self)
|
|
1516
1536
|
|
|
1517
|
-
def
|
|
1518
|
-
dtypes = table_meta.pd_column_dtypes
|
|
1537
|
+
def refresh_from_dtypes(self, dtypes: pd.Series) -> None:
|
|
1519
1538
|
self._dtypes = dtypes
|
|
1520
1539
|
self._columns_value = parse_index(dtypes.index, store_data=True)
|
|
1521
1540
|
self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
|
|
@@ -1523,6 +1542,9 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
|
|
|
1523
1542
|
new_shape[-1] = len(dtypes)
|
|
1524
1543
|
self._shape = tuple(new_shape)
|
|
1525
1544
|
|
|
1545
|
+
def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
|
|
1546
|
+
self.refresh_from_dtypes(table_meta.pd_column_dtypes)
|
|
1547
|
+
|
|
1526
1548
|
@property
|
|
1527
1549
|
def dtypes(self):
|
|
1528
1550
|
dt = getattr(self, "_dtypes", None)
|
|
@@ -57,7 +57,7 @@ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|
|
|
|
57
57
|
_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
|
|
58
58
|
|
|
59
59
|
_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
|
|
60
|
-
_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([
|
|
60
|
+
_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
|
|
61
61
|
|
|
62
62
|
|
|
63
63
|
@dataclasses.dataclass
|
|
@@ -180,23 +180,30 @@ def _parse_full_explain(explain_string: str) -> OdpsSchema:
|
|
|
180
180
|
|
|
181
181
|
job_dag = jobs_sector.build_dag()
|
|
182
182
|
indep_job_names = list(job_dag.iter_indep(reverse=True))
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
183
|
+
schema_signatures = dict()
|
|
184
|
+
for job_name in indep_job_names:
|
|
185
|
+
tasks_sector = jobs_sector.jobs[job_name]
|
|
186
|
+
task_dag = tasks_sector.build_dag()
|
|
187
|
+
indep_task_names = list(task_dag.iter_indep(reverse=True))
|
|
188
|
+
for task_name in indep_task_names:
|
|
189
|
+
task_sector = tasks_sector.tasks[task_name]
|
|
190
|
+
if not task_sector.schema: # pragma: no cover
|
|
191
|
+
raise ValueError("Cannot detect output schema")
|
|
192
|
+
if task_sector.output_target != "Screen":
|
|
193
|
+
raise ValueError("The SQL statement should be an instant query")
|
|
194
|
+
sig_tuples = sorted(
|
|
195
|
+
[
|
|
196
|
+
(c.column_alias or c.column_name, c.column_type)
|
|
197
|
+
for c in task_sector.schema
|
|
198
|
+
]
|
|
199
|
+
)
|
|
200
|
+
schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
|
|
201
|
+
if len(schema_signatures) != 1:
|
|
190
202
|
raise ValueError("Only one final task is allowed in SQL statement")
|
|
191
|
-
|
|
192
|
-
task_sector = tasks_sector.tasks[indep_task_names[0]]
|
|
193
|
-
if not task_sector.schema: # pragma: no cover
|
|
194
|
-
raise ValueError("Cannot detect output schema")
|
|
195
|
-
if task_sector.output_target != "Screen":
|
|
196
|
-
raise ValueError("The SQL statement should be an instant query")
|
|
203
|
+
schema = list(schema_signatures.values())[0]
|
|
197
204
|
cols = [
|
|
198
205
|
Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
|
|
199
|
-
for c in
|
|
206
|
+
for c in schema
|
|
200
207
|
]
|
|
201
208
|
return OdpsSchema(cols)
|
|
202
209
|
|
|
@@ -209,7 +216,7 @@ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
|
|
|
209
216
|
fields_str = fields_match.group(1)
|
|
210
217
|
cols = []
|
|
211
218
|
for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
|
|
212
|
-
cols.append(Column(field, validate_data_type(type_name)))
|
|
219
|
+
cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
|
|
213
220
|
return OdpsSchema(cols)
|
|
214
221
|
|
|
215
222
|
|
|
@@ -257,12 +264,18 @@ class DataFrameReadODPSQuery(
|
|
|
257
264
|
)
|
|
258
265
|
index_value = parse_index(idx)
|
|
259
266
|
|
|
260
|
-
|
|
267
|
+
if self.dtypes is not None:
|
|
268
|
+
columns_value = parse_index(self.dtypes.index, store_data=True)
|
|
269
|
+
shape = (np.nan, len(self.dtypes))
|
|
270
|
+
else:
|
|
271
|
+
columns_value = None
|
|
272
|
+
shape = (np.nan, np.nan)
|
|
273
|
+
|
|
261
274
|
self.output_types = [OutputType.dataframe]
|
|
262
275
|
return self.new_tileable(
|
|
263
276
|
[],
|
|
264
277
|
None,
|
|
265
|
-
shape=
|
|
278
|
+
shape=shape,
|
|
266
279
|
dtypes=self.dtypes,
|
|
267
280
|
index_value=index_value,
|
|
268
281
|
columns_value=columns_value,
|
|
@@ -278,6 +291,7 @@ def read_odps_query(
|
|
|
278
291
|
string_as_binary: bool = None,
|
|
279
292
|
sql_hints: Dict[str, str] = None,
|
|
280
293
|
anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
|
|
294
|
+
skip_schema: bool = False,
|
|
281
295
|
**kw,
|
|
282
296
|
):
|
|
283
297
|
"""
|
|
@@ -298,6 +312,10 @@ def read_odps_query(
|
|
|
298
312
|
User specified SQL hints.
|
|
299
313
|
anonymous_col_prefix: str, optional
|
|
300
314
|
Prefix for anonymous columns, '_anon_col_' by default.
|
|
315
|
+
skip_schema: bool, optional
|
|
316
|
+
Skip resolving output schema before execution. Once this is configured,
|
|
317
|
+
the output DataFrame cannot be used as input to other DataFrame operators
|
|
318
|
+
before execution.
|
|
301
319
|
|
|
302
320
|
Returns
|
|
303
321
|
-------
|
|
@@ -319,28 +337,39 @@ def read_odps_query(
|
|
|
319
337
|
|
|
320
338
|
if odps_entry is None:
|
|
321
339
|
raise ValueError("Missing odps_entry parameter")
|
|
322
|
-
inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
|
|
323
|
-
logger.debug("Explain instance ID: %s", inst.id)
|
|
324
|
-
explain_str = list(inst.get_task_results().values())[0]
|
|
325
340
|
|
|
326
|
-
odps_schema = _parse_explained_schema(explain_str)
|
|
327
|
-
|
|
328
|
-
new_columns = []
|
|
329
341
|
col_renames = {}
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
342
|
+
if not skip_schema:
|
|
343
|
+
inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
|
|
344
|
+
logger.debug("Explain instance ID: %s", inst.id)
|
|
345
|
+
explain_str = list(inst.get_task_results().values())[0]
|
|
346
|
+
|
|
347
|
+
try:
|
|
348
|
+
odps_schema = _parse_explained_schema(explain_str)
|
|
349
|
+
except ValueError as ex:
|
|
350
|
+
exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
|
|
351
|
+
raise exc.with_traceback(ex.__traceback__) from None
|
|
352
|
+
|
|
353
|
+
new_columns = []
|
|
354
|
+
for col in odps_schema.columns:
|
|
355
|
+
anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
|
|
356
|
+
if anon_match and col.name not in query:
|
|
357
|
+
new_name = anonymous_col_prefix + anon_match.group(1)
|
|
358
|
+
col_renames[col.name] = new_name
|
|
359
|
+
new_columns.append(Column(new_name, col.type))
|
|
360
|
+
else:
|
|
361
|
+
new_columns.append(col)
|
|
362
|
+
|
|
363
|
+
dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
|
|
364
|
+
else:
|
|
365
|
+
dtypes = None
|
|
340
366
|
|
|
341
367
|
if not index_col:
|
|
342
368
|
index_dtypes = None
|
|
343
369
|
else:
|
|
370
|
+
if dtypes is None:
|
|
371
|
+
raise ValueError("Cannot configure index_col when skip_schema is True")
|
|
372
|
+
|
|
344
373
|
if isinstance(index_col, str):
|
|
345
374
|
index_col = [index_col]
|
|
346
375
|
index_col_set = set(index_col)
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import os
|
|
16
|
+
import uuid
|
|
16
17
|
from collections import OrderedDict
|
|
17
18
|
|
|
18
19
|
import numpy as np
|
|
@@ -26,7 +27,14 @@ from ....core import OutputType
|
|
|
26
27
|
from ....tests.utils import tn
|
|
27
28
|
from ....utils import lazy_import
|
|
28
29
|
from ... import read_odps_query, read_odps_table
|
|
29
|
-
from ...core import
|
|
30
|
+
from ...core import (
|
|
31
|
+
DatetimeIndex,
|
|
32
|
+
Float64Index,
|
|
33
|
+
Index,
|
|
34
|
+
IndexValue,
|
|
35
|
+
Int64Index,
|
|
36
|
+
MultiIndex,
|
|
37
|
+
)
|
|
30
38
|
from ..dataframe import from_pandas as from_pandas_df
|
|
31
39
|
from ..date_range import date_range
|
|
32
40
|
from ..from_tensor import (
|
|
@@ -36,7 +44,12 @@ from ..from_tensor import (
|
|
|
36
44
|
)
|
|
37
45
|
from ..index import from_pandas as from_pandas_index
|
|
38
46
|
from ..index import from_tileable
|
|
39
|
-
from ..read_odps_query import
|
|
47
|
+
from ..read_odps_query import (
|
|
48
|
+
ColumnSchema,
|
|
49
|
+
_parse_full_explain,
|
|
50
|
+
_parse_simple_explain,
|
|
51
|
+
_resolve_task_sector,
|
|
52
|
+
)
|
|
40
53
|
from ..series import from_pandas as from_pandas_series
|
|
41
54
|
|
|
42
55
|
ray = lazy_import("ray")
|
|
@@ -114,18 +127,22 @@ def test_from_tileable_index():
|
|
|
114
127
|
|
|
115
128
|
for o in [df, df[0]]:
|
|
116
129
|
index = o.index
|
|
117
|
-
assert isinstance(index, Int64Index)
|
|
130
|
+
assert isinstance(index, (Index, Int64Index))
|
|
118
131
|
assert index.dtype == np.int64
|
|
119
132
|
assert index.name == pd_df.index.name
|
|
120
|
-
assert isinstance(
|
|
133
|
+
assert isinstance(
|
|
134
|
+
index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
|
|
135
|
+
)
|
|
121
136
|
|
|
122
137
|
t = mt.random.rand(10, chunk_size=6)
|
|
123
138
|
index = from_tileable(t, name="new_name")
|
|
124
139
|
|
|
125
|
-
assert isinstance(index, Float64Index)
|
|
140
|
+
assert isinstance(index, (Index, Float64Index))
|
|
126
141
|
assert index.dtype == np.float64
|
|
127
142
|
assert index.name == "new_name"
|
|
128
|
-
assert isinstance(
|
|
143
|
+
assert isinstance(
|
|
144
|
+
index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
|
|
145
|
+
)
|
|
129
146
|
|
|
130
147
|
|
|
131
148
|
def test_from_tensor():
|
|
@@ -327,7 +344,10 @@ def test_from_odps_query():
|
|
|
327
344
|
odps_entry.write_table(test_table2, [["A", 10, 4.5]])
|
|
328
345
|
|
|
329
346
|
with pytest.raises(ValueError) as err_info:
|
|
330
|
-
read_odps_query(
|
|
347
|
+
read_odps_query(
|
|
348
|
+
f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
|
|
349
|
+
f"AS SELECT * FROM {table1_name}"
|
|
350
|
+
)
|
|
331
351
|
assert "instant query" in err_info.value.args[0]
|
|
332
352
|
|
|
333
353
|
query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
|
|
@@ -343,6 +363,10 @@ def test_from_odps_query():
|
|
|
343
363
|
),
|
|
344
364
|
)
|
|
345
365
|
|
|
366
|
+
df = read_odps_query(query1, skip_schema=True)
|
|
367
|
+
assert df.dtypes is None
|
|
368
|
+
assert df.columns_value is None
|
|
369
|
+
|
|
346
370
|
df = read_odps_query(query1, index_col="col1")
|
|
347
371
|
assert df.op.query == query1
|
|
348
372
|
assert df.index_value.name == "col1"
|
|
@@ -442,3 +466,31 @@ def test_resolve_simple_explain():
|
|
|
442
466
|
assert schema.columns[0].type == odps_types.string
|
|
443
467
|
assert schema.columns[1].name == "createdate"
|
|
444
468
|
assert schema.columns[1].type == odps_types.bigint
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def test_resolve_conditional():
|
|
472
|
+
input_path = os.path.join(
|
|
473
|
+
os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
|
|
474
|
+
)
|
|
475
|
+
with open(input_path, "r") as f:
|
|
476
|
+
sector = f.read()
|
|
477
|
+
|
|
478
|
+
expected_col_types = {
|
|
479
|
+
"cs1": "string",
|
|
480
|
+
"cs2": "string",
|
|
481
|
+
"ci1": "bigint",
|
|
482
|
+
"cs3": "string",
|
|
483
|
+
"cs4": "string",
|
|
484
|
+
"cs5": "string",
|
|
485
|
+
"cs6": "string",
|
|
486
|
+
"cs7": "string",
|
|
487
|
+
"cs8": "string",
|
|
488
|
+
"ci2": "int",
|
|
489
|
+
"ci3": "bigint",
|
|
490
|
+
"cs9": "string",
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
schema = _parse_full_explain(sector)
|
|
494
|
+
for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
|
|
495
|
+
assert col.name == exp_nm
|
|
496
|
+
assert col.type == odps_types.validate_data_type(exp_tp)
|
|
@@ -18,6 +18,8 @@ from .accessor import (
|
|
|
18
18
|
IndexMaxFrameAccessor,
|
|
19
19
|
SeriesMaxFrameAccessor,
|
|
20
20
|
)
|
|
21
|
+
from .apply_chunk import df_apply_chunk, series_apply_chunk
|
|
22
|
+
from .flatjson import series_flatjson
|
|
21
23
|
from .flatmap import df_flatmap, series_flatmap
|
|
22
24
|
from .reshuffle import DataFrameReshuffle, df_reshuffle
|
|
23
25
|
|
|
@@ -27,7 +29,10 @@ def _install():
|
|
|
27
29
|
|
|
28
30
|
DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
|
|
29
31
|
DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
|
|
32
|
+
DataFrameMaxFrameAccessor._register("apply_chunk", df_apply_chunk)
|
|
30
33
|
SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
|
|
34
|
+
SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
|
|
35
|
+
SeriesMaxFrameAccessor._register("apply_chunk", series_apply_chunk)
|
|
31
36
|
|
|
32
37
|
if DataFrameMaxFrameAccessor._api_count:
|
|
33
38
|
for t in DATAFRAME_TYPE:
|