maxframe 0.1.0b5__cp311-cp311-macosx_10_9_universal2.whl → 1.0.0__cp311-cp311-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +10 -4
- maxframe/config/config.py +68 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +31 -7
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +117 -23
- maxframe/dataframe/datasource/read_odps_table.py +6 -3
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +33 -2
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +26 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +29 -46
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +29 -18
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +8 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +3 -3
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +106 -86
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +81 -74
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +194 -40
- maxframe_client/session/task.py +94 -39
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +109 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import dataclasses
|
|
16
|
+
import logging
|
|
16
17
|
import re
|
|
17
18
|
from typing import Dict, List, Optional, Tuple, Union
|
|
18
19
|
|
|
@@ -22,12 +23,14 @@ from odps import ODPS
|
|
|
22
23
|
from odps.types import Column, OdpsSchema, validate_data_type
|
|
23
24
|
|
|
24
25
|
from ... import opcodes
|
|
26
|
+
from ...config import options
|
|
25
27
|
from ...core import OutputType
|
|
26
28
|
from ...core.graph import DAG
|
|
27
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
29
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
28
30
|
from ...serialization.serializables import (
|
|
29
31
|
AnyField,
|
|
30
32
|
BoolField,
|
|
33
|
+
DictField,
|
|
31
34
|
FieldTypes,
|
|
32
35
|
Int64Field,
|
|
33
36
|
ListField,
|
|
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
|
|
|
37
40
|
from ..utils import parse_index
|
|
38
41
|
from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
|
|
39
42
|
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
|
|
46
|
+
|
|
40
47
|
_EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
|
|
41
48
|
_EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
|
|
42
49
|
_EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
|
|
@@ -46,7 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
|
|
|
46
53
|
r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
|
|
47
54
|
re.MULTILINE,
|
|
48
55
|
)
|
|
49
|
-
_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([
|
|
56
|
+
_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
|
|
57
|
+
_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
|
|
58
|
+
|
|
59
|
+
_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
|
|
60
|
+
_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
|
|
50
61
|
|
|
51
62
|
|
|
52
63
|
@dataclasses.dataclass
|
|
@@ -151,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
|
|
|
151
162
|
return TaskSector(job_name, task_name, out_target, schemas)
|
|
152
163
|
|
|
153
164
|
|
|
154
|
-
def
|
|
165
|
+
def _parse_full_explain(explain_string: str) -> OdpsSchema:
|
|
155
166
|
sectors = _split_explain_string(explain_string)
|
|
156
167
|
jobs_sector = tasks_sector = None
|
|
157
168
|
|
|
@@ -169,27 +180,53 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
|
|
|
169
180
|
|
|
170
181
|
job_dag = jobs_sector.build_dag()
|
|
171
182
|
indep_job_names = list(job_dag.iter_indep(reverse=True))
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
183
|
+
schema_signatures = dict()
|
|
184
|
+
for job_name in indep_job_names:
|
|
185
|
+
tasks_sector = jobs_sector.jobs[job_name]
|
|
186
|
+
task_dag = tasks_sector.build_dag()
|
|
187
|
+
indep_task_names = list(task_dag.iter_indep(reverse=True))
|
|
188
|
+
for task_name in indep_task_names:
|
|
189
|
+
task_sector = tasks_sector.tasks[task_name]
|
|
190
|
+
if not task_sector.schema: # pragma: no cover
|
|
191
|
+
raise ValueError("Cannot detect output schema")
|
|
192
|
+
if task_sector.output_target != "Screen":
|
|
193
|
+
raise ValueError("The SQL statement should be an instant query")
|
|
194
|
+
sig_tuples = sorted(
|
|
195
|
+
[
|
|
196
|
+
(c.column_alias or c.column_name, c.column_type)
|
|
197
|
+
for c in task_sector.schema
|
|
198
|
+
]
|
|
199
|
+
)
|
|
200
|
+
schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
|
|
201
|
+
if len(schema_signatures) != 1:
|
|
179
202
|
raise ValueError("Only one final task is allowed in SQL statement")
|
|
180
|
-
|
|
181
|
-
task_sector = tasks_sector.tasks[indep_task_names[0]]
|
|
182
|
-
if not task_sector.schema: # pragma: no cover
|
|
183
|
-
raise ValueError("Cannot detect output schema")
|
|
184
|
-
if task_sector.output_target != "Screen":
|
|
185
|
-
raise ValueError("The SQL statement should be an instant query")
|
|
203
|
+
schema = list(schema_signatures.values())[0]
|
|
186
204
|
cols = [
|
|
187
205
|
Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
|
|
188
|
-
for c in
|
|
206
|
+
for c in schema
|
|
189
207
|
]
|
|
190
208
|
return OdpsSchema(cols)
|
|
191
209
|
|
|
192
210
|
|
|
211
|
+
def _parse_simple_explain(explain_string: str) -> OdpsSchema:
|
|
212
|
+
fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
|
|
213
|
+
if not fields_match:
|
|
214
|
+
raise ValueError("Cannot detect output table schema")
|
|
215
|
+
|
|
216
|
+
fields_str = fields_match.group(1)
|
|
217
|
+
cols = []
|
|
218
|
+
for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
|
|
219
|
+
cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
|
|
220
|
+
return OdpsSchema(cols)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _parse_explained_schema(explain_string: str) -> OdpsSchema:
|
|
224
|
+
if explain_string.startswith("AdhocSink"):
|
|
225
|
+
return _parse_simple_explain(explain_string)
|
|
226
|
+
else:
|
|
227
|
+
return _parse_full_explain(explain_string)
|
|
228
|
+
|
|
229
|
+
|
|
193
230
|
class DataFrameReadODPSQuery(
|
|
194
231
|
IncrementalIndexDatasource,
|
|
195
232
|
ColumnPruneSupportedDataSourceMixin,
|
|
@@ -204,6 +241,7 @@ class DataFrameReadODPSQuery(
|
|
|
204
241
|
string_as_binary = BoolField("string_as_binary", default=None)
|
|
205
242
|
index_columns = ListField("index_columns", FieldTypes.string, default=None)
|
|
206
243
|
index_dtypes = SeriesField("index_dtypes", default=None)
|
|
244
|
+
column_renames = DictField("column_renames", default=None)
|
|
207
245
|
|
|
208
246
|
def get_columns(self):
|
|
209
247
|
return self.columns
|
|
@@ -226,12 +264,18 @@ class DataFrameReadODPSQuery(
|
|
|
226
264
|
)
|
|
227
265
|
index_value = parse_index(idx)
|
|
228
266
|
|
|
229
|
-
|
|
267
|
+
if self.dtypes is not None:
|
|
268
|
+
columns_value = parse_index(self.dtypes.index, store_data=True)
|
|
269
|
+
shape = (np.nan, len(self.dtypes))
|
|
270
|
+
else:
|
|
271
|
+
columns_value = None
|
|
272
|
+
shape = (np.nan, np.nan)
|
|
273
|
+
|
|
230
274
|
self.output_types = [OutputType.dataframe]
|
|
231
275
|
return self.new_tileable(
|
|
232
276
|
[],
|
|
233
277
|
None,
|
|
234
|
-
shape=
|
|
278
|
+
shape=shape,
|
|
235
279
|
dtypes=self.dtypes,
|
|
236
280
|
index_value=index_value,
|
|
237
281
|
columns_value=columns_value,
|
|
@@ -245,6 +289,9 @@ def read_odps_query(
|
|
|
245
289
|
odps_entry: ODPS = None,
|
|
246
290
|
index_col: Union[None, str, List[str]] = None,
|
|
247
291
|
string_as_binary: bool = None,
|
|
292
|
+
sql_hints: Dict[str, str] = None,
|
|
293
|
+
anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
|
|
294
|
+
skip_schema: bool = False,
|
|
248
295
|
**kw,
|
|
249
296
|
):
|
|
250
297
|
"""
|
|
@@ -259,24 +306,70 @@ def read_odps_query(
|
|
|
259
306
|
MaxCompute SQL statement.
|
|
260
307
|
index_col: Union[None, str, List[str]]
|
|
261
308
|
Columns to be specified as indexes.
|
|
309
|
+
string_as_binary: bool, optional
|
|
310
|
+
Whether to convert string columns to binary.
|
|
311
|
+
sql_hints: Dict[str, str], optional
|
|
312
|
+
User specified SQL hints.
|
|
313
|
+
anonymous_col_prefix: str, optional
|
|
314
|
+
Prefix for anonymous columns, '_anon_col_' by default.
|
|
315
|
+
skip_schema: bool, optional
|
|
316
|
+
Skip resolving output schema before execution. Once this is configured,
|
|
317
|
+
the output DataFrame cannot be inputs of other DataFrame operators
|
|
318
|
+
before execution.
|
|
262
319
|
|
|
263
320
|
Returns
|
|
264
321
|
-------
|
|
265
322
|
result: DataFrame
|
|
266
323
|
DataFrame read from MaxCompute (ODPS) table
|
|
267
324
|
"""
|
|
325
|
+
hints = options.sql.settings.copy() or {}
|
|
326
|
+
if sql_hints:
|
|
327
|
+
hints.update(sql_hints)
|
|
328
|
+
|
|
268
329
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
330
|
+
|
|
331
|
+
if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
|
|
332
|
+
hints["odps.namespace.schema"] = "true"
|
|
333
|
+
hints["odps.sql.allow.namespace.schema"] = "true"
|
|
334
|
+
|
|
335
|
+
# fixme workaround for multi-stage split process
|
|
336
|
+
hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
|
|
337
|
+
|
|
269
338
|
if odps_entry is None:
|
|
270
339
|
raise ValueError("Missing odps_entry parameter")
|
|
271
|
-
inst = odps_entry.execute_sql(f"EXPLAIN {query}")
|
|
272
|
-
explain_str = list(inst.get_task_results().values())[0]
|
|
273
340
|
|
|
274
|
-
|
|
275
|
-
|
|
341
|
+
col_renames = {}
|
|
342
|
+
if not skip_schema:
|
|
343
|
+
inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
|
|
344
|
+
logger.debug("Explain instance ID: %s", inst.id)
|
|
345
|
+
explain_str = list(inst.get_task_results().values())[0]
|
|
346
|
+
|
|
347
|
+
try:
|
|
348
|
+
odps_schema = _parse_explained_schema(explain_str)
|
|
349
|
+
except ValueError as ex:
|
|
350
|
+
exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
|
|
351
|
+
raise exc.with_traceback(ex.__traceback__) from None
|
|
352
|
+
|
|
353
|
+
new_columns = []
|
|
354
|
+
for col in odps_schema.columns:
|
|
355
|
+
anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
|
|
356
|
+
if anon_match and col.name not in query:
|
|
357
|
+
new_name = anonymous_col_prefix + anon_match.group(1)
|
|
358
|
+
col_renames[col.name] = new_name
|
|
359
|
+
new_columns.append(Column(new_name, col.type))
|
|
360
|
+
else:
|
|
361
|
+
new_columns.append(col)
|
|
362
|
+
|
|
363
|
+
dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
|
|
364
|
+
else:
|
|
365
|
+
dtypes = None
|
|
276
366
|
|
|
277
367
|
if not index_col:
|
|
278
368
|
index_dtypes = None
|
|
279
369
|
else:
|
|
370
|
+
if dtypes is None:
|
|
371
|
+
raise ValueError("Cannot configure index_col when skip_schema is True")
|
|
372
|
+
|
|
280
373
|
if isinstance(index_col, str):
|
|
281
374
|
index_col = [index_col]
|
|
282
375
|
index_col_set = set(index_col)
|
|
@@ -295,5 +388,6 @@ def read_odps_query(
|
|
|
295
388
|
string_as_binary=string_as_binary,
|
|
296
389
|
index_columns=index_col,
|
|
297
390
|
index_dtypes=index_dtypes,
|
|
391
|
+
column_renames=col_renames,
|
|
298
392
|
)
|
|
299
393
|
return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
|
|
@@ -22,8 +22,9 @@ from odps.models import Table
|
|
|
22
22
|
from odps.utils import to_timestamp
|
|
23
23
|
|
|
24
24
|
from ... import opcodes
|
|
25
|
+
from ...config import options
|
|
25
26
|
from ...core import OutputType
|
|
26
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
27
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
27
28
|
from ...serialization.serializables import (
|
|
28
29
|
AnyField,
|
|
29
30
|
BoolField,
|
|
@@ -119,9 +120,10 @@ class DataFrameReadODPSTable(
|
|
|
119
120
|
return self.new_tileable(
|
|
120
121
|
[],
|
|
121
122
|
None,
|
|
122
|
-
shape=shape,
|
|
123
|
+
shape=shape[:1],
|
|
123
124
|
name=getattr(index_value, "name", None),
|
|
124
125
|
names=getattr(index_value, "names", None),
|
|
126
|
+
dtype=self.index_dtypes.iloc[0],
|
|
125
127
|
index_value=index_value,
|
|
126
128
|
chunk_bytes=chunk_bytes,
|
|
127
129
|
chunk_size=chunk_size,
|
|
@@ -166,12 +168,13 @@ def read_odps_table(
|
|
|
166
168
|
DataFrame read from MaxCompute (ODPS) table
|
|
167
169
|
"""
|
|
168
170
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
171
|
+
schema = options.session.default_schema or odps_entry.schema
|
|
169
172
|
if odps_entry is None:
|
|
170
173
|
raise ValueError("Missing odps_entry parameter")
|
|
171
174
|
if isinstance(table_name, Table):
|
|
172
175
|
table = table_name
|
|
173
176
|
else:
|
|
174
|
-
table = odps_entry.get_table(table_name)
|
|
177
|
+
table = odps_entry.get_table(table_name, schema=schema)
|
|
175
178
|
|
|
176
179
|
if not table.table_schema.partitions and (
|
|
177
180
|
partitions is not None or append_partitions
|
|
@@ -13,18 +13,28 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import os
|
|
16
|
+
import uuid
|
|
16
17
|
from collections import OrderedDict
|
|
17
18
|
|
|
18
19
|
import numpy as np
|
|
19
20
|
import pandas as pd
|
|
20
21
|
import pytest
|
|
21
22
|
from odps import ODPS
|
|
23
|
+
from odps import types as odps_types
|
|
22
24
|
|
|
23
25
|
from .... import tensor as mt
|
|
26
|
+
from ....core import OutputType
|
|
24
27
|
from ....tests.utils import tn
|
|
25
28
|
from ....utils import lazy_import
|
|
26
29
|
from ... import read_odps_query, read_odps_table
|
|
27
|
-
from ...core import
|
|
30
|
+
from ...core import (
|
|
31
|
+
DatetimeIndex,
|
|
32
|
+
Float64Index,
|
|
33
|
+
Index,
|
|
34
|
+
IndexValue,
|
|
35
|
+
Int64Index,
|
|
36
|
+
MultiIndex,
|
|
37
|
+
)
|
|
28
38
|
from ..dataframe import from_pandas as from_pandas_df
|
|
29
39
|
from ..date_range import date_range
|
|
30
40
|
from ..from_tensor import (
|
|
@@ -34,7 +44,12 @@ from ..from_tensor import (
|
|
|
34
44
|
)
|
|
35
45
|
from ..index import from_pandas as from_pandas_index
|
|
36
46
|
from ..index import from_tileable
|
|
37
|
-
from ..read_odps_query import
|
|
47
|
+
from ..read_odps_query import (
|
|
48
|
+
ColumnSchema,
|
|
49
|
+
_parse_full_explain,
|
|
50
|
+
_parse_simple_explain,
|
|
51
|
+
_resolve_task_sector,
|
|
52
|
+
)
|
|
38
53
|
from ..series import from_pandas as from_pandas_series
|
|
39
54
|
|
|
40
55
|
ray = lazy_import("ray")
|
|
@@ -112,18 +127,22 @@ def test_from_tileable_index():
|
|
|
112
127
|
|
|
113
128
|
for o in [df, df[0]]:
|
|
114
129
|
index = o.index
|
|
115
|
-
assert isinstance(index, Int64Index)
|
|
130
|
+
assert isinstance(index, (Index, Int64Index))
|
|
116
131
|
assert index.dtype == np.int64
|
|
117
132
|
assert index.name == pd_df.index.name
|
|
118
|
-
assert isinstance(
|
|
133
|
+
assert isinstance(
|
|
134
|
+
index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
|
|
135
|
+
)
|
|
119
136
|
|
|
120
137
|
t = mt.random.rand(10, chunk_size=6)
|
|
121
138
|
index = from_tileable(t, name="new_name")
|
|
122
139
|
|
|
123
|
-
assert isinstance(index, Float64Index)
|
|
140
|
+
assert isinstance(index, (Index, Float64Index))
|
|
124
141
|
assert index.dtype == np.float64
|
|
125
142
|
assert index.name == "new_name"
|
|
126
|
-
assert isinstance(
|
|
143
|
+
assert isinstance(
|
|
144
|
+
index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
|
|
145
|
+
)
|
|
127
146
|
|
|
128
147
|
|
|
129
148
|
def test_from_tensor():
|
|
@@ -295,6 +314,15 @@ def test_from_odps_table():
|
|
|
295
314
|
),
|
|
296
315
|
)
|
|
297
316
|
|
|
317
|
+
out_idx = read_odps_table(
|
|
318
|
+
test_table,
|
|
319
|
+
columns=[],
|
|
320
|
+
index_col=["col1", "col2"],
|
|
321
|
+
output_type=OutputType.index,
|
|
322
|
+
)
|
|
323
|
+
assert out_idx.names == ["col1", "col2"]
|
|
324
|
+
assert out_idx.shape == (np.nan,)
|
|
325
|
+
|
|
298
326
|
test_table.drop()
|
|
299
327
|
test_parted_table.drop()
|
|
300
328
|
|
|
@@ -316,7 +344,10 @@ def test_from_odps_query():
|
|
|
316
344
|
odps_entry.write_table(test_table2, [["A", 10, 4.5]])
|
|
317
345
|
|
|
318
346
|
with pytest.raises(ValueError) as err_info:
|
|
319
|
-
read_odps_query(
|
|
347
|
+
read_odps_query(
|
|
348
|
+
f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
|
|
349
|
+
f"AS SELECT * FROM {table1_name}"
|
|
350
|
+
)
|
|
320
351
|
assert "instant query" in err_info.value.args[0]
|
|
321
352
|
|
|
322
353
|
query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
|
|
@@ -332,6 +363,10 @@ def test_from_odps_query():
|
|
|
332
363
|
),
|
|
333
364
|
)
|
|
334
365
|
|
|
366
|
+
df = read_odps_query(query1, skip_schema=True)
|
|
367
|
+
assert df.dtypes is None
|
|
368
|
+
assert df.columns_value is None
|
|
369
|
+
|
|
335
370
|
df = read_odps_query(query1, index_col="col1")
|
|
336
371
|
assert df.op.query == query1
|
|
337
372
|
assert df.index_value.name == "col1"
|
|
@@ -387,7 +422,9 @@ def test_date_range():
|
|
|
387
422
|
|
|
388
423
|
|
|
389
424
|
def test_resolve_task_sector():
|
|
390
|
-
input_path = os.path.join(
|
|
425
|
+
input_path = os.path.join(
|
|
426
|
+
os.path.dirname(__file__), "test-data", "task-input-full.txt"
|
|
427
|
+
)
|
|
391
428
|
with open(input_path, "r") as f:
|
|
392
429
|
sector = f.read()
|
|
393
430
|
actual_sector = _resolve_task_sector("job0", sector)
|
|
@@ -399,3 +436,61 @@ def test_resolve_task_sector():
|
|
|
399
436
|
assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
|
|
400
437
|
assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
|
|
401
438
|
assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def test_resolve_task_odps2():
|
|
442
|
+
input_path = os.path.join(
|
|
443
|
+
os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
|
|
444
|
+
)
|
|
445
|
+
with open(input_path, "r") as f:
|
|
446
|
+
sector = f.read()
|
|
447
|
+
actual_sector = _resolve_task_sector("job0", sector)
|
|
448
|
+
|
|
449
|
+
assert actual_sector.job_name == "job0"
|
|
450
|
+
assert actual_sector.task_name == "M1"
|
|
451
|
+
assert actual_sector.output_target == "Screen"
|
|
452
|
+
assert len(actual_sector.schema) == 2
|
|
453
|
+
assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
|
|
454
|
+
assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def test_resolve_simple_explain():
|
|
458
|
+
input_path = os.path.join(
|
|
459
|
+
os.path.dirname(__file__), "test-data", "task-input-simple.txt"
|
|
460
|
+
)
|
|
461
|
+
with open(input_path, "r") as f:
|
|
462
|
+
sector = f.read()
|
|
463
|
+
|
|
464
|
+
schema = _parse_simple_explain(sector)
|
|
465
|
+
assert schema.columns[0].name == "memberid"
|
|
466
|
+
assert schema.columns[0].type == odps_types.string
|
|
467
|
+
assert schema.columns[1].name == "createdate"
|
|
468
|
+
assert schema.columns[1].type == odps_types.bigint
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def test_resolve_conditional():
|
|
472
|
+
input_path = os.path.join(
|
|
473
|
+
os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
|
|
474
|
+
)
|
|
475
|
+
with open(input_path, "r") as f:
|
|
476
|
+
sector = f.read()
|
|
477
|
+
|
|
478
|
+
expected_col_types = {
|
|
479
|
+
"cs1": "string",
|
|
480
|
+
"cs2": "string",
|
|
481
|
+
"ci1": "bigint",
|
|
482
|
+
"cs3": "string",
|
|
483
|
+
"cs4": "string",
|
|
484
|
+
"cs5": "string",
|
|
485
|
+
"cs6": "string",
|
|
486
|
+
"cs7": "string",
|
|
487
|
+
"cs8": "string",
|
|
488
|
+
"ci2": "int",
|
|
489
|
+
"ci3": "bigint",
|
|
490
|
+
"cs9": "string",
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
schema = _parse_full_explain(sector)
|
|
494
|
+
for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
|
|
495
|
+
assert col.name == exp_nm
|
|
496
|
+
assert col.type == odps_types.validate_data_type(exp_tp)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
|
|
17
|
+
from ... import DataFrame
|
|
18
|
+
from ..to_odps import to_odps_table
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture
|
|
22
|
+
def df():
|
|
23
|
+
return DataFrame({"A": [1, 2], "B": [3, 4]})
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.mark.parametrize(
|
|
27
|
+
"kwargs",
|
|
28
|
+
[
|
|
29
|
+
{"partition_col": ["A", "C"]},
|
|
30
|
+
{"partition_col": "C"},
|
|
31
|
+
{"partition": "a=1,C=2"},
|
|
32
|
+
],
|
|
33
|
+
)
|
|
34
|
+
def test_to_odps_table_validation(df, kwargs):
|
|
35
|
+
with pytest.raises(ValueError):
|
|
36
|
+
to_odps_table(df, "test_table", **kwargs)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize(
|
|
40
|
+
"kwargs",
|
|
41
|
+
[
|
|
42
|
+
{"partition_col": ["a", "B"]},
|
|
43
|
+
{"partition_col": "a"},
|
|
44
|
+
{"partition": "C=1,d=2"},
|
|
45
|
+
],
|
|
46
|
+
)
|
|
47
|
+
def test_to_odps_table_vaild(df, kwargs):
|
|
48
|
+
to_odps_table(df, "test_table", **kwargs)
|
|
@@ -17,11 +17,14 @@
|
|
|
17
17
|
import logging
|
|
18
18
|
from typing import List, Optional, Union
|
|
19
19
|
|
|
20
|
+
from odps import ODPS
|
|
20
21
|
from odps.models import Table as ODPSTable
|
|
22
|
+
from odps.types import PartitionSpec
|
|
21
23
|
|
|
22
24
|
from ... import opcodes
|
|
23
25
|
from ...config import options
|
|
24
26
|
from ...core import OutputType
|
|
27
|
+
from ...io.odpsio import build_dataframe_table_meta
|
|
25
28
|
from ...serialization.serializables import (
|
|
26
29
|
BoolField,
|
|
27
30
|
FieldTypes,
|
|
@@ -134,8 +137,14 @@ def to_odps_table(
|
|
|
134
137
|
--------
|
|
135
138
|
|
|
136
139
|
"""
|
|
140
|
+
odps_entry = ODPS.from_global() or ODPS.from_environments()
|
|
137
141
|
if isinstance(table, ODPSTable):
|
|
138
142
|
table = table.full_table_name
|
|
143
|
+
elif options.session.enable_schema and "." not in table:
|
|
144
|
+
default_schema = (
|
|
145
|
+
options.session.default_schema or odps_entry.schema or "default"
|
|
146
|
+
)
|
|
147
|
+
table = default_schema + "." + table
|
|
139
148
|
|
|
140
149
|
if isinstance(index_label, str):
|
|
141
150
|
index_label = [index_label]
|
|
@@ -147,6 +156,25 @@ def to_odps_table(
|
|
|
147
156
|
f"index_label needs {len(df.index.nlevels)} labels "
|
|
148
157
|
f"but it only have {len(index_label)}"
|
|
149
158
|
)
|
|
159
|
+
table_cols = set(build_dataframe_table_meta(df).table_column_names)
|
|
160
|
+
if partition:
|
|
161
|
+
partition_intersect = (
|
|
162
|
+
set(x.lower() for x in PartitionSpec(partition).keys()) & table_cols
|
|
163
|
+
)
|
|
164
|
+
if partition_intersect:
|
|
165
|
+
raise ValueError(
|
|
166
|
+
f"Data column(s) {partition_intersect} in the dataframe"
|
|
167
|
+
" cannot be used in parameter 'partition'."
|
|
168
|
+
" Use 'partition_col' instead."
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
if partition_col:
|
|
172
|
+
partition_diff = set(x.lower() for x in partition_col) - table_cols
|
|
173
|
+
if partition_diff:
|
|
174
|
+
raise ValueError(
|
|
175
|
+
f"Partition column(s) {partition_diff}"
|
|
176
|
+
" is not the data column(s) of the input dataframe."
|
|
177
|
+
)
|
|
150
178
|
|
|
151
179
|
op = DataFrameToODPSTable(
|
|
152
180
|
dtypes=df.dtypes,
|
|
@@ -18,6 +18,8 @@ from .accessor import (
|
|
|
18
18
|
IndexMaxFrameAccessor,
|
|
19
19
|
SeriesMaxFrameAccessor,
|
|
20
20
|
)
|
|
21
|
+
from .flatjson import series_flatjson
|
|
22
|
+
from .flatmap import df_flatmap, series_flatmap
|
|
21
23
|
from .reshuffle import DataFrameReshuffle, df_reshuffle
|
|
22
24
|
|
|
23
25
|
|
|
@@ -25,6 +27,9 @@ def _install():
|
|
|
25
27
|
from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
|
|
26
28
|
|
|
27
29
|
DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
|
|
30
|
+
DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
|
|
31
|
+
SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
|
|
32
|
+
SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
|
|
28
33
|
|
|
29
34
|
if DataFrameMaxFrameAccessor._api_count:
|
|
30
35
|
for t in DATAFRAME_TYPE:
|