maxframe-1.0.0rc3-cp38-cp38-win32.whl → maxframe-1.1.0-cp38-cp38-win32.whl
Note: this release has been flagged as potentially problematic in its registry.
- maxframe/_utils.cp38-win32.pyd +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +16 -1
- maxframe/conftest.py +52 -14
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cp38-win32.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +26 -2
- maxframe/dataframe/datasource/read_odps_query.py +116 -28
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +8 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +314 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +23 -2
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +28 -8
- maxframe/io/odpsio/tableio.py +55 -133
- maxframe/io/odpsio/tests/test_schema.py +40 -4
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +36 -6
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cp38-win32.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +10 -1
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cp38-win32.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +64 -14
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +28 -10
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +104 -20
- maxframe_client/session/task.py +42 -26
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +44 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
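Among the user-facing additions in this list, maxframe/dataframe/reduction/median.py introduces a median reduction. A minimal usage sketch, assuming it mirrors the pandas-compatible reduction API used elsewhere in the package (the data below is illustrative):

    import pandas as pd
    import maxframe.dataframe as md

    df = md.DataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
    # median() is assumed to follow pandas' reduction semantics
    print(df.median().execute())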
maxframe/dataframe/datasource/read_odps_query.py

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import dataclasses
+import logging
 import re
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -22,12 +23,14 @@ from odps import ODPS
 from odps.types import Column, OdpsSchema, validate_data_type
 
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...core.graph import DAG
 from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
 
+logger = logging.getLogger(__name__)
+
+_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
+
 _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
 _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
 _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,8 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([
-_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
+
+_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
 
 
 @dataclasses.dataclass
@@ -152,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
     return TaskSector(job_name, task_name, out_target, schemas)
 
 
-def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+def _parse_full_explain(explain_string: str) -> OdpsSchema:
     sectors = _split_explain_string(explain_string)
     jobs_sector = tasks_sector = None
 
@@ -170,27 +180,53 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
 
     job_dag = jobs_sector.build_dag()
     indep_job_names = list(job_dag.iter_indep(reverse=True))
-
-
-
-
-
-
-
+    schema_signatures = dict()
+    for job_name in indep_job_names:
+        tasks_sector = jobs_sector.jobs[job_name]
+        task_dag = tasks_sector.build_dag()
+        indep_task_names = list(task_dag.iter_indep(reverse=True))
+        for task_name in indep_task_names:
+            task_sector = tasks_sector.tasks[task_name]
+            if not task_sector.schema:  # pragma: no cover
+                raise ValueError("Cannot detect output schema")
+            if task_sector.output_target != "Screen":
+                raise ValueError("The SQL statement should be an instant query")
+            sig_tuples = sorted(
+                [
+                    (c.column_alias or c.column_name, c.column_type)
+                    for c in task_sector.schema
+                ]
+            )
+            schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
+    if len(schema_signatures) != 1:
         raise ValueError("Only one final task is allowed in SQL statement")
-
-    task_sector = tasks_sector.tasks[indep_task_names[0]]
-    if not task_sector.schema:  # pragma: no cover
-        raise ValueError("Cannot detect output schema")
-    if task_sector.output_target != "Screen":
-        raise ValueError("The SQL statement should be an instant query")
+    schema = list(schema_signatures.values())[0]
     cols = [
         Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
-        for c in task_sector.schema
+        for c in schema
     ]
     return OdpsSchema(cols)
 
 
+def _parse_simple_explain(explain_string: str) -> OdpsSchema:
+    fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
+    if not fields_match:
+        raise ValueError("Cannot detect output table schema")
+
+    fields_str = fields_match.group(1)
+    cols = []
+    for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
+        cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
+    return OdpsSchema(cols)
+
+
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    if explain_string.startswith("AdhocSink"):
+        return _parse_simple_explain(explain_string)
+    else:
+        return _parse_full_explain(explain_string)
+
+
 class DataFrameReadODPSQuery(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
@@ -205,6 +241,7 @@ class DataFrameReadODPSQuery(
     string_as_binary = BoolField("string_as_binary", default=None)
     index_columns = ListField("index_columns", FieldTypes.string, default=None)
     index_dtypes = SeriesField("index_dtypes", default=None)
+    column_renames = DictField("column_renames", default=None)
 
     def get_columns(self):
         return self.columns
@@ -227,12 +264,18 @@ class DataFrameReadODPSQuery(
         )
         index_value = parse_index(idx)
 
-
+        if self.dtypes is not None:
+            columns_value = parse_index(self.dtypes.index, store_data=True)
+            shape = (np.nan, len(self.dtypes))
+        else:
+            columns_value = None
+            shape = (np.nan, np.nan)
+
         self.output_types = [OutputType.dataframe]
         return self.new_tileable(
             [],
             None,
-            shape=
+            shape=shape,
             dtypes=self.dtypes,
             index_value=index_value,
             columns_value=columns_value,
@@ -246,6 +289,9 @@ def read_odps_query(
     odps_entry: ODPS = None,
     index_col: Union[None, str, List[str]] = None,
     string_as_binary: bool = None,
+    sql_hints: Dict[str, str] = None,
+    anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
+    skip_schema: bool = False,
     **kw,
 ):
     """
@@ -260,29 +306,70 @@ def read_odps_query(
         MaxCompute SQL statement.
     index_col: Union[None, str, List[str]]
         Columns to be specified as indexes.
+    string_as_binary: bool, optional
+        Whether to convert string columns to binary.
+    sql_hints: Dict[str, str], optional
+        User specified SQL hints.
+    anonymous_col_prefix: str, optional
+        Prefix for anonymous columns, '_anon_col_' by default.
+    skip_schema: bool, optional
+        Skip resolving output schema before execution. Once this is configured,
+        the output DataFrame cannot be inputs of other DataFrame operators
+        before execution.
 
     Returns
    -------
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
+    hints = options.sql.settings.copy() or {}
+    if sql_hints:
+        hints.update(sql_hints)
+
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
-    if odps_entry is None:
-        raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
-    explain_str = list(inst.get_task_results().values())[0]
 
-
+    if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
+        hints["odps.namespace.schema"] = "true"
+        hints["odps.sql.allow.namespace.schema"] = "true"
+
+    # fixme workaround for multi-stage split process
+    hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
 
-
-
-        raise ValueError("Need to specify names for all columns in SELECT clause.")
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
 
-
+    col_renames = {}
+    if not skip_schema:
+        inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+        logger.debug("Explain instance ID: %s", inst.id)
+        explain_str = list(inst.get_task_results().values())[0]
+
+        try:
+            odps_schema = _parse_explained_schema(explain_str)
+        except ValueError as ex:
+            exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
+            raise exc.with_traceback(ex.__traceback__) from None
+
+        new_columns = []
+        for col in odps_schema.columns:
+            anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+            if anon_match and col.name not in query:
+                new_name = anonymous_col_prefix + anon_match.group(1)
+                col_renames[col.name] = new_name
+                new_columns.append(Column(new_name, col.type))
+            else:
+                new_columns.append(col)
+
+        dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+    else:
+        dtypes = None
 
     if not index_col:
         index_dtypes = None
     else:
+        if dtypes is None:
+            raise ValueError("Cannot configure index_col when skip_schema is True")
+
         if isinstance(index_col, str):
            index_col = [index_col]
        index_col_set = set(index_col)
@@ -301,5 +388,6 @@ def read_odps_query(
         string_as_binary=string_as_binary,
         index_columns=index_col,
         index_dtypes=index_dtypes,
+        column_renames=col_renames,
     )
     return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
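The new read_odps_query parameters shown above can be combined; a hedged usage sketch (the project and table names are illustrative, and the hint key is only an example of a pass-through setting):

    from maxframe.dataframe import read_odps_query

    # pass extra SQL hints through to the EXPLAIN and execution stages
    df = read_odps_query(
        "SELECT * FROM my_project.my_table WHERE col1 > 10",
        sql_hints={"odps.sql.type.system.odps2": "true"},
    )

    # skip the schema-resolving EXPLAIN round trip; dtypes stay unknown until
    # execution, so the result cannot feed other DataFrame operators beforehand
    lazy_df = read_odps_query(
        "SELECT * FROM my_project.my_table",
        skip_schema=True,
    )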
maxframe/dataframe/datasource/read_odps_table.py

@@ -22,6 +22,7 @@ from odps.models import Table
 from odps.utils import to_timestamp
 
 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
@@ -167,12 +168,13 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    schema = options.session.default_schema or odps_entry.schema
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
-        table = odps_entry.get_table(table_name)
+        table = odps_entry.get_table(table_name, schema=schema)
 
     if not table.table_schema.partitions and (
         partitions is not None or append_partitions
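read_odps_table now resolves the target table against options.session.default_schema, falling back to the entry's own schema. A sketch, assuming options is importable from the package root as configured in maxframe/config (schema and table names are illustrative):

    from maxframe import options
    from maxframe.dataframe import read_odps_table

    options.session.default_schema = "my_schema"  # illustrative
    df = read_odps_table("my_table", index_col="id")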
maxframe/dataframe/datasource/tests/test_datasource.py

@@ -13,19 +13,28 @@
 # limitations under the License.
 
 import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
 import pandas as pd
 import pytest
 from odps import ODPS
+from odps import types as odps_types
 
 from .... import tensor as mt
 from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
-from ...core import
+from ...core import (
+    DatetimeIndex,
+    Float64Index,
+    Index,
+    IndexValue,
+    Int64Index,
+    MultiIndex,
+)
 from ..dataframe import from_pandas as from_pandas_df
 from ..date_range import date_range
 from ..from_tensor import (
@@ -35,7 +44,12 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
-from ..read_odps_query import
+from ..read_odps_query import (
+    ColumnSchema,
+    _parse_full_explain,
+    _parse_simple_explain,
+    _resolve_task_sector,
+)
 from ..series import from_pandas as from_pandas_series
 
 ray = lazy_import("ray")
@@ -113,18 +127,22 @@ def test_from_tileable_index():
 
     for o in [df, df[0]]:
         index = o.index
-        assert isinstance(index, Int64Index)
+        assert isinstance(index, (Index, Int64Index))
         assert index.dtype == np.int64
         assert index.name == pd_df.index.name
-        assert isinstance(
+        assert isinstance(
+            index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
+        )
 
     t = mt.random.rand(10, chunk_size=6)
     index = from_tileable(t, name="new_name")
 
-    assert isinstance(index, Float64Index)
+    assert isinstance(index, (Index, Float64Index))
     assert index.dtype == np.float64
     assert index.name == "new_name"
-    assert isinstance(
+    assert isinstance(
+        index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
+    )
 
 
 def test_from_tensor():
@@ -326,13 +344,12 @@ def test_from_odps_query():
     odps_entry.write_table(test_table2, [["A", 10, 4.5]])
 
     with pytest.raises(ValueError) as err_info:
-        read_odps_query(
+        read_odps_query(
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"AS SELECT * FROM {table1_name}"
+        )
     assert "instant query" in err_info.value.args[0]
 
-    with pytest.raises(ValueError) as err_info:
-        read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
-    assert "names" in err_info.value.args[0]
-
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
     df = read_odps_query(query1)
     assert df.op.query == query1
@@ -346,6 +363,10 @@ def test_from_odps_query():
         ),
     )
 
+    df = read_odps_query(query1, skip_schema=True)
+    assert df.dtypes is None
+    assert df.columns_value is None
+
     df = read_odps_query(query1, index_col="col1")
     assert df.op.query == query1
     assert df.index_value.name == "col1"
@@ -401,7 +422,9 @@ def test_date_range():
 
 
 def test_resolve_task_sector():
-    input_path = os.path.join(
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-full.txt"
+    )
     with open(input_path, "r") as f:
         sector = f.read()
     actual_sector = _resolve_task_sector("job0", sector)
@@ -413,3 +436,61 @@ def test_resolve_task_sector():
     assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
     assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
     assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
+
+
+def test_resolve_task_odps2():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 2
+    assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
+    assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
+
+
+def test_resolve_simple_explain():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-simple.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    schema = _parse_simple_explain(sector)
+    assert schema.columns[0].name == "memberid"
+    assert schema.columns[0].type == odps_types.string
+    assert schema.columns[1].name == "createdate"
+    assert schema.columns[1].type == odps_types.bigint
+
+
+def test_resolve_conditional():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    expected_col_types = {
+        "cs1": "string",
+        "cs2": "string",
+        "ci1": "bigint",
+        "cs3": "string",
+        "cs4": "string",
+        "cs5": "string",
+        "cs6": "string",
+        "cs7": "string",
+        "cs8": "string",
+        "ci2": "int",
+        "ci3": "bigint",
+        "cs9": "string",
+    }
+
+    schema = _parse_full_explain(sector)
+    for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
+        assert col.name == exp_nm
+        assert col.type == odps_types.validate_data_type(exp_tp)
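Note the removed assertion above: a SELECT containing an unaliased expression no longer raises. Per the renaming loop added to read_odps_query, columns that come back with MaxCompute's positional _c<n> names are rewritten with the anonymous-column prefix instead. A sketch of the expected behavior (table name illustrative; the _c1 name assumes MaxCompute's usual positional naming for the second column):

    # previously raised: "Need to specify names for all columns in SELECT clause."
    df = read_odps_query("SELECT col1, col2 + col3 FROM my_table")
    list(df.dtypes.index)  # expected: ["col1", "_anon_col_1"]

    # the prefix is configurable
    df = read_odps_query(
        "SELECT col1, col2 + col3 FROM my_table", anonymous_col_prefix="expr_"
    )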
maxframe/dataframe/datastore/to_odps.py

@@ -17,6 +17,7 @@
 import logging
 from typing import List, Optional, Union
 
+from odps import ODPS
 from odps.models import Table as ODPSTable
 from odps.types import PartitionSpec
 
@@ -136,8 +137,14 @@ def to_odps_table(
     --------
 
     """
+    odps_entry = ODPS.from_global() or ODPS.from_environments()
     if isinstance(table, ODPSTable):
         table = table.full_table_name
+    elif options.session.enable_schema and "." not in table:
+        default_schema = (
+            options.session.default_schema or odps_entry.schema or "default"
+        )
+        table = default_schema + "." + table
 
     if isinstance(index_label, str):
         index_label = [index_label]
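With schema namespaces enabled, to_odps_table now prefixes unqualified table names with the default schema before writing. A sketch (names illustrative; df is any MaxFrame DataFrame):

    from maxframe import options

    options.session.enable_schema = True
    options.session.default_schema = "my_schema"  # illustrative

    # "result_table" is resolved as "my_schema.result_table"
    df.to_odps_table("result_table").execute()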
maxframe/dataframe/extensions/__init__.py

@@ -18,6 +18,9 @@ from .accessor import (
     IndexMaxFrameAccessor,
     SeriesMaxFrameAccessor,
 )
+from .apply_chunk import df_apply_chunk, series_apply_chunk
+from .flatjson import series_flatjson
+from .flatmap import df_flatmap, series_flatmap
 from .reshuffle import DataFrameReshuffle, df_reshuffle
 
 
@@ -25,6 +28,11 @@ def _install():
     from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
 
     DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
+    DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
+    DataFrameMaxFrameAccessor._register("apply_chunk", df_apply_chunk)
+    SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
+    SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
+    SeriesMaxFrameAccessor._register("apply_chunk", series_apply_chunk)
 
     if DataFrameMaxFrameAccessor._api_count:
         for t in DATAFRAME_TYPE: