maxframe 1.0.0rc3-cp38-cp38-win32.whl → 1.1.0-cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (112)
  1. maxframe/_utils.cp38-win32.pyd +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +16 -1
  4. maxframe/conftest.py +52 -14
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cp38-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/docstring.py +26 -2
  9. maxframe/dataframe/arithmetic/equal.py +4 -2
  10. maxframe/dataframe/arithmetic/greater.py +4 -2
  11. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  12. maxframe/dataframe/arithmetic/less.py +2 -2
  13. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  14. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  16. maxframe/dataframe/core.py +26 -2
  17. maxframe/dataframe/datasource/read_odps_query.py +116 -28
  18. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  19. maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
  20. maxframe/dataframe/datastore/to_odps.py +7 -0
  21. maxframe/dataframe/extensions/__init__.py +8 -0
  22. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  23. maxframe/dataframe/extensions/flatjson.py +131 -0
  24. maxframe/dataframe/extensions/flatmap.py +314 -0
  25. maxframe/dataframe/extensions/reshuffle.py +1 -1
  26. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  27. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  28. maxframe/dataframe/groupby/__init__.py +1 -0
  29. maxframe/dataframe/groupby/aggregation.py +1 -0
  30. maxframe/dataframe/groupby/apply.py +9 -1
  31. maxframe/dataframe/groupby/core.py +1 -1
  32. maxframe/dataframe/groupby/fill.py +4 -1
  33. maxframe/dataframe/groupby/getitem.py +6 -0
  34. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  35. maxframe/dataframe/groupby/transform.py +8 -2
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/loc.py +6 -4
  38. maxframe/dataframe/indexing/rename.py +11 -0
  39. maxframe/dataframe/initializer.py +11 -1
  40. maxframe/dataframe/merge/__init__.py +9 -1
  41. maxframe/dataframe/merge/concat.py +41 -31
  42. maxframe/dataframe/merge/merge.py +1 -1
  43. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  44. maxframe/dataframe/misc/apply.py +3 -0
  45. maxframe/dataframe/misc/drop_duplicates.py +23 -2
  46. maxframe/dataframe/misc/map.py +3 -1
  47. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  48. maxframe/dataframe/misc/transform.py +22 -13
  49. maxframe/dataframe/reduction/__init__.py +3 -0
  50. maxframe/dataframe/reduction/aggregation.py +1 -0
  51. maxframe/dataframe/reduction/median.py +56 -0
  52. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  53. maxframe/dataframe/statistics/quantile.py +8 -2
  54. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  55. maxframe/dataframe/tests/test_initializer.py +33 -2
  56. maxframe/dataframe/tests/test_utils.py +60 -0
  57. maxframe/dataframe/utils.py +110 -7
  58. maxframe/dataframe/window/expanding.py +5 -3
  59. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  60. maxframe/io/objects/tests/test_object_io.py +39 -12
  61. maxframe/io/odpsio/arrow.py +30 -2
  62. maxframe/io/odpsio/schema.py +28 -8
  63. maxframe/io/odpsio/tableio.py +55 -133
  64. maxframe/io/odpsio/tests/test_schema.py +40 -4
  65. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  66. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  67. maxframe/io/odpsio/volumeio.py +36 -6
  68. maxframe/learn/contrib/__init__.py +3 -1
  69. maxframe/learn/contrib/graph/__init__.py +15 -0
  70. maxframe/learn/contrib/graph/connected_components.py +215 -0
  71. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  73. maxframe/learn/contrib/llm/__init__.py +16 -0
  74. maxframe/learn/contrib/llm/core.py +54 -0
  75. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  76. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  77. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  78. maxframe/learn/contrib/llm/text.py +42 -0
  79. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  80. maxframe/learn/contrib/xgboost/predict.py +8 -39
  81. maxframe/learn/contrib/xgboost/train.py +4 -3
  82. maxframe/lib/mmh3.cp38-win32.pyd +0 -0
  83. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  84. maxframe/opcodes.py +10 -1
  85. maxframe/protocol.py +6 -1
  86. maxframe/serialization/core.cp38-win32.pyd +0 -0
  87. maxframe/serialization/core.pyx +13 -1
  88. maxframe/serialization/pandas.py +50 -20
  89. maxframe/serialization/serializables/core.py +24 -5
  90. maxframe/serialization/serializables/field_type.py +4 -1
  91. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  92. maxframe/serialization/tests/test_serial.py +2 -1
  93. maxframe/session.py +9 -2
  94. maxframe/tensor/__init__.py +19 -7
  95. maxframe/tensor/indexing/getitem.py +2 -0
  96. maxframe/tensor/merge/concatenate.py +23 -20
  97. maxframe/tensor/merge/vstack.py +5 -1
  98. maxframe/tensor/misc/transpose.py +1 -1
  99. maxframe/tests/utils.py +16 -0
  100. maxframe/udf.py +27 -0
  101. maxframe/utils.py +64 -14
  102. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  103. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
  104. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
  105. maxframe_client/clients/framedriver.py +4 -1
  106. maxframe_client/fetcher.py +28 -10
  107. maxframe_client/session/consts.py +3 -0
  108. maxframe_client/session/odps.py +104 -20
  109. maxframe_client/session/task.py +42 -26
  110. maxframe_client/session/tests/test_task.py +0 -4
  111. maxframe_client/tests/test_session.py +44 -12
  112. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/dataframe/datasource/read_odps_query.py

@@ -13,6 +13,7 @@
 # limitations under the License.

 import dataclasses
+import logging
 import re
 from typing import Dict, List, Optional, Tuple, Union

@@ -22,12 +23,14 @@ from odps import ODPS
 from odps.types import Column, OdpsSchema, validate_data_type

 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...core.graph import DAG
 from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource

+logger = logging.getLogger(__name__)
+
+_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
+
 _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
 _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
 _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,8 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
-_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
+
+_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")


 @dataclasses.dataclass
@@ -152,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
     return TaskSector(job_name, task_name, out_target, schemas)


-def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+def _parse_full_explain(explain_string: str) -> OdpsSchema:
     sectors = _split_explain_string(explain_string)
     jobs_sector = tasks_sector = None

@@ -170,27 +180,53 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:

     job_dag = jobs_sector.build_dag()
     indep_job_names = list(job_dag.iter_indep(reverse=True))
-    if len(indep_job_names) > 1:  # pragma: no cover
-        raise ValueError("Only one final job is allowed in SQL statement")
-
-    tasks_sector = jobs_sector.jobs[indep_job_names[0]]
-    task_dag = tasks_sector.build_dag()
-    indep_task_names = list(task_dag.iter_indep(reverse=True))
-    if len(indep_task_names) > 1:  # pragma: no cover
+    schema_signatures = dict()
+    for job_name in indep_job_names:
+        tasks_sector = jobs_sector.jobs[job_name]
+        task_dag = tasks_sector.build_dag()
+        indep_task_names = list(task_dag.iter_indep(reverse=True))
+        for task_name in indep_task_names:
+            task_sector = tasks_sector.tasks[task_name]
+            if not task_sector.schema:  # pragma: no cover
+                raise ValueError("Cannot detect output schema")
+            if task_sector.output_target != "Screen":
+                raise ValueError("The SQL statement should be an instant query")
+            sig_tuples = sorted(
+                [
+                    (c.column_alias or c.column_name, c.column_type)
+                    for c in task_sector.schema
+                ]
+            )
+            schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
+    if len(schema_signatures) != 1:
         raise ValueError("Only one final task is allowed in SQL statement")
-
-    task_sector = tasks_sector.tasks[indep_task_names[0]]
-    if not task_sector.schema:  # pragma: no cover
-        raise ValueError("Cannot detect output schema")
-    if task_sector.output_target != "Screen":
-        raise ValueError("The SQL statement should be an instant query")
+    schema = list(schema_signatures.values())[0]
     cols = [
         Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
-        for c in task_sector.schema
+        for c in schema
     ]
     return OdpsSchema(cols)


+def _parse_simple_explain(explain_string: str) -> OdpsSchema:
+    fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
+    if not fields_match:
+        raise ValueError("Cannot detect output table schema")
+
+    fields_str = fields_match.group(1)
+    cols = []
+    for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
+        cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
+    return OdpsSchema(cols)
+
+
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    if explain_string.startswith("AdhocSink"):
+        return _parse_simple_explain(explain_string)
+    else:
+        return _parse_full_explain(explain_string)
+
+
 class DataFrameReadODPSQuery(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
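
Editor's note: for orientation, a minimal sketch of the new dispatch. The EXPLAIN text below is a hypothetical input shaped to match `_SIMPLE_SCHEMA_COLS_REGEX` and the column expectations in the new tests, not verbatim MaxCompute output:

    # Short "AdhocSink" explains take a simple textual form; the full
    # job/task form still goes through _parse_full_explain.
    explain = "AdhocSink1 ... SELECT memberid:string, createdate:bigint FROM tmp"
    schema = _parse_explained_schema(explain)  # dispatches to _parse_simple_explain
    print([(c.name, str(c.type)) for c in schema.columns])
    # [('memberid', 'string'), ('createdate', 'bigint')]
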
@@ -205,6 +241,7 @@ class DataFrameReadODPSQuery(
     string_as_binary = BoolField("string_as_binary", default=None)
     index_columns = ListField("index_columns", FieldTypes.string, default=None)
     index_dtypes = SeriesField("index_dtypes", default=None)
+    column_renames = DictField("column_renames", default=None)

     def get_columns(self):
         return self.columns
@@ -227,12 +264,18 @@ class DataFrameReadODPSQuery(
         )
         index_value = parse_index(idx)

-        columns_value = parse_index(self.dtypes.index, store_data=True)
+        if self.dtypes is not None:
+            columns_value = parse_index(self.dtypes.index, store_data=True)
+            shape = (np.nan, len(self.dtypes))
+        else:
+            columns_value = None
+            shape = (np.nan, np.nan)
+
         self.output_types = [OutputType.dataframe]
         return self.new_tileable(
             [],
             None,
-            shape=(len(self.dtypes), np.nan),
+            shape=shape,
             dtypes=self.dtypes,
             index_value=index_value,
             columns_value=columns_value,
@@ -246,6 +289,9 @@ def read_odps_query(
     odps_entry: ODPS = None,
     index_col: Union[None, str, List[str]] = None,
     string_as_binary: bool = None,
+    sql_hints: Dict[str, str] = None,
+    anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
+    skip_schema: bool = False,
     **kw,
 ):
     """
@@ -260,29 +306,70 @@
         MaxCompute SQL statement.
     index_col: Union[None, str, List[str]]
         Columns to be specified as indexes.
+    string_as_binary: bool, optional
+        Whether to convert string columns to binary.
+    sql_hints: Dict[str, str], optional
+        User specified SQL hints.
+    anonymous_col_prefix: str, optional
+        Prefix for anonymous columns, '_anon_col_' by default.
+    skip_schema: bool, optional
+        Skip resolving output schema before execution. Once this is configured,
+        the output DataFrame cannot be inputs of other DataFrame operators
+        before execution.

     Returns
     -------
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
+    hints = options.sql.settings.copy() or {}
+    if sql_hints:
+        hints.update(sql_hints)
+
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
-    if odps_entry is None:
-        raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
-    explain_str = list(inst.get_task_results().values())[0]

-    odps_schema = _parse_explained_schema(explain_str)
+    if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
+        hints["odps.namespace.schema"] = "true"
+        hints["odps.sql.allow.namespace.schema"] = "true"
+
+    # fixme workaround for multi-stage split process
+    hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"

-    for col in odps_schema.columns:
-        if _ANONYMOUS_COL_REGEX.match(col.name) and col.name not in query:
-            raise ValueError("Need to specify names for all columns in SELECT clause.")
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")

-    dtypes = odps_schema_to_pandas_dtypes(odps_schema)
+    col_renames = {}
+    if not skip_schema:
+        inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+        logger.debug("Explain instance ID: %s", inst.id)
+        explain_str = list(inst.get_task_results().values())[0]
+
+        try:
+            odps_schema = _parse_explained_schema(explain_str)
+        except ValueError as ex:
+            exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
+            raise exc.with_traceback(ex.__traceback__) from None
+
+        new_columns = []
+        for col in odps_schema.columns:
+            anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+            if anon_match and col.name not in query:
+                new_name = anonymous_col_prefix + anon_match.group(1)
+                col_renames[col.name] = new_name
+                new_columns.append(Column(new_name, col.type))
+            else:
+                new_columns.append(col)
+
+        dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+    else:
+        dtypes = None

     if not index_col:
         index_dtypes = None
     else:
+        if dtypes is None:
+            raise ValueError("Cannot configure index_col when skip_schema is True")
+
         if isinstance(index_col, str):
             index_col = [index_col]
         index_col_set = set(index_col)
@@ -301,5 +388,6 @@
         string_as_binary=string_as_binary,
         index_columns=index_col,
         index_dtypes=index_dtypes,
+        column_renames=col_renames,
     )
     return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
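
Editor's note: taken together, a minimal usage sketch of the new parameters (assuming a configured MaxCompute account; the table name, hint, and `_c1` auto-name are illustrative). Unnamed expression columns no longer raise the old "Need to specify names" error; they are renamed instead:

    import maxframe.dataframe as md

    # An unnamed expression column such as `col2 + col3` (auto-named e.g.
    # `_c1` by MaxCompute) is renamed with the given prefix, here `_expr_1`.
    df = md.read_odps_query(
        "SELECT col1, col2 + col3 FROM my_table",
        sql_hints={"odps.sql.type.system.odps2": "true"},
        anonymous_col_prefix="_expr_",
    )

    # Skip the EXPLAIN round trip: dtypes/columns stay unknown (None) until
    # execution, so the result cannot feed other operators before it runs.
    raw = md.read_odps_query("SELECT * FROM my_table", skip_schema=True)
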
maxframe/dataframe/datasource/read_odps_table.py

@@ -22,6 +22,7 @@ from odps.models import Table
 from odps.utils import to_timestamp

 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
@@ -167,12 +168,13 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    schema = options.session.default_schema or odps_entry.schema
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
-        table = odps_entry.get_table(table_name)
+        table = odps_entry.get_table(table_name, schema=schema)

     if not table.table_schema.partitions and (
         partitions is not None or append_partitions
maxframe/dataframe/datasource/tests/test_datasource.py

@@ -13,19 +13,28 @@
 # limitations under the License.

 import os
+import uuid
 from collections import OrderedDict

 import numpy as np
 import pandas as pd
 import pytest
 from odps import ODPS
+from odps import types as odps_types

 from .... import tensor as mt
 from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
-from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index, MultiIndex
+from ...core import (
+    DatetimeIndex,
+    Float64Index,
+    Index,
+    IndexValue,
+    Int64Index,
+    MultiIndex,
+)
 from ..dataframe import from_pandas as from_pandas_df
 from ..date_range import date_range
 from ..from_tensor import (
@@ -35,7 +44,12 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
-from ..read_odps_query import ColumnSchema, _resolve_task_sector
+from ..read_odps_query import (
+    ColumnSchema,
+    _parse_full_explain,
+    _parse_simple_explain,
+    _resolve_task_sector,
+)
 from ..series import from_pandas as from_pandas_series

 ray = lazy_import("ray")
@@ -113,18 +127,22 @@ def test_from_tileable_index():

     for o in [df, df[0]]:
         index = o.index
-        assert isinstance(index, Int64Index)
+        assert isinstance(index, (Index, Int64Index))
         assert index.dtype == np.int64
         assert index.name == pd_df.index.name
-        assert isinstance(index.index_value.value, IndexValue.Int64Index)
+        assert isinstance(
+            index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
+        )

     t = mt.random.rand(10, chunk_size=6)
     index = from_tileable(t, name="new_name")

-    assert isinstance(index, Float64Index)
+    assert isinstance(index, (Index, Float64Index))
     assert index.dtype == np.float64
     assert index.name == "new_name"
-    assert isinstance(index.index_value.value, IndexValue.Float64Index)
+    assert isinstance(
+        index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
+    )


 def test_from_tensor():
@@ -326,13 +344,12 @@ def test_from_odps_query():
     odps_entry.write_table(test_table2, [["A", 10, 4.5]])

     with pytest.raises(ValueError) as err_info:
-        read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
+        read_odps_query(
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"AS SELECT * FROM {table1_name}"
+        )
     assert "instant query" in err_info.value.args[0]

-    with pytest.raises(ValueError) as err_info:
-        read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
-    assert "names" in err_info.value.args[0]
-
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
     df = read_odps_query(query1)
     assert df.op.query == query1
@@ -346,6 +363,10 @@ def test_from_odps_query():
         ),
     )

+    df = read_odps_query(query1, skip_schema=True)
+    assert df.dtypes is None
+    assert df.columns_value is None
+
     df = read_odps_query(query1, index_col="col1")
     assert df.op.query == query1
     assert df.index_value.name == "col1"
@@ -401,7 +422,9 @@ def test_date_range():


 def test_resolve_task_sector():
-    input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-full.txt"
+    )
     with open(input_path, "r") as f:
         sector = f.read()
     actual_sector = _resolve_task_sector("job0", sector)
@@ -413,3 +436,61 @@ def test_resolve_task_sector():
     assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
     assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
     assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
+
+
+def test_resolve_task_odps2():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 2
+    assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
+    assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
+
+
+def test_resolve_simple_explain():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-simple.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    schema = _parse_simple_explain(sector)
+    assert schema.columns[0].name == "memberid"
+    assert schema.columns[0].type == odps_types.string
+    assert schema.columns[1].name == "createdate"
+    assert schema.columns[1].type == odps_types.bigint
+
+
+def test_resolve_conditional():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    expected_col_types = {
+        "cs1": "string",
+        "cs2": "string",
+        "ci1": "bigint",
+        "cs3": "string",
+        "cs4": "string",
+        "cs5": "string",
+        "cs6": "string",
+        "cs7": "string",
+        "cs8": "string",
+        "ci2": "int",
+        "ci3": "bigint",
+        "cs9": "string",
+    }
+
+    schema = _parse_full_explain(sector)
+    for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
+        assert col.name == exp_nm
+        assert col.type == odps_types.validate_data_type(exp_tp)
maxframe/dataframe/datastore/to_odps.py

@@ -17,6 +17,7 @@
 import logging
 from typing import List, Optional, Union

+from odps import ODPS
 from odps.models import Table as ODPSTable
 from odps.types import PartitionSpec

@@ -136,8 +137,14 @@ def to_odps_table(
     --------

     """
+    odps_entry = ODPS.from_global() or ODPS.from_environments()
     if isinstance(table, ODPSTable):
         table = table.full_table_name
+    elif options.session.enable_schema and "." not in table:
+        default_schema = (
+            options.session.default_schema or odps_entry.schema or "default"
+        )
+        table = default_schema + "." + table

     if isinstance(index_label, str):
         index_label = [index_label]
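
Editor's note: a sketch of how these schema defaults interact end to end. The option names come straight from the diff; the table and schema names are illustrative assumptions:

    import maxframe.dataframe as md
    from maxframe import options

    options.session.enable_schema = True
    options.session.default_schema = "dev_schema"

    # read_odps_table resolves the unqualified name against dev_schema, and
    # to_odps_table prefixes the sink the same way, i.e. this writes to
    # dev_schema.result_table (falling back to "default" if nothing is set).
    df = md.read_odps_table("src_table")
    df.to_odps_table("result_table").execute()
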
maxframe/dataframe/extensions/__init__.py

@@ -18,6 +18,9 @@ from .accessor import (
     IndexMaxFrameAccessor,
     SeriesMaxFrameAccessor,
 )
+from .apply_chunk import df_apply_chunk, series_apply_chunk
+from .flatjson import series_flatjson
+from .flatmap import df_flatmap, series_flatmap
 from .reshuffle import DataFrameReshuffle, df_reshuffle


@@ -25,6 +28,11 @@ def _install():
     from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE

     DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
+    DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
+    DataFrameMaxFrameAccessor._register("apply_chunk", df_apply_chunk)
+    SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
+    SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
+    SeriesMaxFrameAccessor._register("apply_chunk", series_apply_chunk)

     if DataFrameMaxFrameAccessor._api_count:
         for t in DATAFRAME_TYPE:
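
Editor's note: once `_install()` runs, the registrations above surface the new extension methods on the MaxFrame accessor. A rough sketch, assuming the accessor is exposed as `.mf` (per the existing `df.mf.reshuffle()` pattern) and without guessing at the new methods' signatures:

    import pandas as pd
    import maxframe.dataframe as md

    df = md.DataFrame(pd.DataFrame({"a": [1, 2, 3]}))
    s = df["a"]

    # New in 1.1.0: chunk-wise apply and flat-mapping on both DataFrame and
    # Series, plus JSON flattening on Series.
    assert hasattr(df.mf, "apply_chunk") and hasattr(df.mf, "flatmap")
    assert hasattr(s.mf, "apply_chunk") and hasattr(s.mf, "flatjson")
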