maxframe 0.1.0b5__cp311-cp311-macosx_10_9_universal2.whl → 1.0.0__cp311-cp311-macosx_10_9_universal2.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (203)
  1. maxframe/_utils.cpython-311-darwin.so +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import dataclasses
16
+ import logging
16
17
  import re
17
18
  from typing import Dict, List, Optional, Tuple, Union
18
19
 
@@ -22,12 +23,14 @@ from odps import ODPS
22
23
  from odps.types import Column, OdpsSchema, validate_data_type
23
24
 
24
25
  from ... import opcodes
26
+ from ...config import options
25
27
  from ...core import OutputType
26
28
  from ...core.graph import DAG
27
- from ...odpsio import odps_schema_to_pandas_dtypes
29
+ from ...io.odpsio import odps_schema_to_pandas_dtypes
28
30
  from ...serialization.serializables import (
29
31
  AnyField,
30
32
  BoolField,
33
+ DictField,
31
34
  FieldTypes,
32
35
  Int64Field,
33
36
  ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
37
40
  from ..utils import parse_index
38
41
  from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
39
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
+ _DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
46
+
40
47
  _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
41
48
  _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
42
49
  _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,7 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
46
53
  r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
47
54
  re.MULTILINE,
48
55
  )
49
- _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
56
+ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
57
+ _ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
58
+
59
+ _SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
60
+ _SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
50
61
 
51
62
 
52
63
  @dataclasses.dataclass
@@ -151,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
151
162
  return TaskSector(job_name, task_name, out_target, schemas)
152
163
 
153
164
 
154
- def _parse_explained_schema(explain_string: str) -> OdpsSchema:
165
+ def _parse_full_explain(explain_string: str) -> OdpsSchema:
155
166
  sectors = _split_explain_string(explain_string)
156
167
  jobs_sector = tasks_sector = None
157
168
 
@@ -169,27 +180,53 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
169
180
 
170
181
  job_dag = jobs_sector.build_dag()
171
182
  indep_job_names = list(job_dag.iter_indep(reverse=True))
172
- if len(indep_job_names) > 1: # pragma: no cover
173
- raise ValueError("Only one final job is allowed in SQL statement")
174
-
175
- tasks_sector = jobs_sector.jobs[indep_job_names[0]]
176
- task_dag = tasks_sector.build_dag()
177
- indep_task_names = list(task_dag.iter_indep(reverse=True))
178
- if len(indep_task_names) > 1: # pragma: no cover
183
+ schema_signatures = dict()
184
+ for job_name in indep_job_names:
185
+ tasks_sector = jobs_sector.jobs[job_name]
186
+ task_dag = tasks_sector.build_dag()
187
+ indep_task_names = list(task_dag.iter_indep(reverse=True))
188
+ for task_name in indep_task_names:
189
+ task_sector = tasks_sector.tasks[task_name]
190
+ if not task_sector.schema: # pragma: no cover
191
+ raise ValueError("Cannot detect output schema")
192
+ if task_sector.output_target != "Screen":
193
+ raise ValueError("The SQL statement should be an instant query")
194
+ sig_tuples = sorted(
195
+ [
196
+ (c.column_alias or c.column_name, c.column_type)
197
+ for c in task_sector.schema
198
+ ]
199
+ )
200
+ schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
201
+ if len(schema_signatures) != 1:
179
202
  raise ValueError("Only one final task is allowed in SQL statement")
180
-
181
- task_sector = tasks_sector.tasks[indep_task_names[0]]
182
- if not task_sector.schema: # pragma: no cover
183
- raise ValueError("Cannot detect output schema")
184
- if task_sector.output_target != "Screen":
185
- raise ValueError("The SQL statement should be an instant query")
203
+ schema = list(schema_signatures.values())[0]
186
204
  cols = [
187
205
  Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
188
- for c in task_sector.schema
206
+ for c in schema
189
207
  ]
190
208
  return OdpsSchema(cols)
191
209
 
192
210
 
211
+ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
212
+ fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
213
+ if not fields_match:
214
+ raise ValueError("Cannot detect output table schema")
215
+
216
+ fields_str = fields_match.group(1)
217
+ cols = []
218
+ for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
219
+ cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
220
+ return OdpsSchema(cols)
221
+
222
+
223
+ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
224
+ if explain_string.startswith("AdhocSink"):
225
+ return _parse_simple_explain(explain_string)
226
+ else:
227
+ return _parse_full_explain(explain_string)
228
+
229
+
193
230
  class DataFrameReadODPSQuery(
194
231
  IncrementalIndexDatasource,
195
232
  ColumnPruneSupportedDataSourceMixin,
@@ -204,6 +241,7 @@ class DataFrameReadODPSQuery(
204
241
  string_as_binary = BoolField("string_as_binary", default=None)
205
242
  index_columns = ListField("index_columns", FieldTypes.string, default=None)
206
243
  index_dtypes = SeriesField("index_dtypes", default=None)
244
+ column_renames = DictField("column_renames", default=None)
207
245
 
208
246
  def get_columns(self):
209
247
  return self.columns
@@ -226,12 +264,18 @@ class DataFrameReadODPSQuery(
226
264
  )
227
265
  index_value = parse_index(idx)
228
266
 
229
- columns_value = parse_index(self.dtypes.index, store_data=True)
267
+ if self.dtypes is not None:
268
+ columns_value = parse_index(self.dtypes.index, store_data=True)
269
+ shape = (np.nan, len(self.dtypes))
270
+ else:
271
+ columns_value = None
272
+ shape = (np.nan, np.nan)
273
+
230
274
  self.output_types = [OutputType.dataframe]
231
275
  return self.new_tileable(
232
276
  [],
233
277
  None,
234
- shape=(len(self.dtypes), np.nan),
278
+ shape=shape,
235
279
  dtypes=self.dtypes,
236
280
  index_value=index_value,
237
281
  columns_value=columns_value,
@@ -245,6 +289,9 @@ def read_odps_query(
245
289
  odps_entry: ODPS = None,
246
290
  index_col: Union[None, str, List[str]] = None,
247
291
  string_as_binary: bool = None,
292
+ sql_hints: Dict[str, str] = None,
293
+ anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
294
+ skip_schema: bool = False,
248
295
  **kw,
249
296
  ):
250
297
  """
@@ -259,24 +306,70 @@ def read_odps_query(
259
306
  MaxCompute SQL statement.
260
307
  index_col: Union[None, str, List[str]]
261
308
  Columns to be specified as indexes.
309
+ string_as_binary: bool, optional
310
+ Whether to convert string columns to binary.
311
+ sql_hints: Dict[str, str], optional
312
+ User specified SQL hints.
313
+ anonymous_col_prefix: str, optional
314
+ Prefix for anonymous columns, '_anon_col_' by default.
315
+ skip_schema: bool, optional
316
+ Skip resolving output schema before execution. Once this is configured,
317
+ the output DataFrame cannot be inputs of other DataFrame operators
318
+ before execution.
262
319
 
263
320
  Returns
264
321
  -------
265
322
  result: DataFrame
266
323
  DataFrame read from MaxCompute (ODPS) table
267
324
  """
325
+ hints = options.sql.settings.copy() or {}
326
+ if sql_hints:
327
+ hints.update(sql_hints)
328
+
268
329
  odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
330
+
331
+ if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
332
+ hints["odps.namespace.schema"] = "true"
333
+ hints["odps.sql.allow.namespace.schema"] = "true"
334
+
335
+ # fixme workaround for multi-stage split process
336
+ hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
337
+
269
338
  if odps_entry is None:
270
339
  raise ValueError("Missing odps_entry parameter")
271
- inst = odps_entry.execute_sql(f"EXPLAIN {query}")
272
- explain_str = list(inst.get_task_results().values())[0]
273
340
 
274
- odps_schema = _parse_explained_schema(explain_str)
275
- dtypes = odps_schema_to_pandas_dtypes(odps_schema)
341
+ col_renames = {}
342
+ if not skip_schema:
343
+ inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
344
+ logger.debug("Explain instance ID: %s", inst.id)
345
+ explain_str = list(inst.get_task_results().values())[0]
346
+
347
+ try:
348
+ odps_schema = _parse_explained_schema(explain_str)
349
+ except ValueError as ex:
350
+ exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
351
+ raise exc.with_traceback(ex.__traceback__) from None
352
+
353
+ new_columns = []
354
+ for col in odps_schema.columns:
355
+ anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
356
+ if anon_match and col.name not in query:
357
+ new_name = anonymous_col_prefix + anon_match.group(1)
358
+ col_renames[col.name] = new_name
359
+ new_columns.append(Column(new_name, col.type))
360
+ else:
361
+ new_columns.append(col)
362
+
363
+ dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
364
+ else:
365
+ dtypes = None
276
366
 
277
367
  if not index_col:
278
368
  index_dtypes = None
279
369
  else:
370
+ if dtypes is None:
371
+ raise ValueError("Cannot configure index_col when skip_schema is True")
372
+
280
373
  if isinstance(index_col, str):
281
374
  index_col = [index_col]
282
375
  index_col_set = set(index_col)
@@ -295,5 +388,6 @@ def read_odps_query(
295
388
  string_as_binary=string_as_binary,
296
389
  index_columns=index_col,
297
390
  index_dtypes=index_dtypes,
391
+ column_renames=col_renames,
298
392
  )
299
393
  return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
@@ -22,8 +22,9 @@ from odps.models import Table
22
22
  from odps.utils import to_timestamp
23
23
 
24
24
  from ... import opcodes
25
+ from ...config import options
25
26
  from ...core import OutputType
26
- from ...odpsio import odps_schema_to_pandas_dtypes
27
+ from ...io.odpsio import odps_schema_to_pandas_dtypes
27
28
  from ...serialization.serializables import (
28
29
  AnyField,
29
30
  BoolField,
@@ -119,9 +120,10 @@ class DataFrameReadODPSTable(
119
120
  return self.new_tileable(
120
121
  [],
121
122
  None,
122
- shape=shape,
123
+ shape=shape[:1],
123
124
  name=getattr(index_value, "name", None),
124
125
  names=getattr(index_value, "names", None),
126
+ dtype=self.index_dtypes.iloc[0],
125
127
  index_value=index_value,
126
128
  chunk_bytes=chunk_bytes,
127
129
  chunk_size=chunk_size,
@@ -166,12 +168,13 @@ def read_odps_table(
166
168
  DataFrame read from MaxCompute (ODPS) table
167
169
  """
168
170
  odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
171
+ schema = options.session.default_schema or odps_entry.schema
169
172
  if odps_entry is None:
170
173
  raise ValueError("Missing odps_entry parameter")
171
174
  if isinstance(table_name, Table):
172
175
  table = table_name
173
176
  else:
174
- table = odps_entry.get_table(table_name)
177
+ table = odps_entry.get_table(table_name, schema=schema)
175
178
 
176
179
  if not table.table_schema.partitions and (
177
180
  partitions is not None or append_partitions
@@ -13,18 +13,28 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import os
16
+ import uuid
16
17
  from collections import OrderedDict
17
18
 
18
19
  import numpy as np
19
20
  import pandas as pd
20
21
  import pytest
21
22
  from odps import ODPS
23
+ from odps import types as odps_types
22
24
 
23
25
  from .... import tensor as mt
26
+ from ....core import OutputType
24
27
  from ....tests.utils import tn
25
28
  from ....utils import lazy_import
26
29
  from ... import read_odps_query, read_odps_table
27
- from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index, MultiIndex
30
+ from ...core import (
31
+ DatetimeIndex,
32
+ Float64Index,
33
+ Index,
34
+ IndexValue,
35
+ Int64Index,
36
+ MultiIndex,
37
+ )
28
38
  from ..dataframe import from_pandas as from_pandas_df
29
39
  from ..date_range import date_range
30
40
  from ..from_tensor import (
@@ -34,7 +44,12 @@ from ..from_tensor import (
34
44
  )
35
45
  from ..index import from_pandas as from_pandas_index
36
46
  from ..index import from_tileable
37
- from ..read_odps_query import ColumnSchema, _resolve_task_sector
47
+ from ..read_odps_query import (
48
+ ColumnSchema,
49
+ _parse_full_explain,
50
+ _parse_simple_explain,
51
+ _resolve_task_sector,
52
+ )
38
53
  from ..series import from_pandas as from_pandas_series
39
54
 
40
55
  ray = lazy_import("ray")
@@ -112,18 +127,22 @@ def test_from_tileable_index():
112
127
 
113
128
  for o in [df, df[0]]:
114
129
  index = o.index
115
- assert isinstance(index, Int64Index)
130
+ assert isinstance(index, (Index, Int64Index))
116
131
  assert index.dtype == np.int64
117
132
  assert index.name == pd_df.index.name
118
- assert isinstance(index.index_value.value, IndexValue.Int64Index)
133
+ assert isinstance(
134
+ index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
135
+ )
119
136
 
120
137
  t = mt.random.rand(10, chunk_size=6)
121
138
  index = from_tileable(t, name="new_name")
122
139
 
123
- assert isinstance(index, Float64Index)
140
+ assert isinstance(index, (Index, Float64Index))
124
141
  assert index.dtype == np.float64
125
142
  assert index.name == "new_name"
126
- assert isinstance(index.index_value.value, IndexValue.Float64Index)
143
+ assert isinstance(
144
+ index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
145
+ )
127
146
 
128
147
 
129
148
  def test_from_tensor():
@@ -295,6 +314,15 @@ def test_from_odps_table():
295
314
  ),
296
315
  )
297
316
 
317
+ out_idx = read_odps_table(
318
+ test_table,
319
+ columns=[],
320
+ index_col=["col1", "col2"],
321
+ output_type=OutputType.index,
322
+ )
323
+ assert out_idx.names == ["col1", "col2"]
324
+ assert out_idx.shape == (np.nan,)
325
+
298
326
  test_table.drop()
299
327
  test_parted_table.drop()
300
328
 
@@ -316,7 +344,10 @@ def test_from_odps_query():
316
344
  odps_entry.write_table(test_table2, [["A", 10, 4.5]])
317
345
 
318
346
  with pytest.raises(ValueError) as err_info:
319
- read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
347
+ read_odps_query(
348
+ f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
349
+ f"AS SELECT * FROM {table1_name}"
350
+ )
320
351
  assert "instant query" in err_info.value.args[0]
321
352
 
322
353
  query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
@@ -332,6 +363,10 @@ def test_from_odps_query():
332
363
  ),
333
364
  )
334
365
 
366
+ df = read_odps_query(query1, skip_schema=True)
367
+ assert df.dtypes is None
368
+ assert df.columns_value is None
369
+
335
370
  df = read_odps_query(query1, index_col="col1")
336
371
  assert df.op.query == query1
337
372
  assert df.index_value.name == "col1"
@@ -387,7 +422,9 @@ def test_date_range():
387
422
 
388
423
 
389
424
  def test_resolve_task_sector():
390
- input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
425
+ input_path = os.path.join(
426
+ os.path.dirname(__file__), "test-data", "task-input-full.txt"
427
+ )
391
428
  with open(input_path, "r") as f:
392
429
  sector = f.read()
393
430
  actual_sector = _resolve_task_sector("job0", sector)
@@ -399,3 +436,61 @@ def test_resolve_task_sector():
399
436
  assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
400
437
  assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
401
438
  assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
439
+
440
+
441
+ def test_resolve_task_odps2():
442
+ input_path = os.path.join(
443
+ os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
444
+ )
445
+ with open(input_path, "r") as f:
446
+ sector = f.read()
447
+ actual_sector = _resolve_task_sector("job0", sector)
448
+
449
+ assert actual_sector.job_name == "job0"
450
+ assert actual_sector.task_name == "M1"
451
+ assert actual_sector.output_target == "Screen"
452
+ assert len(actual_sector.schema) == 2
453
+ assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
454
+ assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
455
+
456
+
457
+ def test_resolve_simple_explain():
458
+ input_path = os.path.join(
459
+ os.path.dirname(__file__), "test-data", "task-input-simple.txt"
460
+ )
461
+ with open(input_path, "r") as f:
462
+ sector = f.read()
463
+
464
+ schema = _parse_simple_explain(sector)
465
+ assert schema.columns[0].name == "memberid"
466
+ assert schema.columns[0].type == odps_types.string
467
+ assert schema.columns[1].name == "createdate"
468
+ assert schema.columns[1].type == odps_types.bigint
469
+
470
+
471
+ def test_resolve_conditional():
472
+ input_path = os.path.join(
473
+ os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
474
+ )
475
+ with open(input_path, "r") as f:
476
+ sector = f.read()
477
+
478
+ expected_col_types = {
479
+ "cs1": "string",
480
+ "cs2": "string",
481
+ "ci1": "bigint",
482
+ "cs3": "string",
483
+ "cs4": "string",
484
+ "cs5": "string",
485
+ "cs6": "string",
486
+ "cs7": "string",
487
+ "cs8": "string",
488
+ "ci2": "int",
489
+ "ci3": "bigint",
490
+ "cs9": "string",
491
+ }
492
+
493
+ schema = _parse_full_explain(sector)
494
+ for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
495
+ assert col.name == exp_nm
496
+ assert col.type == odps_types.validate_data_type(exp_tp)
@@ -0,0 +1,48 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pytest
16
+
17
+ from ... import DataFrame
18
+ from ..to_odps import to_odps_table
19
+
20
+
21
+ @pytest.fixture
22
+ def df():
23
+ return DataFrame({"A": [1, 2], "B": [3, 4]})
24
+
25
+
26
+ @pytest.mark.parametrize(
27
+ "kwargs",
28
+ [
29
+ {"partition_col": ["A", "C"]},
30
+ {"partition_col": "C"},
31
+ {"partition": "a=1,C=2"},
32
+ ],
33
+ )
34
+ def test_to_odps_table_validation(df, kwargs):
35
+ with pytest.raises(ValueError):
36
+ to_odps_table(df, "test_table", **kwargs)
37
+
38
+
39
+ @pytest.mark.parametrize(
40
+ "kwargs",
41
+ [
42
+ {"partition_col": ["a", "B"]},
43
+ {"partition_col": "a"},
44
+ {"partition": "C=1,d=2"},
45
+ ],
46
+ )
47
+ def test_to_odps_table_vaild(df, kwargs):
48
+ to_odps_table(df, "test_table", **kwargs)
@@ -17,11 +17,14 @@
17
17
  import logging
18
18
  from typing import List, Optional, Union
19
19
 
20
+ from odps import ODPS
20
21
  from odps.models import Table as ODPSTable
22
+ from odps.types import PartitionSpec
21
23
 
22
24
  from ... import opcodes
23
25
  from ...config import options
24
26
  from ...core import OutputType
27
+ from ...io.odpsio import build_dataframe_table_meta
25
28
  from ...serialization.serializables import (
26
29
  BoolField,
27
30
  FieldTypes,
@@ -134,8 +137,14 @@ def to_odps_table(
134
137
  --------
135
138
 
136
139
  """
140
+ odps_entry = ODPS.from_global() or ODPS.from_environments()
137
141
  if isinstance(table, ODPSTable):
138
142
  table = table.full_table_name
143
+ elif options.session.enable_schema and "." not in table:
144
+ default_schema = (
145
+ options.session.default_schema or odps_entry.schema or "default"
146
+ )
147
+ table = default_schema + "." + table
139
148
 
140
149
  if isinstance(index_label, str):
141
150
  index_label = [index_label]
@@ -147,6 +156,25 @@ def to_odps_table(
147
156
  f"index_label needs {len(df.index.nlevels)} labels "
148
157
  f"but it only have {len(index_label)}"
149
158
  )
159
+ table_cols = set(build_dataframe_table_meta(df).table_column_names)
160
+ if partition:
161
+ partition_intersect = (
162
+ set(x.lower() for x in PartitionSpec(partition).keys()) & table_cols
163
+ )
164
+ if partition_intersect:
165
+ raise ValueError(
166
+ f"Data column(s) {partition_intersect} in the dataframe"
167
+ " cannot be used in parameter 'partition'."
168
+ " Use 'partition_col' instead."
169
+ )
170
+
171
+ if partition_col:
172
+ partition_diff = set(x.lower() for x in partition_col) - table_cols
173
+ if partition_diff:
174
+ raise ValueError(
175
+ f"Partition column(s) {partition_diff}"
176
+ " is not the data column(s) of the input dataframe."
177
+ )
150
178
 
151
179
  op = DataFrameToODPSTable(
152
180
  dtypes=df.dtypes,
@@ -18,6 +18,8 @@ from .accessor import (
18
18
  IndexMaxFrameAccessor,
19
19
  SeriesMaxFrameAccessor,
20
20
  )
21
+ from .flatjson import series_flatjson
22
+ from .flatmap import df_flatmap, series_flatmap
21
23
  from .reshuffle import DataFrameReshuffle, df_reshuffle
22
24
 
23
25
 
@@ -25,6 +27,9 @@ def _install():
25
27
  from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
26
28
 
27
29
  DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
30
+ DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
31
+ SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
32
+ SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
28
33
 
29
34
  if DataFrameMaxFrameAccessor._api_count:
30
35
  for t in DATAFRAME_TYPE: