maxframe 1.0.0rc4__cp37-cp37m-win32.whl → 1.1.0__cp37-cp37m-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (83)
  1. maxframe/_utils.cp37-win32.pyd +0 -0
  2. maxframe/config/config.py +3 -0
  3. maxframe/conftest.py +9 -2
  4. maxframe/core/graph/core.cp37-win32.pyd +0 -0
  5. maxframe/core/operator/base.py +2 -0
  6. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  7. maxframe/dataframe/core.py +24 -2
  8. maxframe/dataframe/datasource/read_odps_query.py +63 -34
  9. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  10. maxframe/dataframe/extensions/__init__.py +5 -0
  11. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  12. maxframe/dataframe/extensions/flatjson.py +131 -0
  13. maxframe/dataframe/extensions/flatmap.py +28 -40
  14. maxframe/dataframe/extensions/reshuffle.py +1 -1
  15. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  16. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  17. maxframe/dataframe/groupby/__init__.py +1 -0
  18. maxframe/dataframe/groupby/aggregation.py +1 -0
  19. maxframe/dataframe/groupby/apply.py +9 -1
  20. maxframe/dataframe/groupby/core.py +1 -1
  21. maxframe/dataframe/groupby/fill.py +4 -1
  22. maxframe/dataframe/groupby/getitem.py +6 -0
  23. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  24. maxframe/dataframe/groupby/transform.py +8 -2
  25. maxframe/dataframe/indexing/loc.py +6 -4
  26. maxframe/dataframe/merge/__init__.py +9 -1
  27. maxframe/dataframe/merge/concat.py +41 -31
  28. maxframe/dataframe/merge/merge.py +1 -1
  29. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  30. maxframe/dataframe/misc/apply.py +3 -0
  31. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  32. maxframe/dataframe/misc/map.py +3 -1
  33. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  34. maxframe/dataframe/misc/transform.py +22 -13
  35. maxframe/dataframe/reduction/__init__.py +3 -0
  36. maxframe/dataframe/reduction/aggregation.py +1 -0
  37. maxframe/dataframe/reduction/median.py +56 -0
  38. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  39. maxframe/dataframe/statistics/quantile.py +8 -2
  40. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  41. maxframe/dataframe/tests/test_utils.py +60 -0
  42. maxframe/dataframe/utils.py +110 -7
  43. maxframe/dataframe/window/expanding.py +5 -3
  44. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  45. maxframe/io/objects/tests/test_object_io.py +39 -12
  46. maxframe/io/odpsio/arrow.py +30 -2
  47. maxframe/io/odpsio/schema.py +23 -5
  48. maxframe/io/odpsio/tableio.py +26 -110
  49. maxframe/io/odpsio/tests/test_schema.py +40 -0
  50. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  51. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  52. maxframe/io/odpsio/volumeio.py +27 -3
  53. maxframe/learn/contrib/__init__.py +3 -2
  54. maxframe/learn/contrib/llm/__init__.py +16 -0
  55. maxframe/learn/contrib/llm/core.py +54 -0
  56. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  57. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  58. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  59. maxframe/learn/contrib/llm/text.py +42 -0
  60. maxframe/lib/mmh3.cp37-win32.pyd +0 -0
  61. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  62. maxframe/opcodes.py +7 -1
  63. maxframe/serialization/core.cp37-win32.pyd +0 -0
  64. maxframe/serialization/core.pyx +13 -1
  65. maxframe/serialization/pandas.py +50 -20
  66. maxframe/serialization/serializables/core.py +24 -5
  67. maxframe/serialization/serializables/field_type.py +4 -1
  68. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  69. maxframe/serialization/tests/test_serial.py +2 -1
  70. maxframe/tensor/__init__.py +19 -7
  71. maxframe/tests/utils.py +16 -0
  72. maxframe/udf.py +27 -0
  73. maxframe/utils.py +36 -8
  74. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  75. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/RECORD +83 -72
  76. maxframe_client/clients/framedriver.py +4 -1
  77. maxframe_client/fetcher.py +18 -2
  78. maxframe_client/session/odps.py +23 -10
  79. maxframe_client/session/task.py +2 -24
  80. maxframe_client/session/tests/test_task.py +0 -4
  81. maxframe_client/tests/test_session.py +30 -10
  82. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/WHEEL +0 -0
  83. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp37-win32.pyd CHANGED
Binary file
maxframe/config/config.py CHANGED
@@ -380,6 +380,9 @@ default_options.register_option(
380
380
  default_options.register_option(
381
381
  "session.enable_schema", None, validator=is_null | is_bool, remote=True
382
382
  )
383
+ default_options.register_option(
384
+ "session.enable_high_availability", None, validator=is_null | is_bool, remote=True
385
+ )
383
386
  default_options.register_option(
384
387
  "session.default_schema", None, validator=is_null | is_string, remote=True
385
388
  )
maxframe/conftest.py CHANGED
@@ -126,7 +126,14 @@ def oss_config():
126
126
  oss_rolearn = config.get("oss", "rolearn")
127
127
 
128
128
  options.service_role_arn = oss_rolearn
129
- options.object_cache_url = f"oss://{oss_endpoint}/{oss_bucket_name}"
129
+ if "test" in oss_endpoint:
130
+ oss_svc_endpoint = oss_endpoint
131
+ else:
132
+ endpoint_parts = oss_endpoint.split(".", 1)
133
+ if "-internal" not in endpoint_parts[0]:
134
+ endpoint_parts[0] += "-internal"
135
+ oss_svc_endpoint = ".".join(endpoint_parts)
136
+ options.object_cache_url = f"oss://{oss_svc_endpoint}/{oss_bucket_name}"
130
137
 
131
138
  config.oss_config = (
132
139
  oss_access_id,
@@ -141,7 +148,7 @@ def oss_config():
141
148
  config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
142
149
  config.oss_rolearn = oss_rolearn
143
150
  yield config
144
- except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError):
151
+ except (NoSectionError, NoOptionError, ImportError):
145
152
  return None
146
153
  finally:
147
154
  options.service_role_arn = old_role_arn
maxframe/core/graph/core.cp37-win32.pyd CHANGED
Binary file
maxframe/core/operator/base.py CHANGED
@@ -86,6 +86,8 @@ class SchedulingHint(Serializable):
86
86
  # `gpu` indicates that if the operator should be executed on the GPU.
87
87
  gpu = BoolField("gpu", default=None)
88
88
  priority = Int32Field("priority", default=None)
89
+ expect_engine = StringField("expect_engine", default=None)
90
+ expect_resources = DictField("expect_resources", FieldTypes.string, default=None)
89
91
 
90
92
  @classproperty
91
93
  @lru_cache(1)
maxframe/dataframe/arithmetic/tests/test_arithmetic.py CHANGED
@@ -22,6 +22,7 @@ import pandas as pd
22
22
  import pytest
23
23
 
24
24
  from ....core import OperatorType
25
+ from ....tests.utils import assert_mf_index_dtype
25
26
  from ....utils import dataslots
26
27
  from ...core import IndexValue
27
28
  from ...datasource.dataframe import from_pandas
@@ -164,7 +165,7 @@ def test_without_shuffle(func_name, func_opts):
164
165
  pd.testing.assert_index_equal(
165
166
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
166
167
  )
167
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
168
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
168
169
  pd.testing.assert_index_equal(
169
170
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
170
171
  )
@@ -176,7 +177,7 @@ def test_without_shuffle(func_name, func_opts):
176
177
  pd.testing.assert_index_equal(
177
178
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
178
179
  )
179
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
180
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
180
181
  pd.testing.assert_index_equal(
181
182
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
182
183
  )
@@ -370,7 +371,7 @@ def test_with_one_shuffle(func_name, func_opts):
370
371
  pd.testing.assert_index_equal(
371
372
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
372
373
  )
373
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
374
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
374
375
  pd.testing.assert_index_equal(
375
376
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
376
377
  )
@@ -403,7 +404,7 @@ def test_with_all_shuffle(func_name, func_opts):
403
404
  pd.testing.assert_index_equal(
404
405
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
405
406
  )
406
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
407
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
407
408
  pd.testing.assert_index_equal(
408
409
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
409
410
  )
@@ -433,7 +434,7 @@ def test_with_all_shuffle(func_name, func_opts):
433
434
  pd.testing.assert_index_equal(
434
435
  df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns
435
436
  )
436
- assert isinstance(df6.index_value.value, IndexValue.Int64Index)
437
+ assert_mf_index_dtype(df6.index_value.value, np.int64)
437
438
  pd.testing.assert_index_equal(
438
439
  df6.index_value.to_pandas(), pd.Index([], dtype=np.int64)
439
440
  )
@@ -468,7 +469,7 @@ def test_without_shuffle_and_with_one_chunk(func_name, func_opts):
468
469
  pd.testing.assert_index_equal(
469
470
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
470
471
  )
471
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
472
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
472
473
  pd.testing.assert_index_equal(
473
474
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
474
475
  )
@@ -501,7 +502,7 @@ def test_both_one_chunk(func_name, func_opts):
501
502
  pd.testing.assert_index_equal(
502
503
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
503
504
  )
504
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
505
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
505
506
  pd.testing.assert_index_equal(
506
507
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
507
508
  )
@@ -534,7 +535,7 @@ def test_with_shuffle_and_one_chunk(func_name, func_opts):
534
535
  pd.testing.assert_index_equal(
535
536
  df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
536
537
  )
537
- assert isinstance(df3.index_value.value, IndexValue.Int64Index)
538
+ assert_mf_index_dtype(df3.index_value.value, np.int64)
538
539
  pd.testing.assert_index_equal(
539
540
  df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
540
541
  )
@@ -558,7 +559,7 @@ def test_on_same_dataframe(func_name, func_opts):
558
559
  pd.testing.assert_index_equal(
559
560
  df2.columns_value.to_pandas(), func_opts.func(data, data).columns
560
561
  )
561
- assert isinstance(df2.index_value.value, IndexValue.Int64Index)
562
+ assert_mf_index_dtype(df2.index_value.value, np.int64)
562
563
  pd.testing.assert_index_equal(
563
564
  df2.index_value.to_pandas(), pd.Index([], dtype=np.int64)
564
565
  )
@@ -590,19 +591,19 @@ def test_dataframe_and_scalar(func_name, func_opts):
590
591
  pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
591
592
 
592
593
  pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns)
593
- assert isinstance(result.index_value.value, IndexValue.Int64Index)
594
+ assert_mf_index_dtype(result.index_value.value, np.int64)
594
595
 
595
596
  pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns)
596
- assert isinstance(result2.index_value.value, IndexValue.Int64Index)
597
+ assert_mf_index_dtype(result2.index_value.value, np.int64)
597
598
 
598
599
  pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns)
599
- assert isinstance(result3.index_value.value, IndexValue.Int64Index)
600
+ assert_mf_index_dtype(result3.index_value.value, np.int64)
600
601
 
601
602
  pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns)
602
- assert isinstance(result4.index_value.value, IndexValue.Int64Index)
603
+ assert_mf_index_dtype(result4.index_value.value, np.int64)
603
604
 
604
605
  pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns)
605
- assert isinstance(result5.index_value.value, IndexValue.Int64Index)
606
+ assert_mf_index_dtype(result5.index_value.value, np.int64)
606
607
 
607
608
  if "builtin_function_or_method" not in str(type(func_opts.func)):
608
609
  # skip NotImplemented test for comparison function
@@ -679,7 +680,7 @@ def test_abs():
679
680
  pd.testing.assert_index_equal(
680
681
  df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
681
682
  )
682
- assert isinstance(df2.index_value.value, IndexValue.Int64Index)
683
+ assert_mf_index_dtype(df2.index_value.value, np.int64)
683
684
  assert df2.shape == (10, 10)
684
685
 
685
686
 
@@ -697,7 +698,7 @@ def test_not():
697
698
  pd.testing.assert_index_equal(
698
699
  df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
699
700
  )
700
- assert isinstance(df2.index_value.value, IndexValue.Int64Index)
701
+ assert_mf_index_dtype(df2.index_value.value, np.int64)
701
702
  assert df2.shape == (10, 10)
702
703
 
703
704
 
maxframe/dataframe/core.py CHANGED
@@ -142,6 +142,14 @@ class IndexValue(Serializable):
142
142
  _data = NDArrayField("data")
143
143
  _dtype = DataTypeField("dtype")
144
144
 
145
+ @property
146
+ def dtype(self):
147
+ return getattr(self, "_dtype", None)
148
+
149
+ @property
150
+ def inferred_type(self):
151
+ return "floating" if self.dtype.kind == "f" else "integer"
152
+
145
153
  class RangeIndex(IndexBase):
146
154
  _name = AnyField("name")
147
155
  _slice = SliceField("slice")
@@ -243,6 +251,10 @@ class IndexValue(Serializable):
243
251
  _data = NDArrayField("data")
244
252
  _dtype = DataTypeField("dtype")
245
253
 
254
+ @property
255
+ def dtype(self):
256
+ return getattr(self, "_dtype", None)
257
+
246
258
  @property
247
259
  def inferred_type(self):
248
260
  return "integer"
@@ -254,6 +266,10 @@ class IndexValue(Serializable):
254
266
  _data = NDArrayField("data")
255
267
  _dtype = DataTypeField("dtype")
256
268
 
269
+ @property
270
+ def dtype(self):
271
+ return getattr(self, "_dtype", None)
272
+
257
273
  @property
258
274
  def inferred_type(self):
259
275
  return "integer"
@@ -265,6 +281,10 @@ class IndexValue(Serializable):
265
281
  _data = NDArrayField("data")
266
282
  _dtype = DataTypeField("dtype")
267
283
 
284
+ @property
285
+ def dtype(self):
286
+ return getattr(self, "_dtype", None)
287
+
268
288
  @property
269
289
  def inferred_type(self):
270
290
  return "floating"
@@ -1514,8 +1534,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
1514
1534
  refresh_index_value(self)
1515
1535
  refresh_dtypes(self)
1516
1536
 
1517
- def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
1518
- dtypes = table_meta.pd_column_dtypes
1537
+ def refresh_from_dtypes(self, dtypes: pd.Series) -> None:
1519
1538
  self._dtypes = dtypes
1520
1539
  self._columns_value = parse_index(dtypes.index, store_data=True)
1521
1540
  self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
@@ -1523,6 +1542,9 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
1523
1542
  new_shape[-1] = len(dtypes)
1524
1543
  self._shape = tuple(new_shape)
1525
1544
 
1545
+ def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
1546
+ self.refresh_from_dtypes(table_meta.pd_column_dtypes)
1547
+
1526
1548
  @property
1527
1549
  def dtypes(self):
1528
1550
  dt = getattr(self, "_dtypes", None)
maxframe/dataframe/datasource/read_odps_query.py CHANGED
@@ -57,7 +57,7 @@ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|
57
57
  _ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
58
58
 
59
59
  _SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
60
- _SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")
60
+ _SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
61
61
 
62
62
 
63
63
  @dataclasses.dataclass
@@ -180,23 +180,30 @@ def _parse_full_explain(explain_string: str) -> OdpsSchema:
180
180
 
181
181
  job_dag = jobs_sector.build_dag()
182
182
  indep_job_names = list(job_dag.iter_indep(reverse=True))
183
- if len(indep_job_names) > 1: # pragma: no cover
184
- raise ValueError("Only one final job is allowed in SQL statement")
185
-
186
- tasks_sector = jobs_sector.jobs[indep_job_names[0]]
187
- task_dag = tasks_sector.build_dag()
188
- indep_task_names = list(task_dag.iter_indep(reverse=True))
189
- if len(indep_task_names) > 1: # pragma: no cover
183
+ schema_signatures = dict()
184
+ for job_name in indep_job_names:
185
+ tasks_sector = jobs_sector.jobs[job_name]
186
+ task_dag = tasks_sector.build_dag()
187
+ indep_task_names = list(task_dag.iter_indep(reverse=True))
188
+ for task_name in indep_task_names:
189
+ task_sector = tasks_sector.tasks[task_name]
190
+ if not task_sector.schema: # pragma: no cover
191
+ raise ValueError("Cannot detect output schema")
192
+ if task_sector.output_target != "Screen":
193
+ raise ValueError("The SQL statement should be an instant query")
194
+ sig_tuples = sorted(
195
+ [
196
+ (c.column_alias or c.column_name, c.column_type)
197
+ for c in task_sector.schema
198
+ ]
199
+ )
200
+ schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
201
+ if len(schema_signatures) != 1:
190
202
  raise ValueError("Only one final task is allowed in SQL statement")
191
-
192
- task_sector = tasks_sector.tasks[indep_task_names[0]]
193
- if not task_sector.schema: # pragma: no cover
194
- raise ValueError("Cannot detect output schema")
195
- if task_sector.output_target != "Screen":
196
- raise ValueError("The SQL statement should be an instant query")
203
+ schema = list(schema_signatures.values())[0]
197
204
  cols = [
198
205
  Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
199
- for c in task_sector.schema
206
+ for c in schema
200
207
  ]
201
208
  return OdpsSchema(cols)
202
209
 
@@ -209,7 +216,7 @@ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
209
216
  fields_str = fields_match.group(1)
210
217
  cols = []
211
218
  for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
212
- cols.append(Column(field, validate_data_type(type_name)))
219
+ cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
213
220
  return OdpsSchema(cols)
214
221
 
215
222
 
@@ -257,12 +264,18 @@ class DataFrameReadODPSQuery(
257
264
  )
258
265
  index_value = parse_index(idx)
259
266
 
260
- columns_value = parse_index(self.dtypes.index, store_data=True)
267
+ if self.dtypes is not None:
268
+ columns_value = parse_index(self.dtypes.index, store_data=True)
269
+ shape = (np.nan, len(self.dtypes))
270
+ else:
271
+ columns_value = None
272
+ shape = (np.nan, np.nan)
273
+
261
274
  self.output_types = [OutputType.dataframe]
262
275
  return self.new_tileable(
263
276
  [],
264
277
  None,
265
- shape=(len(self.dtypes), np.nan),
278
+ shape=shape,
266
279
  dtypes=self.dtypes,
267
280
  index_value=index_value,
268
281
  columns_value=columns_value,
@@ -278,6 +291,7 @@ def read_odps_query(
278
291
  string_as_binary: bool = None,
279
292
  sql_hints: Dict[str, str] = None,
280
293
  anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
294
+ skip_schema: bool = False,
281
295
  **kw,
282
296
  ):
283
297
  """
@@ -298,6 +312,10 @@ def read_odps_query(
298
312
  User specified SQL hints.
299
313
  anonymous_col_prefix: str, optional
300
314
  Prefix for anonymous columns, '_anon_col_' by default.
315
+ skip_schema: bool, optional
316
+ Skip resolving output schema before execution. Once this is configured,
317
+ the output DataFrame cannot be inputs of other DataFrame operators
318
+ before execution.
301
319
 
302
320
  Returns
303
321
  -------
@@ -319,28 +337,39 @@ def read_odps_query(
319
337
 
320
338
  if odps_entry is None:
321
339
  raise ValueError("Missing odps_entry parameter")
322
- inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
323
- logger.debug("Explain instance ID: %s", inst.id)
324
- explain_str = list(inst.get_task_results().values())[0]
325
340
 
326
- odps_schema = _parse_explained_schema(explain_str)
327
-
328
- new_columns = []
329
341
  col_renames = {}
330
- for col in odps_schema.columns:
331
- anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
332
- if anon_match and col.name not in query:
333
- new_name = anonymous_col_prefix + anon_match.group(1)
334
- col_renames[col.name] = new_name
335
- new_columns.append(Column(new_name, col.type))
336
- else:
337
- new_columns.append(col)
338
-
339
- dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
342
+ if not skip_schema:
343
+ inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
344
+ logger.debug("Explain instance ID: %s", inst.id)
345
+ explain_str = list(inst.get_task_results().values())[0]
346
+
347
+ try:
348
+ odps_schema = _parse_explained_schema(explain_str)
349
+ except ValueError as ex:
350
+ exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
351
+ raise exc.with_traceback(ex.__traceback__) from None
352
+
353
+ new_columns = []
354
+ for col in odps_schema.columns:
355
+ anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
356
+ if anon_match and col.name not in query:
357
+ new_name = anonymous_col_prefix + anon_match.group(1)
358
+ col_renames[col.name] = new_name
359
+ new_columns.append(Column(new_name, col.type))
360
+ else:
361
+ new_columns.append(col)
362
+
363
+ dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
364
+ else:
365
+ dtypes = None
340
366
 
341
367
  if not index_col:
342
368
  index_dtypes = None
343
369
  else:
370
+ if dtypes is None:
371
+ raise ValueError("Cannot configure index_col when skip_schema is True")
372
+
344
373
  if isinstance(index_col, str):
345
374
  index_col = [index_col]
346
375
  index_col_set = set(index_col)
maxframe/dataframe/datasource/tests/test_datasource.py CHANGED
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import os
16
+ import uuid
16
17
  from collections import OrderedDict
17
18
 
18
19
  import numpy as np
@@ -26,7 +27,14 @@ from ....core import OutputType
26
27
  from ....tests.utils import tn
27
28
  from ....utils import lazy_import
28
29
  from ... import read_odps_query, read_odps_table
29
- from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index, MultiIndex
30
+ from ...core import (
31
+ DatetimeIndex,
32
+ Float64Index,
33
+ Index,
34
+ IndexValue,
35
+ Int64Index,
36
+ MultiIndex,
37
+ )
30
38
  from ..dataframe import from_pandas as from_pandas_df
31
39
  from ..date_range import date_range
32
40
  from ..from_tensor import (
@@ -36,7 +44,12 @@ from ..from_tensor import (
36
44
  )
37
45
  from ..index import from_pandas as from_pandas_index
38
46
  from ..index import from_tileable
39
- from ..read_odps_query import ColumnSchema, _parse_simple_explain, _resolve_task_sector
47
+ from ..read_odps_query import (
48
+ ColumnSchema,
49
+ _parse_full_explain,
50
+ _parse_simple_explain,
51
+ _resolve_task_sector,
52
+ )
40
53
  from ..series import from_pandas as from_pandas_series
41
54
 
42
55
  ray = lazy_import("ray")
@@ -114,18 +127,22 @@ def test_from_tileable_index():
114
127
 
115
128
  for o in [df, df[0]]:
116
129
  index = o.index
117
- assert isinstance(index, Int64Index)
130
+ assert isinstance(index, (Index, Int64Index))
118
131
  assert index.dtype == np.int64
119
132
  assert index.name == pd_df.index.name
120
- assert isinstance(index.index_value.value, IndexValue.Int64Index)
133
+ assert isinstance(
134
+ index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
135
+ )
121
136
 
122
137
  t = mt.random.rand(10, chunk_size=6)
123
138
  index = from_tileable(t, name="new_name")
124
139
 
125
- assert isinstance(index, Float64Index)
140
+ assert isinstance(index, (Index, Float64Index))
126
141
  assert index.dtype == np.float64
127
142
  assert index.name == "new_name"
128
- assert isinstance(index.index_value.value, IndexValue.Float64Index)
143
+ assert isinstance(
144
+ index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
145
+ )
129
146
 
130
147
 
131
148
  def test_from_tensor():
@@ -327,7 +344,10 @@ def test_from_odps_query():
327
344
  odps_entry.write_table(test_table2, [["A", 10, 4.5]])
328
345
 
329
346
  with pytest.raises(ValueError) as err_info:
330
- read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
347
+ read_odps_query(
348
+ f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
349
+ f"AS SELECT * FROM {table1_name}"
350
+ )
331
351
  assert "instant query" in err_info.value.args[0]
332
352
 
333
353
  query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
@@ -343,6 +363,10 @@ def test_from_odps_query():
343
363
  ),
344
364
  )
345
365
 
366
+ df = read_odps_query(query1, skip_schema=True)
367
+ assert df.dtypes is None
368
+ assert df.columns_value is None
369
+
346
370
  df = read_odps_query(query1, index_col="col1")
347
371
  assert df.op.query == query1
348
372
  assert df.index_value.name == "col1"
@@ -442,3 +466,31 @@ def test_resolve_simple_explain():
442
466
  assert schema.columns[0].type == odps_types.string
443
467
  assert schema.columns[1].name == "createdate"
444
468
  assert schema.columns[1].type == odps_types.bigint
469
+
470
+
471
+ def test_resolve_conditional():
472
+ input_path = os.path.join(
473
+ os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
474
+ )
475
+ with open(input_path, "r") as f:
476
+ sector = f.read()
477
+
478
+ expected_col_types = {
479
+ "cs1": "string",
480
+ "cs2": "string",
481
+ "ci1": "bigint",
482
+ "cs3": "string",
483
+ "cs4": "string",
484
+ "cs5": "string",
485
+ "cs6": "string",
486
+ "cs7": "string",
487
+ "cs8": "string",
488
+ "ci2": "int",
489
+ "ci3": "bigint",
490
+ "cs9": "string",
491
+ }
492
+
493
+ schema = _parse_full_explain(sector)
494
+ for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
495
+ assert col.name == exp_nm
496
+ assert col.type == odps_types.validate_data_type(exp_tp)
maxframe/dataframe/extensions/__init__.py CHANGED
@@ -18,6 +18,8 @@ from .accessor import (
18
18
  IndexMaxFrameAccessor,
19
19
  SeriesMaxFrameAccessor,
20
20
  )
21
+ from .apply_chunk import df_apply_chunk, series_apply_chunk
22
+ from .flatjson import series_flatjson
21
23
  from .flatmap import df_flatmap, series_flatmap
22
24
  from .reshuffle import DataFrameReshuffle, df_reshuffle
23
25
 
@@ -27,7 +29,10 @@ def _install():
27
29
 
28
30
  DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
29
31
  DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
32
+ DataFrameMaxFrameAccessor._register("apply_chunk", df_apply_chunk)
30
33
  SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
34
+ SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
35
+ SeriesMaxFrameAccessor._register("apply_chunk", series_apply_chunk)
31
36
 
32
37
  if DataFrameMaxFrameAccessor._api_count:
33
38
  for t in DATAFRAME_TYPE: