maxframe-1.0.0rc4-cp38-cp38-macosx_10_9_universal2.whl → maxframe-1.1.1-cp38-cp38-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)
  1. maxframe/_utils.cpython-38-darwin.so +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cpython-38-darwin.so +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cpython-38-darwin.so +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +2 -2
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/_utils.cpython-38-darwin.so CHANGED
Binary file

maxframe/config/__init__.py CHANGED
@@ -12,4 +12,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
- from .config import AttributeDict, option_context, options
+ from .config import AttributeDict, option_context, options, update_wlm_quota_settings
maxframe/config/config.py CHANGED
@@ -28,6 +28,8 @@ except ImportError:
 
  available_timezones = lambda: all_timezones
 
+ import logging
+
  from ..utils import get_python_tag
  from .validators import (
      ValidatorType,
@@ -43,6 +45,8 @@ from .validators import (
      is_valid_cache_path,
  )
 
+ logger = logging.getLogger(__name__)
+
  _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and might be removed in a future release."
  _DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
  _DEFAULT_MAX_IDLE_SECONDS = 3600
@@ -380,6 +384,9 @@ default_options.register_option(
  default_options.register_option(
      "session.enable_schema", None, validator=is_null | is_bool, remote=True
  )
+ default_options.register_option(
+     "session.enable_high_availability", None, validator=is_null | is_bool, remote=True
+ )
  default_options.register_option(
      "session.default_schema", None, validator=is_null | is_string, remote=True
  )
@@ -496,3 +503,22 @@ class OptionsProxy:
 
 
  options = OptionsProxy()
+
+
+ def update_wlm_quota_settings(session_id: str, engine_settings: Dict[str, Any]):
+     engine_quota = engine_settings.get("odps.task.wlm.quota", None)
+     session_quota = options.session.quota_name or None
+     if engine_quota != session_quota and engine_quota:
+         logger.warning(
+             "[Session=%s] Session quota (%s) is different to SubDag engine quota (%s)",
+             session_id,
+             session_quota,
+             engine_quota,
+         )
+         # TODO(renxiang): overwrite or not overwrite
+         return
+
+     if session_quota:
+         engine_settings["odps.task.wlm.quota"] = session_quota
+     elif "odps.task.wlm.quota" in engine_settings:
+         engine_settings.pop("odps.task.wlm.quota")
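
The new update_wlm_quota_settings helper reconciles the session-level WLM quota (options.session.quota_name) with the quota pinned in a SubDag's engine settings: conflicting quotas are only logged for now (the overwrite policy is still a TODO), while a missing engine quota is filled in from the session. A minimal usage sketch, with made-up quota names and session ID:

# Minimal usage sketch for update_wlm_quota_settings; the quota names and the
# session ID are made up. `engine_settings` is a plain dict of ODPS hints that
# the helper mutates in place.
from maxframe.config import option_context, options, update_wlm_quota_settings

with option_context({}):
    options.session.quota_name = "interactive_quota"

    engine_settings = {"odps.task.wlm.quota": "batch_quota"}
    update_wlm_quota_settings("my-session-id", engine_settings)
    # Conflicting quotas: a warning is logged and the dict is left untouched.
    assert engine_settings["odps.task.wlm.quota"] == "batch_quota"

    engine_settings = {}
    update_wlm_quota_settings("my-session-id", engine_settings)
    # No engine quota yet: the session quota is copied into the hints.
    assert engine_settings["odps.task.wlm.quota"] == "interactive_quota"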
maxframe/config/tests/test_config.py CHANGED
@@ -18,7 +18,14 @@ import threading
 
  import pytest
 
- from ..config import Config, is_integer, is_string, option_context, options
+ from ..config import (
+     Config,
+     is_integer,
+     is_string,
+     option_context,
+     options,
+     update_wlm_quota_settings,
+ )
 
 
  def test_config_context():
@@ -101,3 +108,15 @@ def test_config_copy():
 
      target_cfg.update(src_cfg_dict)
      assert target_cfg.a.b.c == 1
+
+
+ def test_update_wlm_quota_settings():
+     with option_context({}):
+         options.session.quota_name = "quota1"
+         engine_settings = {}
+         update_wlm_quota_settings("session_id", engine_settings)
+         assert engine_settings["odps.task.wlm.quota"] == "quota1"
+         options.session.quota_name = None
+         update_wlm_quota_settings("session_id", engine_settings)
+         # TODO(renxiang): overwrite or not overwrite
+         assert "odps.task.wlm.quota" in engine_settings
maxframe/conftest.py CHANGED
@@ -40,10 +40,14 @@ def _get_odps_env(test_config: ConfigParser, section_name: str) -> ODPS:
          access_id = test_config.get(section_name, "access_id")
      except NoOptionError:
          access_id = test_config.get("odps", "access_id")
+     if not access_id:
+         access_id = os.getenv("ACCESS_ID")
      try:
          secret_access_key = test_config.get(section_name, "secret_access_key")
      except NoOptionError:
          secret_access_key = test_config.get("odps", "secret_access_key")
+     if not secret_access_key:
+         secret_access_key = os.getenv("SECRET_ACCESS_KEY")
      try:
          project = test_config.get(section_name, "project")
      except NoOptionError:
@@ -119,14 +123,23 @@ def oss_config():
      old_cache_url = options.object_cache_url
 
      try:
-         oss_access_id = config.get("oss", "access_id")
-         oss_secret_access_key = config.get("oss", "secret_access_key")
+         oss_access_id = config.get("oss", "access_id") or os.getenv("ACCESS_ID")
+         oss_secret_access_key = config.get("oss", "secret_access_key") or os.getenv(
+             "SECRET_ACCESS_KEY"
+         )
          oss_bucket_name = config.get("oss", "bucket_name")
          oss_endpoint = config.get("oss", "endpoint")
          oss_rolearn = config.get("oss", "rolearn")
 
          options.service_role_arn = oss_rolearn
-         options.object_cache_url = f"oss://{oss_endpoint}/{oss_bucket_name}"
+         if "test" in oss_endpoint:
+             oss_svc_endpoint = oss_endpoint
+         else:
+             endpoint_parts = oss_endpoint.split(".", 1)
+             if "-internal" not in endpoint_parts[0]:
+                 endpoint_parts[0] += "-internal"
+             oss_svc_endpoint = ".".join(endpoint_parts)
+         options.object_cache_url = f"oss://{oss_svc_endpoint}/{oss_bucket_name}"
 
          config.oss_config = (
              oss_access_id,
@@ -141,7 +154,7 @@ def oss_config():
          config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
          config.oss_rolearn = oss_rolearn
          yield config
-     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError):
+     except (NoSectionError, NoOptionError, ImportError):
          return None
      finally:
          options.service_role_arn = old_role_arn
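
The oss_config fixture now rewrites the configured OSS endpoint to its VPC-internal form before building the object cache URL. Restated in isolation (endpoint values here are illustrative), the rewrite behaves like this:

# Standalone restatement of the endpoint rewrite above: the first DNS label
# gains an "-internal" suffix (the Alibaba Cloud VPC endpoint convention),
# while endpoints containing "test" pass through unchanged.
def to_internal_endpoint(oss_endpoint: str) -> str:
    if "test" in oss_endpoint:
        return oss_endpoint
    parts = oss_endpoint.split(".", 1)
    if "-internal" not in parts[0]:
        parts[0] += "-internal"
    return ".".join(parts)

assert (
    to_internal_endpoint("oss-cn-hangzhou.aliyuncs.com")
    == "oss-cn-hangzhou-internal.aliyuncs.com"
)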
maxframe/core/operator/base.py CHANGED
@@ -86,6 +86,8 @@ class SchedulingHint(Serializable):
      # `gpu` indicates that if the operator should be executed on the GPU.
      gpu = BoolField("gpu", default=None)
      priority = Int32Field("priority", default=None)
+     expect_engine = StringField("expect_engine", default=None)
+     expect_resources = DictField("expect_resources", FieldTypes.string, default=None)
 
      @classproperty
      @lru_cache(1)
maxframe/dataframe/arithmetic/tests/test_arithmetic.py CHANGED
@@ -22,6 +22,7 @@ import pandas as pd
  import pytest
 
  from ....core import OperatorType
+ from ....tests.utils import assert_mf_index_dtype
  from ....utils import dataslots
  from ...core import IndexValue
  from ...datasource.dataframe import from_pandas
@@ -164,7 +165,7 @@ def test_without_shuffle(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -176,7 +177,7 @@ def test_without_shuffle(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -370,7 +371,7 @@ def test_with_one_shuffle(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -403,7 +404,7 @@ def test_with_all_shuffle(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -433,7 +434,7 @@ def test_with_all_shuffle(func_name, func_opts):
      pd.testing.assert_index_equal(
          df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns
      )
-     assert isinstance(df6.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df6.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df6.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -468,7 +469,7 @@ def test_without_shuffle_and_with_one_chunk(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -501,7 +502,7 @@ def test_both_one_chunk(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -534,7 +535,7 @@ def test_with_shuffle_and_one_chunk(func_name, func_opts):
      pd.testing.assert_index_equal(
          df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns
      )
-     assert isinstance(df3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df3.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df3.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -558,7 +559,7 @@ def test_on_same_dataframe(func_name, func_opts):
      pd.testing.assert_index_equal(
          df2.columns_value.to_pandas(), func_opts.func(data, data).columns
      )
-     assert isinstance(df2.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df2.index_value.value, np.int64)
      pd.testing.assert_index_equal(
          df2.index_value.to_pandas(), pd.Index([], dtype=np.int64)
      )
@@ -590,19 +591,19 @@ def test_dataframe_and_scalar(func_name, func_opts):
      pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
 
      pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns)
-     assert isinstance(result.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(result.index_value.value, np.int64)
 
      pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns)
-     assert isinstance(result2.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(result2.index_value.value, np.int64)
 
      pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns)
-     assert isinstance(result3.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(result3.index_value.value, np.int64)
 
      pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns)
-     assert isinstance(result4.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(result4.index_value.value, np.int64)
 
      pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns)
-     assert isinstance(result5.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(result5.index_value.value, np.int64)
 
      if "builtin_function_or_method" not in str(type(func_opts.func)):
          # skip NotImplemented test for comparison function
@@ -679,7 +680,7 @@ def test_abs():
      pd.testing.assert_index_equal(
          df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
      )
-     assert isinstance(df2.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df2.index_value.value, np.int64)
      assert df2.shape == (10, 10)
 
 
@@ -697,7 +698,7 @@ def test_not():
      pd.testing.assert_index_equal(
          df2.columns_value.to_pandas(), df1.columns_value.to_pandas()
      )
-     assert isinstance(df2.index_value.value, IndexValue.Int64Index)
+     assert_mf_index_dtype(df2.index_value.value, np.int64)
      assert df2.shape == (10, 10)
 
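The sixteen assertion changes in this file all swap a class-based check (isinstance against IndexValue.Int64Index) for a dtype-based one through a new assert_mf_index_dtype helper; the helper itself lands in maxframe/tests/utils.py (+16, not expanded in this listing). A plausible sketch of what it checks, assuming it builds on the dtype property added to the IndexValue classes in the next file:

# Sketch only: the real helper lives in maxframe/tests/utils.py, whose diff is
# not expanded here. Assuming it compares the IndexValue's dtype, which both
# the generic Index and the legacy Int64Index/Float64Index classes now expose:
import numpy as np

def assert_mf_index_dtype(index_value, dtype):
    assert index_value.dtype == np.dtype(dtype)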
 
maxframe/dataframe/core.py CHANGED
@@ -142,6 +142,14 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
+        @property
+        def inferred_type(self):
+            return "floating" if self.dtype.kind == "f" else "integer"
+
     class RangeIndex(IndexBase):
         _name = AnyField("name")
         _slice = SliceField("slice")
@@ -243,6 +251,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "integer"
@@ -254,6 +266,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "integer"
@@ -265,6 +281,10 @@ class IndexValue(Serializable):
         _data = NDArrayField("data")
         _dtype = DataTypeField("dtype")
 
+        @property
+        def dtype(self):
+            return getattr(self, "_dtype", None)
+
         @property
         def inferred_type(self):
             return "floating"
@@ -1514,8 +1534,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
          refresh_index_value(self)
          refresh_dtypes(self)
 
-     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
-         dtypes = table_meta.pd_column_dtypes
+     def refresh_from_dtypes(self, dtypes: pd.Series) -> None:
          self._dtypes = dtypes
          self._columns_value = parse_index(dtypes.index, store_data=True)
          self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
@@ -1523,6 +1542,9 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
          new_shape[-1] = len(dtypes)
          self._shape = tuple(new_shape)
 
+     def refresh_from_table_meta(self, table_meta: DataFrameTableMeta) -> None:
+         self.refresh_from_dtypes(table_meta.pd_column_dtypes)
+
      @property
      def dtypes(self):
          dt = getattr(self, "_dtypes", None)
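
Adding dtype (and, on the generic Index, inferred_type) lets callers inspect index metadata without referring to the dtype-specific IndexValue classes, which matters under pandas 2.x, where pd.Int64Index and pd.Float64Index no longer exist. An illustrative check, not taken from the diff:

# Illustrative only: with the properties added above, index metadata can be
# inspected uniformly whether parse_index produced the generic Index or a
# legacy dtype-specific IndexValue class.
import numpy as np
import pandas as pd
from maxframe.dataframe.utils import parse_index

index_value = parse_index(pd.Index([], dtype=np.float64))
assert index_value.value.dtype == np.float64
assert index_value.value.inferred_type == "floating"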
maxframe/dataframe/datasource/read_odps_query.py CHANGED
@@ -37,6 +37,7 @@ from ...serialization.serializables import (
      SeriesField,
      StringField,
  )
+ from ...utils import is_empty
  from ..utils import parse_index
  from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
 
@@ -57,7 +58,7 @@ _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|
  _ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
 
  _SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
- _SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")
+ _SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
 
 
  @dataclasses.dataclass
@@ -180,23 +181,30 @@ def _parse_full_explain(explain_string: str) -> OdpsSchema:
 
      job_dag = jobs_sector.build_dag()
      indep_job_names = list(job_dag.iter_indep(reverse=True))
-     if len(indep_job_names) > 1:  # pragma: no cover
-         raise ValueError("Only one final job is allowed in SQL statement")
-
-     tasks_sector = jobs_sector.jobs[indep_job_names[0]]
-     task_dag = tasks_sector.build_dag()
-     indep_task_names = list(task_dag.iter_indep(reverse=True))
-     if len(indep_task_names) > 1:  # pragma: no cover
+     schema_signatures = dict()
+     for job_name in indep_job_names:
+         tasks_sector = jobs_sector.jobs[job_name]
+         task_dag = tasks_sector.build_dag()
+         indep_task_names = list(task_dag.iter_indep(reverse=True))
+         for task_name in indep_task_names:
+             task_sector = tasks_sector.tasks[task_name]
+             if not task_sector.schema:  # pragma: no cover
+                 raise ValueError("Cannot detect output schema")
+             if task_sector.output_target != "Screen":
+                 raise ValueError("The SQL statement should be an instant query")
+             sig_tuples = sorted(
+                 [
+                     (c.column_alias or c.column_name, c.column_type)
+                     for c in task_sector.schema
+                 ]
+             )
+             schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
+     if len(schema_signatures) != 1:
          raise ValueError("Only one final task is allowed in SQL statement")
-
-     task_sector = tasks_sector.tasks[indep_task_names[0]]
-     if not task_sector.schema:  # pragma: no cover
-         raise ValueError("Cannot detect output schema")
-     if task_sector.output_target != "Screen":
-         raise ValueError("The SQL statement should be an instant query")
+     schema = list(schema_signatures.values())[0]
      cols = [
          Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
-         for c in task_sector.schema
+         for c in schema
      ]
      return OdpsSchema(cols)
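
With conditional execution plans, an EXPLAIN can report several independent final jobs or tasks; the rewritten _parse_full_explain accepts them as long as every branch emits the same column set, deduplicating by a sorted (name, type) signature. A toy illustration of that dedup idea:

# Toy illustration of the signature dedup above: two conditional branches that
# emit the same columns (in any order) collapse to one signature, so the
# "only one final task" check passes.
branch_a = [("cs1", "string"), ("ci1", "bigint")]
branch_b = [("ci1", "bigint"), ("cs1", "string")]  # same schema, other order

signatures = {hash(tuple(sorted(b))) for b in (branch_a, branch_b)}
assert len(signatures) == 1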
 
@@ -209,7 +217,7 @@ def _parse_simple_explain(explain_string: str) -> OdpsSchema:
      fields_str = fields_match.group(1)
      cols = []
      for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
-         cols.append(Column(field, validate_data_type(type_name)))
+         cols.append(Column(field, validate_data_type(type_name.rstrip(","))))
      return OdpsSchema(cols)
 
 
@@ -243,7 +251,7 @@ class DataFrameReadODPSQuery(
          self.columns = columns
 
      def __call__(self, chunk_bytes=None, chunk_size=None):
-         if not self.index_columns:
+         if is_empty(self.index_columns):
              index_value = parse_index(pd.RangeIndex(0))
          elif len(self.index_columns) == 1:
              index_value = parse_index(
@@ -257,12 +265,18 @@
              )
              index_value = parse_index(idx)
 
-         columns_value = parse_index(self.dtypes.index, store_data=True)
+         if self.dtypes is not None:
+             columns_value = parse_index(self.dtypes.index, store_data=True)
+             shape = (np.nan, len(self.dtypes))
+         else:
+             columns_value = None
+             shape = (np.nan, np.nan)
+
          self.output_types = [OutputType.dataframe]
          return self.new_tileable(
              [],
              None,
-             shape=(len(self.dtypes), np.nan),
+             shape=shape,
              dtypes=self.dtypes,
              index_value=index_value,
              columns_value=columns_value,
@@ -278,6 +292,7 @@ def read_odps_query(
      string_as_binary: bool = None,
      sql_hints: Dict[str, str] = None,
      anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
+     skip_schema: bool = False,
      **kw,
  ):
      """
@@ -298,6 +313,10 @@
          User specified SQL hints.
      anonymous_col_prefix: str, optional
          Prefix for anonymous columns, '_anon_col_' by default.
+     skip_schema: bool, optional
+         Skip resolving output schema before execution. Once this is configured,
+         the output DataFrame cannot be inputs of other DataFrame operators
+         before execution.
 
      Returns
      -------
@@ -319,28 +338,39 @@ def read_odps_query(
 
      if odps_entry is None:
          raise ValueError("Missing odps_entry parameter")
-     inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
-     logger.debug("Explain instance ID: %s", inst.id)
-     explain_str = list(inst.get_task_results().values())[0]
 
-     odps_schema = _parse_explained_schema(explain_str)
-
-     new_columns = []
      col_renames = {}
-     for col in odps_schema.columns:
-         anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
-         if anon_match and col.name not in query:
-             new_name = anonymous_col_prefix + anon_match.group(1)
-             col_renames[col.name] = new_name
-             new_columns.append(Column(new_name, col.type))
-         else:
-             new_columns.append(col)
-
-     dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+     if not skip_schema:
+         inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+         logger.debug("Explain instance ID: %s", inst.id)
+         explain_str = list(inst.get_task_results().values())[0]
+
+         try:
+             odps_schema = _parse_explained_schema(explain_str)
+         except ValueError as ex:
+             exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
+             raise exc.with_traceback(ex.__traceback__) from None
+
+         new_columns = []
+         for col in odps_schema.columns:
+             anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+             if anon_match and col.name not in query:
+                 new_name = anonymous_col_prefix + anon_match.group(1)
+                 col_renames[col.name] = new_name
+                 new_columns.append(Column(new_name, col.type))
+             else:
+                 new_columns.append(col)
+
+         dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
+     else:
+         dtypes = None
 
      if not index_col:
          index_dtypes = None
      else:
+         if dtypes is None:
+             raise ValueError("Cannot configure index_col when skip_schema is True")
+
          if isinstance(index_col, str):
              index_col = [index_col]
          index_col_set = set(index_col)
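
The new skip_schema flag skips the EXPLAIN round trip entirely, at the price of an unknown schema before execution. A usage sketch, with made-up project and table names:

# Usage sketch for skip_schema; the project and table names are made up.
# Without EXPLAIN, dtypes and columns_value stay None, so the result cannot
# feed other DataFrame operators before execution, and combining it with
# index_col raises ValueError.
import maxframe.dataframe as md

df = md.read_odps_query(
    "SELECT col1, col2 FROM my_project.my_table WHERE col1 > 10",
    skip_schema=True,
)
assert df.dtypes is None
df.execute()  # schema is resolved once the query has actually run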
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -34,6 +34,7 @@ from ...serialization.serializables import (
      SeriesField,
      StringField,
  )
+ from ...utils import is_empty
  from ..core import DataFrame  # noqa: F401
  from ..utils import parse_index
  from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
@@ -76,7 +77,7 @@ class DataFrameReadODPSTable(
          self.columns = columns
 
      def __call__(self, shape, chunk_bytes=None, chunk_size=None):
-         if not self.index_columns:
+         if is_empty(self.index_columns):
              if np.isnan(shape[0]):
                  index_value = parse_index(pd.RangeIndex(0))
              else:
@@ -238,7 +239,8 @@ def read_odps_table(
          partitions = [partitions]
 
      append_partitions = append_partitions or any(
-         pt.name in (columns or ()) for pt in (table.table_schema.partitions or ())
+         pt.name in (columns if not is_empty(columns) else ())
+         for pt in (table.table_schema.partitions or ())
      )
      op = DataFrameReadODPSTable(
          table_name=table.full_table_name,
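
Both datasources now test column emptiness with is_empty from maxframe/utils.py (its diff is not expanded here) instead of bare truthiness, presumably because the check must also accept array-like column lists, for which bool() is ambiguous. A short illustration of the failure mode this sidesteps:

# Why plain truthiness is unsafe here: an array-like `columns` value makes
# `columns or ()` raise, while a length-based emptiness check does not.
# The exact semantics of is_empty are assumed, not shown in this listing.
import numpy as np

columns = np.array(["a", "b"])
try:
    _ = columns or ()
except ValueError:
    pass  # "truth value of an array with more than one element is ambiguous"
assert len(columns) > 0  # emptiness can still be decided safely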
maxframe/dataframe/datasource/tests/test_datasource.py CHANGED
@@ -13,6 +13,7 @@
  # limitations under the License.
 
  import os
+ import uuid
  from collections import OrderedDict
 
  import numpy as np
@@ -26,7 +27,14 @@ from ....core import OutputType
  from ....tests.utils import tn
  from ....utils import lazy_import
  from ... import read_odps_query, read_odps_table
- from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index, MultiIndex
+ from ...core import (
+     DatetimeIndex,
+     Float64Index,
+     Index,
+     IndexValue,
+     Int64Index,
+     MultiIndex,
+ )
  from ..dataframe import from_pandas as from_pandas_df
  from ..date_range import date_range
  from ..from_tensor import (
@@ -36,7 +44,12 @@ from ..from_tensor import (
  )
  from ..index import from_pandas as from_pandas_index
  from ..index import from_tileable
- from ..read_odps_query import ColumnSchema, _parse_simple_explain, _resolve_task_sector
+ from ..read_odps_query import (
+     ColumnSchema,
+     _parse_full_explain,
+     _parse_simple_explain,
+     _resolve_task_sector,
+ )
  from ..series import from_pandas as from_pandas_series
 
  ray = lazy_import("ray")
@@ -114,18 +127,22 @@ def test_from_tileable_index():
 
      for o in [df, df[0]]:
          index = o.index
-         assert isinstance(index, Int64Index)
+         assert isinstance(index, (Index, Int64Index))
          assert index.dtype == np.int64
          assert index.name == pd_df.index.name
-         assert isinstance(index.index_value.value, IndexValue.Int64Index)
+         assert isinstance(
+             index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
+         )
 
      t = mt.random.rand(10, chunk_size=6)
      index = from_tileable(t, name="new_name")
 
-     assert isinstance(index, Float64Index)
+     assert isinstance(index, (Index, Float64Index))
      assert index.dtype == np.float64
      assert index.name == "new_name"
-     assert isinstance(index.index_value.value, IndexValue.Float64Index)
+     assert isinstance(
+         index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
+     )
 
 
  def test_from_tensor():
@@ -327,7 +344,10 @@ def test_from_odps_query():
      odps_entry.write_table(test_table2, [["A", 10, 4.5]])
 
      with pytest.raises(ValueError) as err_info:
-         read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
+         read_odps_query(
+             f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+             f"AS SELECT * FROM {table1_name}"
+         )
      assert "instant query" in err_info.value.args[0]
 
      query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
@@ -343,6 +363,10 @@ def test_from_odps_query():
          ),
      )
 
+     df = read_odps_query(query1, skip_schema=True)
+     assert df.dtypes is None
+     assert df.columns_value is None
+
      df = read_odps_query(query1, index_col="col1")
      assert df.op.query == query1
      assert df.index_value.name == "col1"
@@ -442,3 +466,31 @@ def test_resolve_simple_explain():
      assert schema.columns[0].type == odps_types.string
      assert schema.columns[1].name == "createdate"
      assert schema.columns[1].type == odps_types.bigint
+
+
+ def test_resolve_conditional():
+     input_path = os.path.join(
+         os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
+     )
+     with open(input_path, "r") as f:
+         sector = f.read()
+
+     expected_col_types = {
+         "cs1": "string",
+         "cs2": "string",
+         "ci1": "bigint",
+         "cs3": "string",
+         "cs4": "string",
+         "cs5": "string",
+         "cs6": "string",
+         "cs7": "string",
+         "cs8": "string",
+         "ci2": "int",
+         "ci3": "bigint",
+         "cs9": "string",
+     }
+
+     schema = _parse_full_explain(sector)
+     for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
+         assert col.name == exp_nm
+         assert col.type == odps_types.validate_data_type(exp_tp)