maxframe 0.1.0b4__cp311-cp311-win32.whl → 1.0.0__cp311-cp311-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp311-win32.pyd +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp311-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp311-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp311-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import dataclasses
|
|
16
|
+
import logging
|
|
16
17
|
import re
|
|
17
18
|
from typing import Dict, List, Optional, Tuple, Union
|
|
18
19
|
|
|
@@ -22,12 +23,14 @@ from odps import ODPS
|
|
|
22
23
|
from odps.types import Column, OdpsSchema, validate_data_type
|
|
23
24
|
|
|
24
25
|
from ... import opcodes
|
|
26
|
+
from ...config import options
|
|
25
27
|
from ...core import OutputType
|
|
26
28
|
from ...core.graph import DAG
|
|
27
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
29
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
28
30
|
from ...serialization.serializables import (
|
|
29
31
|
AnyField,
|
|
30
32
|
BoolField,
|
|
33
|
+
DictField,
|
|
31
34
|
FieldTypes,
|
|
32
35
|
Int64Field,
|
|
33
36
|
ListField,
|
|
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
|
|
|
37
40
|
from ..utils import parse_index
|
|
38
41
|
from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
|
|
39
42
|
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
|
|
46
|
+
|
|
40
47
|
_EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
|
|
41
48
|
_EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
|
|
42
49
|
_EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
|
|
@@ -46,7 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
|
|
|
46
53
|
r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
|
|
47
54
|
re.MULTILINE,
|
|
48
55
|
)
|
|
49
|
-
_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([
|
|
56
|
+
_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
|
|
57
|
+
_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
|
|
58
|
+
|
|
59
|
+
_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
|
|
60
|
+
_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^ \.\)]+):([^ ]+)")
|
|
50
61
|
|
|
51
62
|
|
|
52
63
|
@dataclasses.dataclass
|
|
@@ -151,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
|
|
|
151
162
|
return TaskSector(job_name, task_name, out_target, schemas)
|
|
152
163
|
|
|
153
164
|
|
|
154
|
-
def
|
|
165
|
+
def _parse_full_explain(explain_string: str) -> OdpsSchema:
|
|
155
166
|
sectors = _split_explain_string(explain_string)
|
|
156
167
|
jobs_sector = tasks_sector = None
|
|
157
168
|
|
|
@@ -169,27 +180,53 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
|
|
|
169
180
|
|
|
170
181
|
job_dag = jobs_sector.build_dag()
|
|
171
182
|
indep_job_names = list(job_dag.iter_indep(reverse=True))
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
183
|
+
schema_signatures = dict()
|
|
184
|
+
for job_name in indep_job_names:
|
|
185
|
+
tasks_sector = jobs_sector.jobs[job_name]
|
|
186
|
+
task_dag = tasks_sector.build_dag()
|
|
187
|
+
indep_task_names = list(task_dag.iter_indep(reverse=True))
|
|
188
|
+
for task_name in indep_task_names:
|
|
189
|
+
task_sector = tasks_sector.tasks[task_name]
|
|
190
|
+
if not task_sector.schema: # pragma: no cover
|
|
191
|
+
raise ValueError("Cannot detect output schema")
|
|
192
|
+
if task_sector.output_target != "Screen":
|
|
193
|
+
raise ValueError("The SQL statement should be an instant query")
|
|
194
|
+
sig_tuples = sorted(
|
|
195
|
+
[
|
|
196
|
+
(c.column_alias or c.column_name, c.column_type)
|
|
197
|
+
for c in task_sector.schema
|
|
198
|
+
]
|
|
199
|
+
)
|
|
200
|
+
schema_signatures[hash(tuple(sig_tuples))] = task_sector.schema
|
|
201
|
+
if len(schema_signatures) != 1:
|
|
179
202
|
raise ValueError("Only one final task is allowed in SQL statement")
|
|
180
|
-
|
|
181
|
-
task_sector = tasks_sector.tasks[indep_task_names[0]]
|
|
182
|
-
if not task_sector.schema: # pragma: no cover
|
|
183
|
-
raise ValueError("Cannot detect output schema")
|
|
184
|
-
if task_sector.output_target != "Screen":
|
|
185
|
-
raise ValueError("The SQL statement should be an instant query")
|
|
203
|
+
schema = list(schema_signatures.values())[0]
|
|
186
204
|
cols = [
|
|
187
205
|
Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
|
|
188
|
-
for c in
|
|
206
|
+
for c in schema
|
|
189
207
|
]
|
|
190
208
|
return OdpsSchema(cols)
|
|
191
209
|
|
|
192
210
|
|
|
211
|
+
def _parse_simple_explain(explain_string: str) -> OdpsSchema:
    """Build an ODPS schema from the compact form of an EXPLAIN result.

    The column list is taken from the ``SELECT <name>:<type>, ... FROM``
    fragment located by ``_SIMPLE_SCHEMA_COLS_REGEX``; every ``name:type``
    pair inside that fragment becomes one column, in the order it appears.

    Parameters
    ----------
    explain_string: str
        Raw text returned by the EXPLAIN statement.

    Returns
    -------
    OdpsSchema
        Schema holding one column per ``name:type`` pair found.

    Raises
    ------
    ValueError
        If no ``SELECT ... FROM`` column list can be located in the text.
    """
    matched = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
    if matched is None:
        raise ValueError("Cannot detect output table schema")

    # The type capture runs up to the next space, so it may still carry the
    # comma that separates it from the following column; strip it before
    # handing the name to the type validator.
    columns = [
        Column(col_name, validate_data_type(col_type.rstrip(",")))
        for col_name, col_type in _SIMPLE_SCHEMA_COL_REGEX.findall(matched.group(1))
    ]
    return OdpsSchema(columns)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _parse_explained_schema(explain_string: str) -> OdpsSchema:
    """Resolve the output schema of a query from its EXPLAIN text.

    Output that begins with ``AdhocSink`` is handed to the simple parser;
    anything else goes through the full job/task based parser.

    Parameters
    ----------
    explain_string: str
        Raw text returned by the EXPLAIN statement.

    Returns
    -------
    OdpsSchema
        Schema produced by whichever parser handled the text.
    """
    parser = (
        _parse_simple_explain
        if explain_string.startswith("AdhocSink")
        else _parse_full_explain
    )
    return parser(explain_string)
|
|
228
|
+
|
|
229
|
+
|
|
193
230
|
class DataFrameReadODPSQuery(
|
|
194
231
|
IncrementalIndexDatasource,
|
|
195
232
|
ColumnPruneSupportedDataSourceMixin,
|
|
@@ -204,6 +241,7 @@ class DataFrameReadODPSQuery(
|
|
|
204
241
|
string_as_binary = BoolField("string_as_binary", default=None)
|
|
205
242
|
index_columns = ListField("index_columns", FieldTypes.string, default=None)
|
|
206
243
|
index_dtypes = SeriesField("index_dtypes", default=None)
|
|
244
|
+
column_renames = DictField("column_renames", default=None)
|
|
207
245
|
|
|
208
246
|
def get_columns(self):
|
|
209
247
|
return self.columns
|
|
@@ -216,7 +254,9 @@ class DataFrameReadODPSQuery(
|
|
|
216
254
|
index_value = parse_index(pd.RangeIndex(0))
|
|
217
255
|
elif len(self.index_columns) == 1:
|
|
218
256
|
index_value = parse_index(
|
|
219
|
-
pd.Index([], name=self.index_columns[0]).astype(
|
|
257
|
+
pd.Index([], name=self.index_columns[0]).astype(
|
|
258
|
+
self.index_dtypes.iloc[0]
|
|
259
|
+
)
|
|
220
260
|
)
|
|
221
261
|
else:
|
|
222
262
|
idx = pd.MultiIndex.from_frame(
|
|
@@ -224,12 +264,18 @@ class DataFrameReadODPSQuery(
|
|
|
224
264
|
)
|
|
225
265
|
index_value = parse_index(idx)
|
|
226
266
|
|
|
227
|
-
|
|
267
|
+
if self.dtypes is not None:
|
|
268
|
+
columns_value = parse_index(self.dtypes.index, store_data=True)
|
|
269
|
+
shape = (np.nan, len(self.dtypes))
|
|
270
|
+
else:
|
|
271
|
+
columns_value = None
|
|
272
|
+
shape = (np.nan, np.nan)
|
|
273
|
+
|
|
228
274
|
self.output_types = [OutputType.dataframe]
|
|
229
275
|
return self.new_tileable(
|
|
230
276
|
[],
|
|
231
277
|
None,
|
|
232
|
-
shape=
|
|
278
|
+
shape=shape,
|
|
233
279
|
dtypes=self.dtypes,
|
|
234
280
|
index_value=index_value,
|
|
235
281
|
columns_value=columns_value,
|
|
@@ -243,6 +289,9 @@ def read_odps_query(
|
|
|
243
289
|
odps_entry: ODPS = None,
|
|
244
290
|
index_col: Union[None, str, List[str]] = None,
|
|
245
291
|
string_as_binary: bool = None,
|
|
292
|
+
sql_hints: Dict[str, str] = None,
|
|
293
|
+
anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
|
|
294
|
+
skip_schema: bool = False,
|
|
246
295
|
**kw,
|
|
247
296
|
):
|
|
248
297
|
"""
|
|
@@ -257,24 +306,70 @@ def read_odps_query(
|
|
|
257
306
|
MaxCompute SQL statement.
|
|
258
307
|
index_col: Union[None, str, List[str]]
|
|
259
308
|
Columns to be specified as indexes.
|
|
309
|
+
string_as_binary: bool, optional
|
|
310
|
+
Whether to convert string columns to binary.
|
|
311
|
+
sql_hints: Dict[str, str], optional
|
|
312
|
+
User specified SQL hints.
|
|
313
|
+
anonymous_col_prefix: str, optional
|
|
314
|
+
Prefix for anonymous columns, '_anon_col_' by default.
|
|
315
|
+
skip_schema: bool, optional
|
|
316
|
+
Skip resolving output schema before execution. Once this is configured,
|
|
317
|
+
the output DataFrame cannot be inputs of other DataFrame operators
|
|
318
|
+
before execution.
|
|
260
319
|
|
|
261
320
|
Returns
|
|
262
321
|
-------
|
|
263
322
|
result: DataFrame
|
|
264
323
|
DataFrame read from MaxCompute (ODPS) table
|
|
265
324
|
"""
|
|
325
|
+
hints = options.sql.settings.copy() or {}
|
|
326
|
+
if sql_hints:
|
|
327
|
+
hints.update(sql_hints)
|
|
328
|
+
|
|
266
329
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
330
|
+
|
|
331
|
+
if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
|
|
332
|
+
hints["odps.namespace.schema"] = "true"
|
|
333
|
+
hints["odps.sql.allow.namespace.schema"] = "true"
|
|
334
|
+
|
|
335
|
+
# fixme workaround for multi-stage split process
|
|
336
|
+
hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
|
|
337
|
+
|
|
267
338
|
if odps_entry is None:
|
|
268
339
|
raise ValueError("Missing odps_entry parameter")
|
|
269
|
-
inst = odps_entry.execute_sql(f"EXPLAIN {query}")
|
|
270
|
-
explain_str = list(inst.get_task_results().values())[0]
|
|
271
340
|
|
|
272
|
-
|
|
273
|
-
|
|
341
|
+
col_renames = {}
|
|
342
|
+
if not skip_schema:
|
|
343
|
+
inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
|
|
344
|
+
logger.debug("Explain instance ID: %s", inst.id)
|
|
345
|
+
explain_str = list(inst.get_task_results().values())[0]
|
|
346
|
+
|
|
347
|
+
try:
|
|
348
|
+
odps_schema = _parse_explained_schema(explain_str)
|
|
349
|
+
except ValueError as ex:
|
|
350
|
+
exc = ValueError(str(ex) + "\nExplain instance ID: " + inst.id)
|
|
351
|
+
raise exc.with_traceback(ex.__traceback__) from None
|
|
352
|
+
|
|
353
|
+
new_columns = []
|
|
354
|
+
for col in odps_schema.columns:
|
|
355
|
+
anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
|
|
356
|
+
if anon_match and col.name not in query:
|
|
357
|
+
new_name = anonymous_col_prefix + anon_match.group(1)
|
|
358
|
+
col_renames[col.name] = new_name
|
|
359
|
+
new_columns.append(Column(new_name, col.type))
|
|
360
|
+
else:
|
|
361
|
+
new_columns.append(col)
|
|
362
|
+
|
|
363
|
+
dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
|
|
364
|
+
else:
|
|
365
|
+
dtypes = None
|
|
274
366
|
|
|
275
367
|
if not index_col:
|
|
276
368
|
index_dtypes = None
|
|
277
369
|
else:
|
|
370
|
+
if dtypes is None:
|
|
371
|
+
raise ValueError("Cannot configure index_col when skip_schema is True")
|
|
372
|
+
|
|
278
373
|
if isinstance(index_col, str):
|
|
279
374
|
index_col = [index_col]
|
|
280
375
|
index_col_set = set(index_col)
|
|
@@ -293,5 +388,6 @@ def read_odps_query(
|
|
|
293
388
|
string_as_binary=string_as_binary,
|
|
294
389
|
index_columns=index_col,
|
|
295
390
|
index_dtypes=index_dtypes,
|
|
391
|
+
column_renames=col_renames,
|
|
296
392
|
)
|
|
297
393
|
return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
|
|
@@ -22,8 +22,9 @@ from odps.models import Table
|
|
|
22
22
|
from odps.utils import to_timestamp
|
|
23
23
|
|
|
24
24
|
from ... import opcodes
|
|
25
|
+
from ...config import options
|
|
25
26
|
from ...core import OutputType
|
|
26
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
27
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
27
28
|
from ...serialization.serializables import (
|
|
28
29
|
AnyField,
|
|
29
30
|
BoolField,
|
|
@@ -82,7 +83,9 @@ class DataFrameReadODPSTable(
|
|
|
82
83
|
index_value = parse_index(pd.RangeIndex(shape[0]))
|
|
83
84
|
elif len(self.index_columns) == 1:
|
|
84
85
|
index_value = parse_index(
|
|
85
|
-
pd.Index([], name=self.index_columns[0]).astype(
|
|
86
|
+
pd.Index([], name=self.index_columns[0]).astype(
|
|
87
|
+
self.index_dtypes.iloc[0]
|
|
88
|
+
)
|
|
86
89
|
)
|
|
87
90
|
else:
|
|
88
91
|
idx = pd.MultiIndex.from_frame(
|
|
@@ -117,9 +120,10 @@ class DataFrameReadODPSTable(
|
|
|
117
120
|
return self.new_tileable(
|
|
118
121
|
[],
|
|
119
122
|
None,
|
|
120
|
-
shape=shape,
|
|
123
|
+
shape=shape[:1],
|
|
121
124
|
name=getattr(index_value, "name", None),
|
|
122
125
|
names=getattr(index_value, "names", None),
|
|
126
|
+
dtype=self.index_dtypes.iloc[0],
|
|
123
127
|
index_value=index_value,
|
|
124
128
|
chunk_bytes=chunk_bytes,
|
|
125
129
|
chunk_size=chunk_size,
|
|
@@ -164,12 +168,13 @@ def read_odps_table(
|
|
|
164
168
|
DataFrame read from MaxCompute (ODPS) table
|
|
165
169
|
"""
|
|
166
170
|
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
171
|
+
schema = options.session.default_schema or odps_entry.schema
|
|
167
172
|
if odps_entry is None:
|
|
168
173
|
raise ValueError("Missing odps_entry parameter")
|
|
169
174
|
if isinstance(table_name, Table):
|
|
170
175
|
table = table_name
|
|
171
176
|
else:
|
|
172
|
-
table = odps_entry.get_table(table_name)
|
|
177
|
+
table = odps_entry.get_table(table_name, schema=schema)
|
|
173
178
|
|
|
174
179
|
if not table.table_schema.partitions and (
|
|
175
180
|
partitions is not None or append_partitions
|
|
@@ -13,18 +13,28 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import os
|
|
16
|
+
import uuid
|
|
16
17
|
from collections import OrderedDict
|
|
17
18
|
|
|
18
19
|
import numpy as np
|
|
19
20
|
import pandas as pd
|
|
20
21
|
import pytest
|
|
21
22
|
from odps import ODPS
|
|
23
|
+
from odps import types as odps_types
|
|
22
24
|
|
|
23
25
|
from .... import tensor as mt
|
|
26
|
+
from ....core import OutputType
|
|
24
27
|
from ....tests.utils import tn
|
|
25
28
|
from ....utils import lazy_import
|
|
26
29
|
from ... import read_odps_query, read_odps_table
|
|
27
|
-
from ...core import
|
|
30
|
+
from ...core import (
|
|
31
|
+
DatetimeIndex,
|
|
32
|
+
Float64Index,
|
|
33
|
+
Index,
|
|
34
|
+
IndexValue,
|
|
35
|
+
Int64Index,
|
|
36
|
+
MultiIndex,
|
|
37
|
+
)
|
|
28
38
|
from ..dataframe import from_pandas as from_pandas_df
|
|
29
39
|
from ..date_range import date_range
|
|
30
40
|
from ..from_tensor import (
|
|
@@ -34,7 +44,12 @@ from ..from_tensor import (
|
|
|
34
44
|
)
|
|
35
45
|
from ..index import from_pandas as from_pandas_index
|
|
36
46
|
from ..index import from_tileable
|
|
37
|
-
from ..read_odps_query import
|
|
47
|
+
from ..read_odps_query import (
|
|
48
|
+
ColumnSchema,
|
|
49
|
+
_parse_full_explain,
|
|
50
|
+
_parse_simple_explain,
|
|
51
|
+
_resolve_task_sector,
|
|
52
|
+
)
|
|
38
53
|
from ..series import from_pandas as from_pandas_series
|
|
39
54
|
|
|
40
55
|
ray = lazy_import("ray")
|
|
@@ -112,18 +127,22 @@ def test_from_tileable_index():
|
|
|
112
127
|
|
|
113
128
|
for o in [df, df[0]]:
|
|
114
129
|
index = o.index
|
|
115
|
-
assert isinstance(index, Int64Index)
|
|
130
|
+
assert isinstance(index, (Index, Int64Index))
|
|
116
131
|
assert index.dtype == np.int64
|
|
117
132
|
assert index.name == pd_df.index.name
|
|
118
|
-
assert isinstance(
|
|
133
|
+
assert isinstance(
|
|
134
|
+
index.index_value.value, (IndexValue.Int64Index, IndexValue.Index)
|
|
135
|
+
)
|
|
119
136
|
|
|
120
137
|
t = mt.random.rand(10, chunk_size=6)
|
|
121
138
|
index = from_tileable(t, name="new_name")
|
|
122
139
|
|
|
123
|
-
assert isinstance(index, Float64Index)
|
|
140
|
+
assert isinstance(index, (Index, Float64Index))
|
|
124
141
|
assert index.dtype == np.float64
|
|
125
142
|
assert index.name == "new_name"
|
|
126
|
-
assert isinstance(
|
|
143
|
+
assert isinstance(
|
|
144
|
+
index.index_value.value, (IndexValue.Float64Index, IndexValue.Index)
|
|
145
|
+
)
|
|
127
146
|
|
|
128
147
|
|
|
129
148
|
def test_from_tensor():
|
|
@@ -295,6 +314,15 @@ def test_from_odps_table():
|
|
|
295
314
|
),
|
|
296
315
|
)
|
|
297
316
|
|
|
317
|
+
out_idx = read_odps_table(
|
|
318
|
+
test_table,
|
|
319
|
+
columns=[],
|
|
320
|
+
index_col=["col1", "col2"],
|
|
321
|
+
output_type=OutputType.index,
|
|
322
|
+
)
|
|
323
|
+
assert out_idx.names == ["col1", "col2"]
|
|
324
|
+
assert out_idx.shape == (np.nan,)
|
|
325
|
+
|
|
298
326
|
test_table.drop()
|
|
299
327
|
test_parted_table.drop()
|
|
300
328
|
|
|
@@ -316,7 +344,10 @@ def test_from_odps_query():
|
|
|
316
344
|
odps_entry.write_table(test_table2, [["A", 10, 4.5]])
|
|
317
345
|
|
|
318
346
|
with pytest.raises(ValueError) as err_info:
|
|
319
|
-
read_odps_query(
|
|
347
|
+
read_odps_query(
|
|
348
|
+
f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
|
|
349
|
+
f"AS SELECT * FROM {table1_name}"
|
|
350
|
+
)
|
|
320
351
|
assert "instant query" in err_info.value.args[0]
|
|
321
352
|
|
|
322
353
|
query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
|
|
@@ -332,6 +363,10 @@ def test_from_odps_query():
|
|
|
332
363
|
),
|
|
333
364
|
)
|
|
334
365
|
|
|
366
|
+
df = read_odps_query(query1, skip_schema=True)
|
|
367
|
+
assert df.dtypes is None
|
|
368
|
+
assert df.columns_value is None
|
|
369
|
+
|
|
335
370
|
df = read_odps_query(query1, index_col="col1")
|
|
336
371
|
assert df.op.query == query1
|
|
337
372
|
assert df.index_value.name == "col1"
|
|
@@ -387,7 +422,9 @@ def test_date_range():
|
|
|
387
422
|
|
|
388
423
|
|
|
389
424
|
def test_resolve_task_sector():
|
|
390
|
-
input_path = os.path.join(
|
|
425
|
+
input_path = os.path.join(
|
|
426
|
+
os.path.dirname(__file__), "test-data", "task-input-full.txt"
|
|
427
|
+
)
|
|
391
428
|
with open(input_path, "r") as f:
|
|
392
429
|
sector = f.read()
|
|
393
430
|
actual_sector = _resolve_task_sector("job0", sector)
|
|
@@ -399,3 +436,61 @@ def test_resolve_task_sector():
|
|
|
399
436
|
assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
|
|
400
437
|
assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
|
|
401
438
|
assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def test_resolve_task_odps2():
    """Check task-sector resolution on the ``task-input-odps2.txt`` sample.

    The sample is expected to resolve to task ``M1`` writing to ``Screen``
    with a ``varchar(2048)`` column and a ``binary`` column.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "test-data")
    with open(os.path.join(data_dir, "task-input-odps2.txt"), "r") as src:
        explain_text = src.read()

    resolved = _resolve_task_sector("job0", explain_text)

    assert resolved.job_name == "job0"
    assert resolved.task_name == "M1"
    assert resolved.output_target == "Screen"

    expected_columns = [
        ColumnSchema("key", "varchar(2048)", ""),
        ColumnSchema("data", "binary", ""),
    ]
    assert len(resolved.schema) == 2
    for position, expected in enumerate(expected_columns):
        assert resolved.schema[position] == expected
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def test_resolve_simple_explain():
    """Check the simple EXPLAIN parser on the ``task-input-simple.txt`` sample.

    Only the first two parsed columns are inspected: ``memberid`` (string)
    followed by ``createdate`` (bigint).
    """
    sample_path = os.path.join(
        os.path.dirname(__file__), "test-data", "task-input-simple.txt"
    )
    with open(sample_path, "r") as src:
        explain_text = src.read()

    parsed = _parse_simple_explain(explain_text)

    expected_head = (
        ("memberid", odps_types.string),
        ("createdate", odps_types.bigint),
    )
    for position, (name, odps_type) in enumerate(expected_head):
        column = parsed.columns[position]
        assert column.name == name
        assert column.type == odps_type
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def test_resolve_conditional():
    """Check the full EXPLAIN parser on the ``task-input-multi-cond.txt`` sample.

    Parsed columns are compared, in order, with the expected name/type pairs.
    """
    sample_path = os.path.join(
        os.path.dirname(__file__), "test-data", "task-input-multi-cond.txt"
    )
    with open(sample_path, "r") as src:
        explain_text = src.read()

    # (column name, ODPS type name) in the order the parser should emit them.
    expected_columns = [
        ("cs1", "string"),
        ("cs2", "string"),
        ("ci1", "bigint"),
        ("cs3", "string"),
        ("cs4", "string"),
        ("cs5", "string"),
        ("cs6", "string"),
        ("cs7", "string"),
        ("cs8", "string"),
        ("ci2", "int"),
        ("ci3", "bigint"),
        ("cs9", "string"),
    ]

    parsed = _parse_full_explain(explain_text)
    # NOTE(review): zip() stops at the shorter sequence, so a parser that
    # returned too few columns would still pass; consider asserting the
    # column count once the fixture's size is confirmed.
    for column, (name, type_name) in zip(parsed.columns, expected_columns):
        assert column.name == name
        assert column.type == odps_types.validate_data_type(type_name)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
|
|
17
|
+
from ... import DataFrame
|
|
18
|
+
from ..to_odps import to_odps_table
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture
def df():
    """Provide a small two-column frame (``A`` and ``B``) for the tests below."""
    column_data = {"A": [1, 2], "B": [3, 4]}
    return DataFrame(column_data)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.mark.parametrize(
    "kwargs",
    [
        # "C" is not a column of the fixture frame
        {"partition_col": ["A", "C"]},
        {"partition_col": "C"},
        # presumably rejected because key "a" matches data column "A"
        # case-insensitively -- confirm against to_odps_table
        {"partition": "a=1,C=2"},
    ],
)
def test_to_odps_table_validation(df, kwargs):
    """Each partition argument here must make ``to_odps_table`` raise ValueError."""
    target_table = "test_table"
    with pytest.raises(ValueError):
        to_odps_table(df, target_table, **kwargs)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize(
    "kwargs",
    [
        # column names given in a different case than the frame's "A"/"B"
        {"partition_col": ["a", "B"]},
        {"partition_col": "a"},
        # static partition keys that are not columns of the frame
        {"partition": "C=1,d=2"},
    ],
)
def test_to_odps_table_vaild(df, kwargs):
    """Each partition argument here must be accepted without raising.

    NOTE(review): the function name misspells "valid"; it is left unchanged
    so existing test selections keep working.
    """
    target_table = "test_table"
    to_odps_table(df, target_table, **kwargs)
|
|
@@ -17,11 +17,14 @@
|
|
|
17
17
|
import logging
|
|
18
18
|
from typing import List, Optional, Union
|
|
19
19
|
|
|
20
|
+
from odps import ODPS
|
|
20
21
|
from odps.models import Table as ODPSTable
|
|
22
|
+
from odps.types import PartitionSpec
|
|
21
23
|
|
|
22
24
|
from ... import opcodes
|
|
23
25
|
from ...config import options
|
|
24
26
|
from ...core import OutputType
|
|
27
|
+
from ...io.odpsio import build_dataframe_table_meta
|
|
25
28
|
from ...serialization.serializables import (
|
|
26
29
|
BoolField,
|
|
27
30
|
FieldTypes,
|
|
@@ -134,8 +137,14 @@ def to_odps_table(
|
|
|
134
137
|
--------
|
|
135
138
|
|
|
136
139
|
"""
|
|
140
|
+
odps_entry = ODPS.from_global() or ODPS.from_environments()
|
|
137
141
|
if isinstance(table, ODPSTable):
|
|
138
142
|
table = table.full_table_name
|
|
143
|
+
elif options.session.enable_schema and "." not in table:
|
|
144
|
+
default_schema = (
|
|
145
|
+
options.session.default_schema or odps_entry.schema or "default"
|
|
146
|
+
)
|
|
147
|
+
table = default_schema + "." + table
|
|
139
148
|
|
|
140
149
|
if isinstance(index_label, str):
|
|
141
150
|
index_label = [index_label]
|
|
@@ -147,6 +156,25 @@ def to_odps_table(
|
|
|
147
156
|
f"index_label needs {len(df.index.nlevels)} labels "
|
|
148
157
|
f"but it only have {len(index_label)}"
|
|
149
158
|
)
|
|
159
|
+
table_cols = set(build_dataframe_table_meta(df).table_column_names)
|
|
160
|
+
if partition:
|
|
161
|
+
partition_intersect = (
|
|
162
|
+
set(x.lower() for x in PartitionSpec(partition).keys()) & table_cols
|
|
163
|
+
)
|
|
164
|
+
if partition_intersect:
|
|
165
|
+
raise ValueError(
|
|
166
|
+
f"Data column(s) {partition_intersect} in the dataframe"
|
|
167
|
+
" cannot be used in parameter 'partition'."
|
|
168
|
+
" Use 'partition_col' instead."
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
if partition_col:
|
|
172
|
+
partition_diff = set(x.lower() for x in partition_col) - table_cols
|
|
173
|
+
if partition_diff:
|
|
174
|
+
raise ValueError(
|
|
175
|
+
f"Partition column(s) {partition_diff}"
|
|
176
|
+
" is not the data column(s) of the input dataframe."
|
|
177
|
+
)
|
|
150
178
|
|
|
151
179
|
op = DataFrameToODPSTable(
|
|
152
180
|
dtypes=df.dtypes,
|
|
@@ -18,6 +18,8 @@ from .accessor import (
|
|
|
18
18
|
IndexMaxFrameAccessor,
|
|
19
19
|
SeriesMaxFrameAccessor,
|
|
20
20
|
)
|
|
21
|
+
from .flatjson import series_flatjson
|
|
22
|
+
from .flatmap import df_flatmap, series_flatmap
|
|
21
23
|
from .reshuffle import DataFrameReshuffle, df_reshuffle
|
|
22
24
|
|
|
23
25
|
|
|
@@ -25,6 +27,9 @@ def _install():
|
|
|
25
27
|
from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
|
|
26
28
|
|
|
27
29
|
DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
|
|
30
|
+
DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
|
|
31
|
+
SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
|
|
32
|
+
SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
|
|
28
33
|
|
|
29
34
|
if DataFrameMaxFrameAccessor._api_count:
|
|
30
35
|
for t in DATAFRAME_TYPE:
|