maxframe 0.1.0b3-cp37-cp37m-win32.whl → 0.1.0b5-cp37-cp37m-win32.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Note: this version of maxframe has been flagged as a potentially problematic release.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/codegen.py +46 -1
- maxframe/config/config.py +14 -1
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/dataframe/__init__.py +6 -0
- maxframe/dataframe/core.py +34 -10
- maxframe/dataframe/datasource/read_odps_query.py +6 -2
- maxframe/dataframe/datasource/read_odps_table.py +5 -1
- maxframe/dataframe/datastore/core.py +19 -0
- maxframe/dataframe/datastore/to_csv.py +2 -2
- maxframe/dataframe/datastore/to_odps.py +2 -2
- maxframe/dataframe/indexing/reset_index.py +1 -17
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +1 -1
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +61 -0
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +2 -1
- maxframe/dataframe/utils.py +7 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +86 -0
- maxframe/learn/contrib/xgboost/core.py +156 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
- maxframe/learn/contrib/xgboost/predict.py +138 -0
- maxframe/learn/contrib/xgboost/regressor.py +78 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +121 -0
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/odpsio/arrow.py +10 -6
- maxframe/odpsio/schema.py +18 -5
- maxframe/odpsio/tableio.py +22 -0
- maxframe/odpsio/tests/test_schema.py +41 -11
- maxframe/opcodes.py +8 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pyi +61 -0
- maxframe/session.py +32 -2
- maxframe/tensor/__init__.py +1 -1
- maxframe/tensor/base/__init__.py +2 -0
- maxframe/tensor/base/atleast_1d.py +74 -0
- maxframe/tensor/base/unique.py +205 -0
- maxframe/tensor/datasource/array.py +4 -2
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/udf.py +63 -3
- maxframe/utils.py +11 -0
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/METADATA +2 -2
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/RECORD +58 -40
- maxframe_client/fetcher.py +65 -3
- maxframe_client/session/odps.py +41 -11
- maxframe_client/session/task.py +26 -53
- maxframe_client/tests/test_session.py +49 -1
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b3.dist-info → maxframe-0.1.0b5.dist-info}/top_level.txt +0 -0
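
Among the additions listed above are two new user-facing DataFrame APIs (pivot_table and case_when under maxframe/dataframe/misc), a maxframe.learn.contrib.xgboost package, and new tensor routines (unique, atleast_1d). As a quick orientation, the sketch below drives the new pivot_table API the same way the test added in maxframe_client/tests/test_session.py does; it assumes a MaxFrame session has already been created (for example via maxframe.session.new_session) so that execute() can run remotely.

# Minimal pivot_table sketch mirroring the new test_pivot_dataframe test;
# a MaxFrame session is assumed to exist already.
import pandas as pd
import maxframe.dataframe as md

pd_df = pd.DataFrame(
    {
        "A": "foo foo foo foo foo bar bar bar bar".split(),
        "B": "one one one two two one one two two".split(),
        "C": "small large large small small large small small large".split(),
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    }
)
df = md.DataFrame(pd_df)
pivot = df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc="sum")
result = pivot.execute().to_pandas()  # matches pd_df.pivot_table(...) with the same arguments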
maxframe_client/fetcher.py
CHANGED
@@ -12,16 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import base64
+import json
 from abc import ABC, abstractmethod
 from numbers import Integral
-from typing import Any, Dict, List, Type, Union
+from typing import Any, Dict, List, Optional, Type, Union

+import pandas as pd
 import pyarrow as pa
 from odps import ODPS
 from odps.models import ExternalVolume, PartedVolume
+from odps.tunnel import TableTunnel
 from tornado import httpclient

 from maxframe.core import OBJECT_TYPE
+from maxframe.dataframe.core import DATAFRAME_TYPE
 from maxframe.lib import wrapped_pickle as pickle
 from maxframe.odpsio import HaloTableIO, arrow_to_pandas, build_dataframe_table_meta
 from maxframe.protocol import (
@@ -31,8 +36,9 @@ from maxframe.protocol import (
     ResultInfo,
     ResultType,
 )
+from maxframe.tensor.core import TENSOR_TYPE
 from maxframe.typing_ import PandasObjectTypes, TileableType
-from maxframe.utils import ToThreadMixin
+from maxframe.utils import ToThreadMixin, deserialize_serializable

 _result_fetchers: Dict[ResultType, Type["ResultFetcher"]] = dict()

@@ -52,6 +58,14 @@ class ResultFetcher(ABC):
     def __init__(self, odps_entry: ODPS):
         self._odps_entry = odps_entry

+    @abstractmethod
+    async def update_tileable_meta(
+        self,
+        tileable: TileableType,
+        info: ResultInfo,
+    ) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     async def fetch(
         self,
@@ -66,6 +80,13 @@ class ResultFetcher(ABC):
 class NullFetcher(ResultFetcher):
     result_type = ResultType.NULL

+    async def update_tileable_meta(
+        self,
+        tileable: TileableType,
+        info: ResultInfo,
+    ) -> None:
+        return
+
     async def fetch(
         self,
         tileable: TileableType,
@@ -79,6 +100,40 @@ class NullFetcher(ResultFetcher):
 class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
     result_type = ResultType.ODPS_TABLE

+    def _get_table_comment(self, table_name: str) -> Optional[str]:
+        table = self._odps_entry.get_table(table_name)
+        return getattr(table, "comment", None)
+
+    async def update_tileable_meta(
+        self,
+        tileable: TileableType,
+        info: ODPSTableResultInfo,
+    ) -> None:
+        if isinstance(tileable, DATAFRAME_TYPE) and tileable.dtypes is None:
+            tb_comment = await self.to_thread(
+                self._get_table_comment, info.full_table_name
+            )
+            if tb_comment:  # pragma: no branch
+                comment_data = json.loads(tb_comment)
+
+                table_meta: DataFrameTableMeta = deserialize_serializable(
+                    base64.b64decode(comment_data["table_meta"])
+                )
+                tileable.refresh_from_table_meta(table_meta)
+
+        if tileable.shape and any(pd.isna(x) for x in tileable.shape):
+            part_specs = [None] if not info.partition_specs else info.partition_specs
+            tunnel = TableTunnel(self._odps_entry)
+            total_records = 0
+            for part_spec in part_specs:
+                session = tunnel.create_download_session(
+                    info.full_table_name, part_spec
+                )
+                total_records += session.count
+            new_shape_list = list(tileable.shape)
+            new_shape_list[-1] = total_records
+            tileable.params = {"shape": tuple(new_shape_list)}
+
     def _read_single_source(
         self,
         table_meta: DataFrameTableMeta,
@@ -149,6 +204,13 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
 class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
     result_type = ResultType.ODPS_VOLUME

+    async def update_tileable_meta(
+        self,
+        tileable: TileableType,
+        info: ODPSVolumeResultInfo,
+    ) -> None:
+        return
+
     async def _read_parted_volume_data(
         self, volume: PartedVolume, partition: str, file_name: str
     ) -> bytes:
@@ -197,6 +259,6 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
         info: ODPSVolumeResultInfo,
         indexes: List[Union[Integral, slice]],
     ) -> Any:
-        if isinstance(tileable, OBJECT_TYPE):
+        if isinstance(tileable, (OBJECT_TYPE, TENSOR_TYPE)):
             return await self._fetch_object(info)
         raise NotImplementedError(f"Fetching {type(tileable)} not implemented")
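
The new update_tileable_meta hook above back-fills DataFrame metadata that is only known once the service has produced a result table: dtypes are recovered from a base64-encoded, serialized DataFrameTableMeta stored in the table comment, and the row count is summed over tunnel download sessions for the result partitions. Below is a condensed sketch of that flow with the fetcher plumbing stripped away; the helper name and its arguments are illustrative, not part of the package API.

# Condensed sketch of the two refresh steps in ODPSTableFetcher.update_tileable_meta.
import base64
import json

from odps import ODPS
from odps.tunnel import TableTunnel

from maxframe.utils import deserialize_serializable


def read_result_meta(odps_entry: ODPS, table_name: str, partition_specs=None):
    # Step 1: column names/dtypes come from the JSON comment the service writes on the table.
    comment = getattr(odps_entry.get_table(table_name), "comment", None)
    table_meta = deserialize_serializable(
        base64.b64decode(json.loads(comment)["table_meta"])
    )
    # Step 2: the row count is the sum of tunnel download-session record counts,
    # one session per result partition (a single session for unpartitioned tables).
    tunnel = TableTunnel(odps_entry)
    total_records = sum(
        tunnel.create_download_session(table_name, part).count
        for part in (partition_specs or [None])
    )
    return table_meta, total_records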
maxframe_client/session/odps.py
CHANGED
@@ -84,6 +84,9 @@ class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
     def decref(self, tileable_keys: List[str]) -> None:
         raise NotImplementedError

+    def get_logview_address(self, dag_id=None, hours=None) -> Optional[str]:
+        return None
+

 class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
     _odps_entry: Optional[ODPS]
@@ -115,7 +118,7 @@
     ):
         super().__init__(address, session_id)
         self.timeout = timeout
-        self._odps_entry = odps_entry or ODPS.from_environments()
+        self._odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
         self._tileable_to_infos = weakref.WeakKeyDictionary()

         self._caller = self._create_caller(odps_entry, address, **kwargs)
@@ -129,6 +132,7 @@
     async def _init(self, _address: str):
         session_info = await self.ensure_async_call(self._caller.create_session)
         self._session_id = session_info.session_id
+        await self._show_logview_address()

     def _upload_and_get_read_tileable(self, t: TileableType) -> Optional[TileableType]:
         if (
@@ -142,20 +146,23 @@
         if self._odps_entry.exist_table(table_meta.table_name):
             self._odps_entry.delete_table(table_meta.table_name)
         table_name = build_temp_table_name(self.session_id, t.key)
-        table_obj = self._odps_entry.create_table(
+        table_obj = self._odps_entry.create_table(
+            table_name, schema, lifecycle=options.session.temp_table_lifecycle
+        )

         data = t.op.get_data()
         batch_size = options.session.upload_batch_size

-
-
-
-
-
-
-
-
-
+        if len(data):
+            halo_client = HaloTableIO(self._odps_entry)
+            with halo_client.open_writer(table_obj.full_table_name) as writer:
+                for batch_start in range(0, len(data), batch_size):
+                    if isinstance(data, pd.Index):
+                        batch = data[batch_start : batch_start + batch_size]
+                    else:
+                        batch = data.iloc[batch_start : batch_start + batch_size]
+                    arrow_batch, _ = pandas_to_arrow(batch)
+                    writer.write(arrow_batch)

         read_tileable = read_odps_table(
             table_obj.full_table_name,
@@ -238,6 +245,8 @@
             self._caller.submit_dag, tileable_graph, replaced_infos
         )

+        await self._show_logview_address(dag_info.dag_id)
+
         progress = Progress()
         profiling = Profiling()
         aio_task = asyncio.create_task(
@@ -293,6 +302,8 @@

         for key, result_info in dag_info.tileable_to_result_infos.items():
             t = key_to_tileables[key]
+            fetcher = get_fetcher_cls(result_info.result_type)(self._odps_entry)
+            await fetcher.update_tileable_meta(t, result_info)
             self._tileable_to_infos[t] = result_info

     def _get_data_tileable_and_indexes(
@@ -387,6 +398,25 @@
     async def get_mutable_tensor(self, name: str):
         raise NotImplementedError

+    async def get_logview_address(self, hours=None) -> Optional[str]:
+        return await self.get_dag_logview_address(None, hours)
+
+    async def get_dag_logview_address(self, dag_id=None, hours=None) -> Optional[str]:
+        return await self.ensure_async_call(
+            self._caller.get_logview_address, dag_id, hours
+        )
+
+    async def _show_logview_address(self, dag_id=None, hours=None):
+        identity = f"Session ID: {self._session_id}"
+        if dag_id:
+            identity += f", DAG ID: {dag_id}"
+
+        logview_addr = await self.get_dag_logview_address(dag_id, hours)
+        if logview_addr:
+            logger.info("%s, Logview: %s", identity, logview_addr)
+        else:
+            logger.info("%s, Logview address does not exist", identity)
+

 class MaxFrameRestCaller(MaxFrameServiceCaller):
     _client: FrameDriverClient
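
Besides the batched upload path, the session now resolves a Logview URL through the caller and logs it at INFO level both when the session is created and when a DAG is submitted. Surfacing it therefore only requires INFO-level logging to be enabled; a minimal way to do that is shown below, with the quoted log format taken from _show_logview_address above.

# Enable INFO logging before creating the session; messages then include lines such as
# "Session ID: <id>, DAG ID: <dag id>, Logview: http://..." once a DAG is submitted,
# or a note that no Logview address exists for callers that do not provide one.
import logging

logging.basicConfig(level=logging.INFO)

The asynchronous get_logview_address / get_dag_logview_address coroutines on MaxFrameSession expose the same URL programmatically.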
maxframe_client/session/task.py
CHANGED
@@ -21,9 +21,8 @@ from typing import Dict, List, Optional, Type, Union
 import msgpack
 from odps import ODPS
 from odps import options as odps_options
-from odps import serializers
 from odps.errors import parse_instance_error
-from odps.models import Instance,
+from odps.models import Instance, MaxFrameTask

 from maxframe.config import options
 from maxframe.core import TileableGraph
@@ -55,55 +54,6 @@ from .odps import MaxFrameServiceCaller, MaxFrameSession
 logger = logging.getLogger(__name__)


-class MaxFrameTask(Task):
-    __slots__ = ("_output_format", "_major_version", "_service_endpoint")
-    _root = "MaxFrame"
-    _anonymous_task_name = "AnonymousMaxFrameTask"
-
-    command = serializers.XMLNodeField("Command", default="CREATE_SESSION")
-
-    def __init__(self, **kwargs):
-        kwargs["name"] = kwargs.get("name") or self._anonymous_task_name
-        self._output_format = kwargs.pop(
-            "output_format", MAXFRAME_OUTPUT_MSGPACK_FORMAT
-        )
-        self._major_version = kwargs.pop("major_version", None)
-        self._service_endpoint = kwargs.pop("service_endpoint", None)
-        super().__init__(**kwargs)
-
-    def serial(self):
-        if self.properties is None:
-            self.properties = dict()
-
-        if odps_options.default_task_settings:
-            settings = odps_options.default_task_settings
-        else:
-            settings = dict()
-
-        if self._major_version is not None:
-            settings["odps.task.major.version"] = self._major_version
-
-        if "settings" in self.properties:
-            settings.update(json.loads(self.properties["settings"]))
-
-        # merge sql options
-        sql_settings = (odps_options.sql.settings or {}).copy()
-        sql_settings.update(options.sql.settings or {})
-
-        mf_settings = dict(options.to_dict(remote_only=True).items())
-        mf_settings["sql.settings"] = sql_settings
-        mf_opts = {
-            "odps.maxframe.settings": json.dumps(mf_settings),
-            "odps.maxframe.output_format": self._output_format,
-            "odps.service.endpoint": self._service_endpoint,
-        }
-        if mf_version:
-            mf_opts["odps.maxframe.client_version"] = mf_version
-        settings.update(mf_opts)
-        self.properties["settings"] = json.dumps(settings)
-        return super().serial()
-
-
 class MaxFrameInstanceCaller(MaxFrameServiceCaller):
     _instance: Optional[Instance]

@@ -159,13 +109,31 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
             f"Serialization format {self._output_format} not supported"
         )

-    def
+    def _create_maxframe_task(self) -> MaxFrameTask:
         task = MaxFrameTask(
             name=self._task_name,
             major_version=self._major_version,
-            output_format=self._output_format,
             service_endpoint=self._odps_entry.endpoint,
         )
+
+        # merge sql options
+        sql_settings = (odps_options.sql.settings or {}).copy()
+        sql_settings.update(options.sql.settings or {})
+
+        mf_settings = dict(options.to_dict(remote_only=True).items())
+        mf_settings["sql.settings"] = sql_settings
+
+        mf_opts = {
+            "odps.maxframe.settings": json.dumps(mf_settings),
+            "odps.maxframe.output_format": self._output_format,
+        }
+        if mf_version:
+            mf_opts["odps.maxframe.client_version"] = mf_version
+        task.update_settings(mf_opts)
+        return task
+
+    def create_session(self) -> SessionInfo:
+        task = self._create_maxframe_task()
         if not self._nested:
             self._task_name = task.name
         project = self._odps_entry.get_project(self._project)
@@ -278,6 +246,11 @@
             self._task_name, MAXFRAME_TASK_DECREF_METHOD, json.dumps(req_data)
         )

+    def get_logview_address(self, dag_id=None, hours=None) -> Optional[str]:
+        hours = hours or options.session.logview_hours
+        subquery_suffix = f"&subQuery={dag_id}" if dag_id else ""
+        return self._instance.get_logview_address(hours) + subquery_suffix
+

 class MaxFrameTaskSession(MaxFrameSession):
     schemes = [ODPS_SESSION_INSECURE_SCHEME, ODPS_SESSION_SECURE_SCHEME]
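
With MaxFrameTask now imported from odps.models, the caller no longer serializes task settings itself: _create_maxframe_task assembles a flat settings dictionary and hands it to task.update_settings, and get_logview_address simply appends a &subQuery=<dag_id> suffix to the instance's Logview URL. The sketch below shows the shape of that settings payload; only the keys come from the code above, every value here is a placeholder.

# Illustrative shape of the settings dict handed to task.update_settings();
# keys taken from _create_maxframe_task above, values are placeholders only.
import json

sql_settings = {"odps.sql.type.system.odps2": "true"}  # merged ODPS/MaxFrame SQL settings (example)
mf_settings = {"sql.settings": sql_settings}           # plus remote-only maxframe options

task_settings = {
    "odps.maxframe.settings": json.dumps(mf_settings),  # nested JSON payload
    "odps.maxframe.output_format": "<output format>",   # the configured msgpack output format
    "odps.maxframe.client_version": "0.1.0b5",          # only added when the client version is known
}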
maxframe_client/tests/test_session.py
CHANGED
@@ -28,6 +28,7 @@ from maxframe.lib.aio import stop_isolation
 from maxframe.protocol import ResultInfo
 from maxframe.serialization import RemoteException
 from maxframe.session import new_session
+from maxframe.tests.utils import tn
 from maxframe.utils import build_temp_table_name
 from maxframe_framedriver.app.tests.test_framedriver_webapp import (  # noqa: F401
     framedriver_app,
@@ -98,9 +99,12 @@ def test_simple_run_dataframe(start_mock_session):
     corner_top, corner_bottom = ExecutableTuple([df.iloc[:10], df.iloc[-10:]]).fetch()
     assert len(corner_top) == len(corner_bottom) == 10

-    # check ellipsis mark in DataFrame
+    # check ellipsis mark in DataFrame reprs
     df_str_repr = str(df)
     assert ".." in df_str_repr
+    # check ellipsis mark in Series reprs
+    series_str_repr = str(df.A.execute())
+    assert ".." in series_str_repr

     key = df.key
     assert odps_entry.exist_table(
@@ -115,6 +119,26 @@ def test_simple_run_dataframe(start_mock_session):
     assert not odps_entry.exist_table(build_temp_table_name(start_mock_session, key))


+def test_run_empty_table(start_mock_session):
+    odps_entry = ODPS.from_environments()
+
+    table_name = tn("test_session_empty_table")
+    odps_entry.delete_table(table_name, if_exists=True)
+    empty_table = odps_entry.create_table(
+        table_name, "_idx_0 bigint, a double, b double", lifecycle=1
+    )
+    df = md.read_odps_table(table_name, index_col="_idx_0")
+    df["d"] = df["a"] + df["b"]
+
+    executed = df.execute()
+    assert "Index: []" in str(executed)
+
+    fetched = executed.fetch()
+    assert 0 == len(fetched)
+
+    empty_table.drop()
+
+
 def test_run_dataframe_with_pd_source(start_mock_session):
     odps_entry = ODPS.from_environments()

@@ -205,3 +229,27 @@ def test_run_remote_error(start_mock_session):

     with pytest.raises((ValueError, RemoteException)):
         v.execute()
+
+
+def test_pivot_dataframe(start_mock_session):
+    pd_df = pd.DataFrame(
+        {
+            "A": "foo foo foo foo foo bar bar bar bar".split(),
+            "B": "one one one two two one one two two".split(),
+            "C": "small large large small small large small small large".split(),
+            "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+            "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+        }
+    )
+    df = md.DataFrame(pd_df)
+    pivot = df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc="sum")
+    executed = pivot.execute()
+    assert pivot.shape == (2, 4)
+    pd.testing.assert_index_equal(
+        pivot.dtypes.index, pd.Index(["large", "small"], name="C")
+    )
+
+    expected = pd_df.pivot_table(
+        values="D", index=["A", "B"], columns=["C"], aggfunc="sum"
+    )
+    pd.testing.assert_frame_equal(executed.to_pandas(), expected)
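
The new test_run_empty_table case exercises the metadata-refresh path end to end: even with zero rows, dtypes come back through the table-comment metadata and the shape collapses to zero via the tunnel record counts, so the repr shows an empty index and fetch() returns an empty frame. A standalone variant outside the pytest fixtures could look like the sketch below; the table name is illustrative, a MaxFrame session is assumed to have been created already (for example via maxframe.session.new_session), and ODPS credentials are taken from the environment.

# Standalone sketch of the empty-table round trip (table name is illustrative).
import maxframe.dataframe as md
from odps import ODPS

o = ODPS.from_environments()
o.delete_table("mf_demo_empty_table", if_exists=True)
o.create_table("mf_demo_empty_table", "_idx_0 bigint, a double, b double", lifecycle=1)

df = md.read_odps_table("mf_demo_empty_table", index_col="_idx_0")
df["d"] = df["a"] + df["b"]

executed = df.execute()            # repr contains "Index: []" for the empty result
assert len(executed.fetch()) == 0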