fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue_dask/execution_engine.py
CHANGED
@@ -3,6 +3,7 @@ import os
 from typing import Any, Callable, Dict, List, Optional, Type, Union
 
 import dask.dataframe as dd
+import pandas as pd
 from distributed import Client
 from qpd_dask import run_sql_on_dask
 from triad.collections import Schema
@@ -18,7 +19,7 @@ from fugue.collections.partition import (
     PartitionSpec,
     parse_presort_exp,
 )
-from fugue.constants import
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from fugue.dataframe import (
     AnyDataFrame,
     DataFrame,
@@ -34,6 +35,8 @@ from fugue_dask._io import load_df, save_df
 from fugue_dask._utils import DASK_UTILS, DaskUtils
 from fugue_dask.dataframe import DaskDataFrame
 
+_DASK_PARTITION_KEY = "__dask_partition_key__"
+
 
 class QPDDaskEngine(SQLEngine):
     """QPD execution implementation."""
@@ -74,7 +77,8 @@ class DaskMapEngine(MapEngine):
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
         map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
-
+        is_coarse = partition_spec.algo == "coarse"
+        presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
         presort_keys = list(presort.keys())
         presort_asc = list(presort.values())
         output_schema = Schema(output_schema)
@@ -88,10 +92,12 @@ class DaskMapEngine(MapEngine):
             )
         )
 
-        def _map(pdf: Any) ->
+        def _map(pdf: Any) -> pd.DataFrame:
             if pdf.shape[0] == 0:
                 return PandasDataFrame([], output_schema).as_pandas()
-            if
+            if is_coarse:
+                pdf = pdf.drop(columns=[_DASK_PARTITION_KEY])
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
             input_df = PandasDataFrame(
                 pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
@@ -100,7 +106,7 @@ class DaskMapEngine(MapEngine):
             on_init_once(0, input_df)
             cursor.set(lambda: input_df.peek_array(), 0, 0)
             output_df = map_func(cursor, input_df)
-            return output_df.as_pandas()
+            return output_df.as_pandas()[output_schema.names]
 
         df = self.to_df(df)
         meta = self.execution_engine.pl_utils.safe_to_pandas_dtype(  # type: ignore
@@ -113,8 +119,28 @@ class DaskMapEngine(MapEngine):
             df = self.execution_engine.repartition(
                 df, PartitionSpec(num=partition_spec.num_partitions)
             )
+        if is_coarse:
+            input_num_partitions = df.num_partitions
+            _utils = self.execution_engine.pl_utils  # type: ignore
+            input_meta = _utils.safe_to_pandas_dtype(
+                (input_schema + (_DASK_PARTITION_KEY, "uint64")).pa_schema
+            )
+            tddf = df.native.map_partitions(
+                lambda pdf: pdf.assign(
+                    **{
+                        _DASK_PARTITION_KEY: pd.util.hash_pandas_object(
+                            pdf[partition_spec.partition_by], index=False
+                        ).mod(input_num_partitions)
+                    }
+                ),
+                meta=input_meta,
+            )
+            keys = [_DASK_PARTITION_KEY]
+        else:
+            tddf = df.native
+            keys = partition_spec.partition_by
         result = self.execution_engine.pl_utils.safe_groupby_apply(  # type: ignore
-
+            tddf, keys, _map, meta=meta  # type: ignore
         )
         return DaskDataFrame(result, output_schema)
 
@@ -214,7 +240,7 @@ class DaskExecutionEngine(ExecutionEngine):
         p = partition_spec.get_num_partitions(
             **{
                 KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
-
+                KEYWORD_PARALLELISM: lambda: self.get_current_parallelism(),
             }
         )
         if p > 0:
@@ -253,7 +279,7 @@ class DaskExecutionEngine(ExecutionEngine):
             join_type=how,
             on=key_schema.names,
         )
-        return DaskDataFrame(d, output_schema)
+        return DaskDataFrame(d, output_schema, type_safe=False)
 
     def union(
         self,
@@ -268,7 +294,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.union(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def subtract(
         self,
@@ -286,7 +312,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.except_df(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def intersect(
         self,
@@ -304,11 +330,11 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.intersect(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def distinct(self, df: DataFrame) -> DataFrame:
         d = self.pl_utils.drop_duplicates(self.to_df(df).native)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def dropna(
         self,
@@ -325,7 +351,7 @@ class DaskExecutionEngine(ExecutionEngine):
         if how == "any" and thresh is not None:
             del kw["how"]  # to deal with a dask logic flaw
         d = self.to_df(df).native.dropna(**kw)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFrame:
         assert_or_throw(
@@ -345,7 +371,7 @@ class DaskExecutionEngine(ExecutionEngine):
         subset = subset or df.columns
         mapping = {col: value for col in subset}
        d = self.to_df(df).native.fillna(mapping)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def sample(
         self,
@@ -363,7 +389,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.to_df(df).native.sample(
             n=n, frac=frac, replace=replace, random_state=seed
         )
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def take(
         self,
@@ -419,7 +445,7 @@ class DaskExecutionEngine(ExecutionEngine):
             .reset_index(drop=True)
         )
 
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def load_df(
         self,
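The biggest change in this file is the new "coarse" partitioning path in DaskMapEngine.map_dataframe: instead of grouping by the raw partition keys, every Dask partition hashes its key columns and takes the result modulo the input partition count, then groups by that bucket id, which caps the shuffle width regardless of key cardinality. A minimal standalone pandas sketch of the bucketing step (the frame and values are illustrative, not from the package):

    import pandas as pd

    df = pd.DataFrame({"k": ["a", "b", "a", "c"], "v": [1, 2, 3, 4]})
    num_buckets = 2  # plays the role of input_num_partitions above
    # hash the key columns row-wise, then map each row to one of the buckets
    bucket = pd.util.hash_pandas_object(df[["k"]], index=False).mod(num_buckets)
    print(df.assign(__dask_partition_key__=bucket))

Rows sharing a key always land in the same bucket, so _map still sees whole groups; the helper column is dropped inside _map before the user function runs.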
fugue_duckdb/_utils.py
CHANGED
@@ -27,7 +27,11 @@ _DUCK_TYPES_TO_PA: Dict[str, pa.DataType] = {
     "TIME": pa.time32("ms"),
 }
 
-_PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {
+_PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {
+    v: k
+    for k, v in list(_DUCK_TYPES_TO_PA.items())
+    + [("VARCHAR", pa.large_string()), ("BLOB", pa.large_binary())]
+}
 
 
 def encode_column_name(name: str) -> str:
@@ -94,8 +98,9 @@ def to_duck_type(tp: pa.DataType) -> str:
         raise ValueError(f"can't convert {tp} to DuckDB data type")
 
 
-def to_pa_type(
+def to_pa_type(duck_type_raw: Any) -> pa.DataType:
     try:
+        duck_type = str(duck_type_raw)  # for duckdb >= 0.8.0
         if duck_type.endswith("[]"):
             return pa.list_(to_pa_type(duck_type[:-2]))
         p = duck_type.find("(")
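The Arrow-to-DuckDB lookup is now derived from the DuckDB-to-Arrow map plus explicit entries for Arrow's large types, so tables whose columns are large_string/large_binary (e.g. Polars output) resolve to VARCHAR/BLOB without special casing; and to_pa_type stringifies its argument first because DuckDB 0.8+ reports relation types as type objects rather than plain strings. A small self-contained check of the same dict construction (only two of the mappings shown):

    import pyarrow as pa

    _DUCK_TYPES_TO_PA = {"VARCHAR": pa.string(), "BLOB": pa.binary()}  # excerpt
    _PA_TYPES_TO_DUCK = {
        v: k
        for k, v in list(_DUCK_TYPES_TO_PA.items())
        + [("VARCHAR", pa.large_string()), ("BLOB", pa.large_binary())]
    }
    assert _PA_TYPES_TO_DUCK[pa.string()] == "VARCHAR"
    assert _PA_TYPES_TO_DUCK[pa.large_string()] == "VARCHAR"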
fugue_duckdb/dask.py
CHANGED
@@ -50,7 +50,7 @@ class DuckDaskExecutionEngine(DuckExecutionEngine):
             res = DuckDataFrame(self.connection.from_df(ddf.as_pandas()))
         else:
             res = DuckDataFrame(
-                duckdb.
+                duckdb.from_arrow(ddf.as_arrow(), connection=self.connection)
             )
         if ddf.has_metadata:  # pragma: no cover
             res.reset_metadata(ddf.metadata)
fugue_duckdb/dataframe.py
CHANGED
@@ -4,14 +4,17 @@ import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
 from triad import Schema
+from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
 from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_arrow,
     as_fugue_dataset,
     as_local_bounded,
     get_column_names,
     get_num_partitions,
+    get_schema,
     is_df,
 )
 
@@ -26,15 +29,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
 
     def __init__(self, rel: DuckDBPyRelation):
         self._rel = rel
-        super().__init__(schema=self.
-
-    def _get_schema(self) -> Schema:
-        return Schema(
-            [
-                pa.field(x, to_pa_type(y))
-                for x, y in zip(self._rel.columns, self._rel.types)
-            ]
-        )
+        super().__init__(schema=lambda: _duck_get_schema(self._rel))
 
     @property
     def alias(self) -> str:
@@ -98,7 +93,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return DuckDataFrame(self._rel.project(", ".join(fields)))
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return self._rel
+        return _duck_as_arrow(self._rel)
 
     def as_pandas(self) -> pd.DataFrame:
         if any(pa.types.is_nested(f.type) for f in self.schema.fields):
@@ -169,6 +164,18 @@ def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation:
     return df
 
 
+@as_arrow.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+def _duck_as_arrow(df: DuckDBPyRelation) -> pa.Table:
+    _df = df.arrow()
+    _df = replace_types_in_table(_df, LARGE_TYPES_REPLACEMENT, recursive=True)
+    return _df
+
+
+@get_schema.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+def _duck_get_schema(df: DuckDBPyRelation) -> Schema:
+    return Schema([pa.field(x, to_pa_type(y)) for x, y in zip(df.columns, df.types)])
+
+
 @get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation))
 def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
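Because _duck_as_arrow and _duck_get_schema are registered as plugin candidates keyed on isinstance(df, DuckDBPyRelation), the conversions apply not only inside DuckDataFrame but also when a raw relation is handed to the fugue.api functions. A sketch of what that enables, assuming DuckDB 0.8+ (where duckdb.sql exists) and that the fugue_duckdb plugin has been loaded; the printed outputs are indicative:

    import duckdb
    import fugue.api as fa

    rel = duckdb.sql("SELECT 1 AS a, 'x' AS b")
    print(fa.get_schema(rel))  # a:int,b:str  (dispatches to _duck_get_schema)
    print(fa.as_arrow(rel))    # Arrow table with large types replaced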
fugue_duckdb/execution_engine.py
CHANGED
@@ -2,12 +2,11 @@ import logging
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 import duckdb
-import pyarrow as pa
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import SerializableRLock
 from triad.collections.fs import FileSystem
-from triad.utils.schema import quote_name
 from triad.utils.assertion import assert_or_throw
+from triad.utils.schema import quote_name
 
 from fugue import (
     ArrowDataFrame,
@@ -19,12 +18,7 @@ from fugue import (
 )
 from fugue.collections.partition import PartitionSpec, parse_presort_exp
 from fugue.collections.sql import StructuredRawSQL, TempTableName
-from fugue.dataframe import
-    DataFrame,
-    DataFrames,
-    LocalBoundedDataFrame,
-    PandasDataFrame,
-)
+from fugue.dataframe import DataFrame, DataFrames, LocalBoundedDataFrame
 from fugue.dataframe.utils import get_join_schemas
 
 from ._io import DuckDBIO
@@ -34,7 +28,7 @@ from ._utils import (
     encode_schema_names,
     encode_value_to_expr,
 )
-from .dataframe import DuckDataFrame
+from .dataframe import DuckDataFrame, _duck_as_arrow
 
 _FUGUE_DUCKDB_PRAGMA_CONFIG_PREFIX = "fugue.duckdb.pragma."
 _FUGUE_DUCKDB_EXTENSIONS = "fugue.duckdb.extensions"
@@ -114,8 +108,8 @@ class DuckDBEngine(SQLEngine):
         conn = duckdb.connect()
         try:
             for k, v in dfs.items():
-                duckdb.
-            return ArrowDataFrame(conn.execute(statement)
+                duckdb.from_arrow(v.as_arrow(), connection=conn).create_view(k)
+            return ArrowDataFrame(_duck_as_arrow(conn.execute(statement)))
         finally:
             conn.close()
 
@@ -235,7 +229,7 @@ class DuckExecutionEngine(ExecutionEngine):
         # TODO: we should create DuckDB table, but it has bugs, so can't use by 0.3.1
         if isinstance(df, DuckDataFrame):
             # materialize
-            res: DataFrame = ArrowDataFrame(df.
+            res: DataFrame = ArrowDataFrame(df.as_arrow())
         else:
             res = self.to_df(df)
         res.reset_metadata(df.metadata)
@@ -545,19 +539,15 @@ def _to_duck_df(
             )
             if isinstance(df, DuckDataFrame):
                 return df
-
-
-
-            ):
-                rdf = DuckDataFrame(engine.connection.from_df(df.as_pandas()))
-            else:
-                rdf = DuckDataFrame(
-                    duckdb.arrow(df.as_arrow(), connection=engine.connection)
-                )
+            rdf = DuckDataFrame(
+                duckdb.from_arrow(df.as_arrow(), connection=engine.connection)
+            )
             rdf.reset_metadata(df.metadata if df.has_metadata else None)
             return rdf
         tdf = ArrowDataFrame(df, schema)
-        return DuckDataFrame(
+        return DuckDataFrame(
+            duckdb.from_arrow(tdf.native, connection=engine.connection)
+        )
 
     res = _gen_duck()
     if create_view:
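This file consistently swaps the deprecated duckdb.arrow(...) entry point for duckdb.from_arrow(...), and _to_duck_df loses its pandas fallback branch: every non-DuckDB input is now ingested through Arrow. A hypothetical round trip with the new call:

    import duckdb
    import pyarrow as pa

    conn = duckdb.connect()
    tbl = pa.table({"a": [1, 2, 3]})
    rel = duckdb.from_arrow(tbl, connection=conn)  # relation bound to conn
    print(rel.filter("a > 1").fetchall())  # [(2,), (3,)]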
fugue_ibis/dataframe.py
CHANGED
@@ -5,12 +5,7 @@ import pandas as pd
 import pyarrow as pa
 from triad import Schema, assert_or_throw
 
-from fugue import
-    DataFrame,
-    IterableDataFrame,
-    LocalBoundedDataFrame,
-    to_local_bounded_df,
-)
+from fugue import DataFrame, IterableDataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import drop_columns, get_column_names, is_df, rename
@@ -153,7 +148,7 @@ class IbisDataFrame(DataFrame):
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return
+        return self._to_local_df(self._table.head(n)).as_local_bounded()
 
     def _alter_table_columns(self, table: IbisTable, new_schema: Schema) -> IbisTable:
         fields: Dict[str, Any] = {}
fugue_notebook/env.py
CHANGED
@@ -3,21 +3,16 @@ import html
 import json
 from typing import Any, Dict, List, Optional
 
-from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
 from IPython import get_ipython
+from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
 from IPython.display import HTML, display
 from triad import ParamDict
 from triad.utils.convert import to_instance
 from triad.utils.pyarrow import _field_to_expression
 
-import
-from fugue import
-
-    DataFrameDisplay,
-    ExecutionEngine,
-    get_dataset_display,
-    make_execution_engine,
-)
+from fugue import DataFrame, DataFrameDisplay, ExecutionEngine
+from fugue import fsql as fugue_sql
+from fugue import get_dataset_display, make_execution_engine
 from fugue.dataframe import YieldedDataFrame
 from fugue.exceptions import FugueSQLSyntaxError
 
@@ -58,7 +53,7 @@ class _FugueSQLMagics(Magics):
     @cell_magic("fsql")
     def fsql(self, line: str, cell: str, local_ns: Any = None) -> None:
         try:
-            dag = fugue_sql
+            dag = fugue_sql(
                 "\n" + cell, local_ns, fsql_ignore_case=self._fsql_ignore_case
             )
         except FugueSQLSyntaxError as ex:
fugue_polars/_utils.py
CHANGED
@@ -1,48 +1,8 @@
 import polars as pl
-import pyarrow as pa
 from triad import Schema
-from triad.utils.pyarrow import get_alter_func
 
 from fugue.dataframe.arrow_dataframe import _build_empty_arrow
 
 
-def pl_as_arrow(df: pl.DataFrame) -> pa.Table:
-    adf = df.to_arrow()
-    schema = convert_schema(adf.schema)
-    func = get_alter_func(adf.schema, schema, safe=False)
-    return func(adf)
-
-
-def to_schema(df: pl.DataFrame) -> Schema:
-    return Schema(convert_schema(pl.DataFrame(schema=df.schema).to_arrow().schema))
-
-
 def build_empty_pl(schema: Schema) -> pl.DataFrame:
     return pl.from_arrow(_build_empty_arrow(schema))
-
-
-def convert_schema(schema: pa.Schema) -> pa.Schema:
-    fields = [convert_field(f) for f in schema]
-    return pa.schema(fields)
-
-
-def convert_field(field: pa.Field) -> pa.Field:
-    tp = convert_type(field.type)
-    if tp == field.type:
-        return field
-    print(field.type, tp)
-    return pa.field(field.name, tp)
-
-
-def convert_type(tp: pa.DataType) -> pa.DataType:
-    if pa.types.is_struct(tp):
-        return pa.struct([convert_field(f) for f in tp])
-    if pa.types.is_list(tp) or pa.types.is_large_list(tp):
-        return pa.list_(convert_type(tp.value_type))
-    if pa.types.is_map(tp):  # pragma: no cover
-        return pa.map_(convert_type(tp.key_type), convert_type(tp.value_type))
-    if pa.types.is_large_string(tp):
-        return pa.string()
-    if pa.types.is_large_binary(tp):
-        return pa.binary()
-    return tp
fugue_polars/polars_dataframe.py
CHANGED
@@ -6,9 +6,15 @@ import pyarrow as pa
 from triad.collections.schema import Schema
 from triad.exceptions import InvalidOperationError
 from triad.utils.assertion import assert_or_throw
+from triad.utils.pyarrow import (
+    LARGE_TYPES_REPLACEMENT,
+    replace_types_in_schema,
+    replace_types_in_table,
+)
 
 from fugue import ArrowDataFrame
 from fugue.api import (
+    as_arrow,
     drop_columns,
     get_column_names,
     get_schema,
@@ -28,7 +34,7 @@ from fugue.dataset.api import (
 )
 from fugue.exceptions import FugueDataFrameOperationError
 
-from ._utils import build_empty_pl
+from ._utils import build_empty_pl
 
 
 class PolarsDataFrame(LocalBoundedDataFrame):
@@ -55,7 +61,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
             InvalidOperationError("can't reset schema for pl.DataFrame"),
         )
         self._native = df
-        super().__init__(
+        super().__init__(_get_pl_schema(df))
 
     @property
     def native(self) -> pl.DataFrame:
@@ -75,7 +81,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
 
     def peek_dict(self) -> Dict[str, Any]:
         self.assert_not_empty()
-        return
+        return self._native.row(0, named=True)
 
     def count(self) -> int:
         return self.native.shape[0]
@@ -107,7 +113,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
         return PolarsDataFrame(pl.from_arrow(adf.native))
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return
+        return _pl_as_arrow(self.native)
 
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
@@ -121,7 +127,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
         if not self.empty:
-            yield from ArrowDataFrame(
+            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
                 columns=columns
             )
 
@@ -129,7 +135,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
         if not self.empty:
-            yield from ArrowDataFrame(
+            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
                 columns=columns
             )
 
@@ -144,6 +150,13 @@ def _pl_as_local_bounded(df: pl.DataFrame) -> pl.DataFrame:
     return df
 
 
+@as_arrow.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_as_arrow(df: pl.DataFrame) -> pa.Table:
+    adf = df.to_arrow()
+    adf = replace_types_in_table(adf, LARGE_TYPES_REPLACEMENT)
+    return adf
+
+
 @is_df.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _pl_is_df(df: pl.DataFrame) -> bool:
     return True
@@ -181,7 +194,9 @@ def _get_pl_columns(df: pl.DataFrame) -> List[Any]:
 
 @get_schema.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _get_pl_schema(df: pl.DataFrame) -> Schema:
-
+    adf = df.to_arrow()
+    schema = replace_types_in_schema(adf.schema, LARGE_TYPES_REPLACEMENT)
+    return Schema(schema)
 
 
 @rename.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
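Polars' to_arrow() emits large_string/large_binary (and large_list) columns; the new _pl_as_arrow and _get_pl_schema delegate the downcast to triad's replace_types_in_table/replace_types_in_schema with LARGE_TYPES_REPLACEMENT, which is what made the hand-rolled convert_* helpers in fugue_polars/_utils.py removable. A minimal sketch, assuming a triad version that exports these names:

    import polars as pl
    from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table

    adf = pl.DataFrame({"a": ["x", "y"]}).to_arrow()
    print(adf.schema)  # a: large_string
    print(replace_types_in_table(adf, LARGE_TYPES_REPLACEMENT).schema)  # a: string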
fugue_ray/_constants.py
CHANGED
@@ -1,4 +1,6 @@
-from typing import
+from typing import Any, Dict
+
+import ray
 
 FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
 FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
@@ -10,3 +12,8 @@ FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
     FUGUE_RAY_DEFAULT_PARTITIONS: 0,
     FUGUE_RAY_ZERO_COPY: True,
 }
+
+if ray.__version__ >= "2.3":
+    _ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
+else:  # pragma: no cover
+    _ZERO_COPY = {}
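Ray 2.3 added the zero_copy_batch flag to Dataset.map_batches; computing _ZERO_COPY once lets every call site splat it unconditionally instead of branching on the Ray version. (Note the gate is a plain lexicographic string comparison, which holds for the 2.x releases this targets but would not order "2.10" after "2.3".) A usage sketch with an illustrative dataset:

    import ray
    import ray.data as rd

    _ZERO_COPY = {"zero_copy_batch": True} if ray.__version__ >= "2.3" else {}
    # callers never branch on the version themselves:
    ds = rd.range(4).map_batches(lambda t: t, batch_format="pyarrow", **_ZERO_COPY)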
fugue_ray/_utils/dataframe.py
CHANGED
@@ -1,11 +1,15 @@
 import pickle
-from typing import
+from typing import Any, Dict, List, Optional, Tuple
 
+import pandas as pd
 import pyarrow as pa
 import ray.data as rd
-from fugue.dataframe.arrow_dataframe import _build_empty_arrow
 from triad import Schema
 
+from fugue.dataframe.arrow_dataframe import _build_empty_arrow
+
+from .._constants import _ZERO_COPY
+
 _RAY_NULL_REPR = "__RAY_NULL__"
 
 
@@ -15,6 +19,8 @@ def get_dataset_format(df: rd.Dataset) -> Optional[str]:
         return None
     if hasattr(df, "_dataset_format"):  # pragma: no cover
         return df._dataset_format()  # ray<2.2
+    ctx = rd.context.DatasetContext.get_current()
+    ctx.use_streaming_executor = False
     return df.dataset_format()  # ray>=2.2
 
 
@@ -50,7 +56,7 @@ def add_partition_key(
     )
 
     return df.map_batches(
-        add_simple_key, batch_format="pyarrow", **ray_remote_args
+        add_simple_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
     ), input_schema + (
         output_key,
         str,
@@ -67,8 +73,29 @@ def add_partition_key(
         return fdf.append_column(output_key, sarr)
 
     return df.map_batches(
-        add_key, batch_format="pyarrow", **ray_remote_args
+        add_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
     ), input_schema + (
         output_key,
         pa.binary(),
     )
+
+
+def add_coarse_partition_key(
+    df: rd.Dataset,
+    keys: List[str],
+    output_key: str,
+    bucket: int,
+) -> rd.Dataset:
+    ray_remote_args: Dict[str, Any] = {"num_cpus": 1}
+
+    def add_coarse_key(arrow_df: pa.Table) -> pa.Table:  # pragma: no cover
+        hdf = arrow_df.select(keys).to_pandas()
+        _hash = pd.util.hash_pandas_object(hdf, index=False).mod(bucket)
+        return arrow_df.append_column(output_key, pa.Array.from_pandas(_hash))
+
+    return df.map_batches(
+        add_coarse_key,
+        batch_format="pyarrow",
+        **_ZERO_COPY,
+        **ray_remote_args,
+    )
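add_coarse_partition_key is the Ray counterpart of the Dask change above: the same hash-mod bucketing, but computed per Arrow batch and appended back as an Arrow column, after which the dataset can be repartitioned on that single narrow key. The per-batch step in isolation (illustrative data):

    import pandas as pd
    import pyarrow as pa

    batch = pa.table({"k": ["a", "b", "a"], "v": [1, 2, 3]})
    # hash the key columns via pandas, bucket by mod, append as a new column
    h = pd.util.hash_pandas_object(batch.select(["k"]).to_pandas(), index=False).mod(2)
    print(batch.append_column("__coarse_key__", pa.Array.from_pandas(h)))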
fugue_ray/_utils/io.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 import pyarrow as pa
 import ray.data as rd
 from fugue import ExecutionEngine
-from fugue._utils.io import FileParser,
+from fugue._utils.io import FileParser, save_df
 from fugue.collections.partition import PartitionSpec
 from fugue.dataframe import DataFrame
 from fugue_ray.dataframe import RayDataFrame
@@ -49,8 +49,6 @@ class RayIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        if fmt == "avro":  # TODO: remove avro support
-            return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
         files = [f.uri for f in fp]
         return self._loads[fmt](files, columns, **kwargs)
 
@@ -75,7 +73,7 @@ class RayIO(object):
         except Exception:  # pragma: no cover
             pass
         p = FileParser(uri, format_hint)
-        if not force_single
+        if not force_single:
             df = self._prepartition(df, partition_spec=partition_spec)
 
         self._saves[p.file_format](df=df, uri=p.uri, **kwargs)