fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- fugue/__init__.py +9 -5
- fugue/_utils/interfaceless.py +1 -558
- fugue/_utils/io.py +2 -91
- fugue/_utils/registry.py +3 -2
- fugue/api.py +1 -0
- fugue/bag/bag.py +8 -4
- fugue/collections/__init__.py +0 -7
- fugue/collections/partition.py +21 -9
- fugue/constants.py +3 -1
- fugue/dataframe/__init__.py +7 -8
- fugue/dataframe/arrow_dataframe.py +1 -2
- fugue/dataframe/dataframe.py +17 -18
- fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
- fugue/dataframe/function_wrapper.py +432 -0
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/utils.py +11 -79
- fugue/dataset/api.py +0 -4
- fugue/dev.py +47 -0
- fugue/execution/__init__.py +1 -5
- fugue/execution/api.py +36 -14
- fugue/execution/execution_engine.py +30 -4
- fugue/execution/factory.py +0 -6
- fugue/execution/native_execution_engine.py +44 -67
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +4 -3
- fugue/extensions/_builtins/processors.py +3 -3
- fugue/extensions/creator/convert.py +5 -2
- fugue/extensions/outputter/convert.py +2 -2
- fugue/extensions/processor/convert.py +3 -2
- fugue/extensions/transformer/convert.py +22 -9
- fugue/extensions/transformer/transformer.py +15 -1
- fugue/plugins.py +2 -0
- fugue/registry.py +0 -39
- fugue/sql/_utils.py +1 -1
- fugue/workflow/_checkpoint.py +1 -1
- fugue/workflow/api.py +13 -13
- fugue/workflow/module.py +30 -37
- fugue/workflow/workflow.py +6 -0
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
- fugue_contrib/contrib.py +1 -0
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/dataframe.py +1 -2
- fugue_dask/execution_engine.py +45 -18
- fugue_dask/registry.py +8 -33
- fugue_duckdb/_io.py +8 -2
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +23 -19
- fugue_duckdb/execution_engine.py +19 -22
- fugue_duckdb/registry.py +11 -34
- fugue_ibis/dataframe.py +6 -10
- fugue_ibis/execution_engine.py +7 -1
- fugue_notebook/env.py +5 -10
- fugue_polars/__init__.py +2 -0
- fugue_polars/_utils.py +8 -0
- fugue_polars/polars_dataframe.py +234 -0
- fugue_polars/registry.py +86 -0
- fugue_ray/_constants.py +10 -1
- fugue_ray/_utils/dataframe.py +36 -9
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +16 -12
- fugue_ray/execution_engine.py +53 -32
- fugue_ray/registry.py +8 -32
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +26 -22
- fugue_spark/execution_engine.py +136 -54
- fugue_spark/registry.py +29 -78
- fugue_test/builtin_suite.py +36 -14
- fugue_test/dataframe_suite.py +9 -5
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/bag/test_array_bag.py +0 -9
- tests/fugue/collections/test_partition.py +10 -3
- tests/fugue/dataframe/test_function_wrapper.py +293 -0
- tests/fugue/dataframe/test_utils.py +2 -34
- tests/fugue/execution/test_factory.py +7 -9
- tests/fugue/execution/test_naive_execution_engine.py +35 -80
- tests/fugue/extensions/test_utils.py +12 -7
- tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
- tests/fugue/sql/test_workflow.py +1 -1
- tests/fugue/sql/test_workflow_parse.py +3 -5
- tests/fugue/utils/test_interfaceless.py +1 -325
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +48 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_execution_engine.py +16 -1
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_ibis/test_dataframe.py +6 -3
- tests/fugue_polars/__init__.py +0 -0
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_dataframe.py +82 -0
- tests/fugue_polars/test_transform.py +100 -0
- tests/fugue_ray/test_execution_engine.py +40 -4
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +50 -11
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue/_utils/register.py +0 -3
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue_dask/execution_engine.py
CHANGED
@@ -3,6 +3,7 @@ import os
 from typing import Any, Callable, Dict, List, Optional, Type, Union
 
 import dask.dataframe as dd
+import pandas as pd
 from distributed import Client
 from qpd_dask import run_sql_on_dask
 from triad.collections import Schema
@@ -18,7 +19,7 @@ from fugue.collections.partition import (
     PartitionSpec,
     parse_presort_exp,
 )
-from fugue.constants import
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from fugue.dataframe import (
     AnyDataFrame,
     DataFrame,
@@ -34,6 +35,8 @@ from fugue_dask._io import load_df, save_df
 from fugue_dask._utils import DASK_UTILS, DaskUtils
 from fugue_dask.dataframe import DaskDataFrame
 
+_DASK_PARTITION_KEY = "__dask_partition_key__"
+
 
 class QPDDaskEngine(SQLEngine):
     """QPD execution implementation."""
@@ -72,12 +75,15 @@ class DaskMapEngine(MapEngine):
         output_schema: Any,
         partition_spec: PartitionSpec,
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
+        map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
-
+        is_coarse = partition_spec.algo == "coarse"
+        presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
         presort_keys = list(presort.keys())
         presort_asc = list(presort.values())
         output_schema = Schema(output_schema)
         input_schema = df.schema
+        cursor = partition_spec.get_cursor(input_schema, 0)
         on_init_once: Any = (
             None
             if on_init is None
@@ -86,20 +92,21 @@ class DaskMapEngine(MapEngine):
             )
         )
 
-        def _map(pdf: Any) ->
+        def _map(pdf: Any) -> pd.DataFrame:
             if pdf.shape[0] == 0:
                 return PandasDataFrame([], output_schema).as_pandas()
-            if
+            if is_coarse:
+                pdf = pdf.drop(columns=[_DASK_PARTITION_KEY])
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
             input_df = PandasDataFrame(
                 pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
             )
             if on_init_once is not None:
                 on_init_once(0, input_df)
-            cursor
-            cursor.set(input_df.peek_array(), 0, 0)
+            cursor.set(lambda: input_df.peek_array(), 0, 0)
             output_df = map_func(cursor, input_df)
-            return output_df.as_pandas()
+            return output_df.as_pandas()[output_schema.names]
 
         df = self.to_df(df)
         meta = self.execution_engine.pl_utils.safe_to_pandas_dtype(  # type: ignore
@@ -112,8 +119,28 @@ class DaskMapEngine(MapEngine):
             df = self.execution_engine.repartition(
                 df, PartitionSpec(num=partition_spec.num_partitions)
             )
+        if is_coarse:
+            input_num_partitions = df.num_partitions
+            _utils = self.execution_engine.pl_utils  # type: ignore
+            input_meta = _utils.safe_to_pandas_dtype(
+                (input_schema + (_DASK_PARTITION_KEY, "uint64")).pa_schema
+            )
+            tddf = df.native.map_partitions(
+                lambda pdf: pdf.assign(
+                    **{
+                        _DASK_PARTITION_KEY: pd.util.hash_pandas_object(
+                            pdf[partition_spec.partition_by], index=False
+                        ).mod(input_num_partitions)
+                    }
+                ),
+                meta=input_meta,
+            )
+            keys = [_DASK_PARTITION_KEY]
+        else:
+            tddf = df.native
+            keys = partition_spec.partition_by
         result = self.execution_engine.pl_utils.safe_groupby_apply(  # type: ignore
-
+            tddf, keys, _map, meta=meta  # type: ignore
         )
         return DaskDataFrame(result, output_schema)
 
@@ -213,7 +240,7 @@ class DaskExecutionEngine(ExecutionEngine):
         p = partition_spec.get_num_partitions(
             **{
                 KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
-
+                KEYWORD_PARALLELISM: lambda: self.get_current_parallelism(),
             }
         )
         if p > 0:
@@ -252,7 +279,7 @@ class DaskExecutionEngine(ExecutionEngine):
             join_type=how,
             on=key_schema.names,
         )
-        return DaskDataFrame(d, output_schema)
+        return DaskDataFrame(d, output_schema, type_safe=False)
 
     def union(
         self,
@@ -267,7 +294,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.union(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def subtract(
         self,
@@ -285,7 +312,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.except_df(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def intersect(
         self,
@@ -303,11 +330,11 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.intersect(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def distinct(self, df: DataFrame) -> DataFrame:
         d = self.pl_utils.drop_duplicates(self.to_df(df).native)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def dropna(
         self,
@@ -324,7 +351,7 @@ class DaskExecutionEngine(ExecutionEngine):
         if how == "any" and thresh is not None:
             del kw["how"]  # to deal with a dask logic flaw
         d = self.to_df(df).native.dropna(**kw)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFrame:
         assert_or_throw(
@@ -344,7 +371,7 @@ class DaskExecutionEngine(ExecutionEngine):
         subset = subset or df.columns
         mapping = {col: value for col in subset}
         d = self.to_df(df).native.fillna(mapping)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def sample(
         self,
@@ -362,7 +389,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.to_df(df).native.sample(
             n=n, frac=frac, replace=replace, random_state=seed
         )
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def take(
         self,
@@ -418,7 +445,7 @@ class DaskExecutionEngine(ExecutionEngine):
             .reset_index(drop=True)
         )
 
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def load_df(
         self,
fugue_dask/registry.py
CHANGED
@@ -1,16 +1,15 @@
-import
-from typing import Any, Optional
+from typing import Any
 
 import dask.dataframe as dd
 from dask.distributed import Client
 from triad import run_at_def
 
-from fugue import DataFrame,
-from fugue.
+from fugue import DataFrame, register_execution_engine
+from fugue.dev import (
     DataFrameParam,
     ExecutionEngineParam,
-
-
+    fugue_annotated_param,
+    is_pandas_or,
 )
 from fugue.plugins import as_fugue_dataset, infer_execution_engine
 from fugue_dask._utils import DASK_UTILS
@@ -45,36 +44,13 @@ def _register_engines() -> None:
     )
 
 
-
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            DaskExecutionEngine,
-            lambda param: _DaskExecutionEngineParam(param),
-        ),
-    )
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            dd.DataFrame, lambda param: _DaskDataFrameParam(param)
-        ),
-    )
-
-
+@fugue_annotated_param(DaskExecutionEngine)
 class _DaskExecutionEngineParam(ExecutionEngineParam):
-
-        self,
-        param: Optional[inspect.Parameter],
-    ):
-        super().__init__(
-            param, annotation="DaskExecutionEngine", engine_type=DaskExecutionEngine
-        )
+    pass
 
 
+@fugue_annotated_param(dd.DataFrame)
 class _DaskDataFrameParam(DataFrameParam):
-    def __init__(self, param: Optional[inspect.Parameter]):
-        super().__init__(param, annotation="dask.dataframe.DataFrame")
-
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
         assert isinstance(ctx, DaskExecutionEngine)
         return ctx.to_df(df).native
@@ -99,4 +75,3 @@ def _register() -> None:
     >>> import fugue_dask
     """
     _register_engines()
-    _register_annotation_converters()
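The `register_annotation_converter`/`SimpleAnnotationConverter` boilerplate is gone: a single `fugue_annotated_param` class decorator from the new `fugue.dev` module now registers the handler for an annotation. A sketch of the same pattern for a hypothetical backend (`MyDataFrame` is a stand-in, not a real Fugue type):

```python
from typing import Any

from fugue import DataFrame
from fugue.dev import DataFrameParam, fugue_annotated_param


class MyDataFrame:
    """Hypothetical native dataframe type of some backend."""


@fugue_annotated_param(MyDataFrame)
class _MyDataFrameParam(DataFrameParam):
    def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
        # Convert Fugue's DataFrame into the native object that user
        # functions annotated with MyDataFrame will receive.
        raise NotImplementedError
```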
fugue_duckdb/_io.py
CHANGED
@@ -21,8 +21,14 @@ from fugue_duckdb.dataframe import DuckDataFrame
 def _get_single_files(
     fp: Iterable[FileParser], fs: FileSystem, fmt: str
 ) -> Iterable[FileParser]:
+    def _isdir(d: str) -> bool:
+        try:
+            return fs.isdir(d)
+        except Exception:  # pragma: no cover
+            return False
+
     for f in fp:
-        if f.glob_pattern == "" and
+        if f.glob_pattern == "" and _isdir(f.uri):
             yield f.with_glob("*." + fmt, fmt)
         else:
             yield f
@@ -211,7 +217,7 @@ class DuckDBIO:
         # for k, v in kw.items():
         #     params.append(f"{k}=" + encode_value_to_expr(v))
         pm = ", ".join(params)
-        query = f"SELECT {cols} FROM parquet_scan({pm})"
+        query = f"SELECT {cols} FROM parquet_scan([{pm}])"
         res = DuckDataFrame(self._con.from_query(query))
         return (
             res  # type: ignore
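The `parquet_scan` fix is small but consequential: the comma-joined file expressions are now wrapped in `[...]`, so DuckDB receives a single list argument instead of a comma-separated argument list. A quick illustration of the query string being built (paths are made up):

```python
files = ["'/tmp/part1.parquet'", "'/tmp/part2.parquet'"]  # already-encoded exprs
pm = ", ".join(files)

# Before: SELECT * FROM parquet_scan('/tmp/part1.parquet', '/tmp/part2.parquet')
# After:  SELECT * FROM parquet_scan(['/tmp/part1.parquet', '/tmp/part2.parquet'])
query = f"SELECT * FROM parquet_scan([{pm}])"
print(query)
```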
fugue_duckdb/_utils.py
CHANGED
@@ -27,7 +27,11 @@ _DUCK_TYPES_TO_PA: Dict[str, pa.DataType] = {
     "TIME": pa.time32("ms"),
 }
 
-_PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {
+_PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {
+    v: k
+    for k, v in list(_DUCK_TYPES_TO_PA.items())
+    + [("VARCHAR", pa.large_string()), ("BLOB", pa.large_binary())]
+}
 
 
 def encode_column_name(name: str) -> str:
@@ -94,8 +98,9 @@ def to_duck_type(tp: pa.DataType) -> str:
     raise ValueError(f"can't convert {tp} to DuckDB data type")
 
 
-def to_pa_type(
+def to_pa_type(duck_type_raw: Any) -> pa.DataType:
     try:
+        duck_type = str(duck_type_raw)  # for duckdb >= 0.8.0
         if duck_type.endswith("[]"):
             return pa.list_(to_pa_type(duck_type[:-2]))
         p = duck_type.find("(")
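`to_pa_type` now accepts the raw value from `rel.types` and normalizes it with `str()`, because DuckDB 0.8.0 and later return type objects where older versions returned plain strings. A reduced sketch of the string-driven parsing (the mapping here is a small made-up subset of the real `_DUCK_TYPES_TO_PA`):

```python
from typing import Any

import pyarrow as pa

_DUCK_TO_PA = {"BIGINT": pa.int64(), "VARCHAR": pa.string(), "DOUBLE": pa.float64()}


def to_pa_type_sketch(duck_type_raw: Any) -> pa.DataType:
    duck_type = str(duck_type_raw)  # also accepts duckdb >= 0.8.0 type objects
    if duck_type.endswith("[]"):  # nested list types such as "BIGINT[]"
        return pa.list_(to_pa_type_sketch(duck_type[:-2]))
    return _DUCK_TO_PA[duck_type]


assert to_pa_type_sketch("BIGINT[]") == pa.list_(pa.int64())
```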
fugue_duckdb/dask.py
CHANGED
@@ -50,7 +50,7 @@ class DuckDaskExecutionEngine(DuckExecutionEngine):
             res = DuckDataFrame(self.connection.from_df(ddf.as_pandas()))
         else:
             res = DuckDataFrame(
-                duckdb.
+                duckdb.from_arrow(ddf.as_arrow(), connection=self.connection)
             )
         if ddf.has_metadata:  # pragma: no cover
             res.reset_metadata(ddf.metadata)
fugue_duckdb/dataframe.py
CHANGED
@@ -4,20 +4,17 @@ import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
 from triad import Schema
+from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
-from fugue import
-    ArrayDataFrame,
-    ArrowDataFrame,
-    DataFrame,
-    LocalBoundedDataFrame,
-    LocalDataFrame,
-)
+from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_arrow,
     as_fugue_dataset,
     as_local_bounded,
     get_column_names,
     get_num_partitions,
+    get_schema,
     is_df,
 )
 
@@ -32,15 +29,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
 
     def __init__(self, rel: DuckDBPyRelation):
         self._rel = rel
-        super().__init__(schema=self.
-
-    def _get_schema(self) -> Schema:
-        return Schema(
-            [
-                pa.field(x, to_pa_type(y))
-                for x, y in zip(self._rel.columns, self._rel.types)
-            ]
-        )
+        super().__init__(schema=lambda: _duck_get_schema(self._rel))
 
     @property
     def alias(self) -> str:
@@ -104,7 +93,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return DuckDataFrame(self._rel.project(", ".join(fields)))
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return self._rel
+        return _duck_as_arrow(self._rel)
 
     def as_pandas(self) -> pd.DataFrame:
         if any(pa.types.is_nested(f.type) for f in self.schema.fields):
@@ -112,8 +101,11 @@ class DuckDataFrame(LocalBoundedDataFrame):
             return ArrowDataFrame(self.as_arrow()).as_pandas()
         return self._rel.to_df()
 
-    def
-
+    def as_local_bounded(self) -> LocalBoundedDataFrame:
+        res = ArrowDataFrame(self.as_arrow())
+        if self.has_metadata:
+            res.reset_metadata(self.metadata)
+        return res
 
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
@@ -172,6 +164,18 @@ def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation:
     return df
 
 
+@as_arrow.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+def _duck_as_arrow(df: DuckDBPyRelation) -> pa.Table:
+    _df = df.arrow()
+    _df = replace_types_in_table(_df, LARGE_TYPES_REPLACEMENT, recursive=True)
+    return _df
+
+
+@get_schema.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+def _duck_get_schema(df: DuckDBPyRelation) -> Schema:
+    return Schema([pa.field(x, to_pa_type(y)) for x, y in zip(df.columns, df.types)])
+
+
 @get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation))
 def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
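The new `_duck_as_arrow` candidate matters because DuckDB's `.arrow()` emits `large_string`/`large_binary` columns; `replace_types_in_table` downcasts them so every consumer of `as_arrow` sees the regular Arrow types. The effect, shown with plain pyarrow (triad's helper is replaced by a simple cast in this sketch):

```python
import pyarrow as pa

t = pa.table({"a": pa.array(["x", "y"], type=pa.large_string())})

# What the replacement achieves for this column: large_string -> string,
# so schemas stay consistent across engines.
t2 = t.cast(pa.schema([pa.field("a", pa.string())]))
assert t2.schema.field("a").type == pa.string()
```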
fugue_duckdb/execution_engine.py
CHANGED
@@ -2,12 +2,11 @@ import logging
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 import duckdb
-import pyarrow as pa
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import SerializableRLock
 from triad.collections.fs import FileSystem
-from triad.utils.schema import quote_name
 from triad.utils.assertion import assert_or_throw
+from triad.utils.schema import quote_name
 
 from fugue import (
     ArrowDataFrame,
@@ -19,12 +18,7 @@ from fugue import (
 )
 from fugue.collections.partition import PartitionSpec, parse_presort_exp
 from fugue.collections.sql import StructuredRawSQL, TempTableName
-from fugue.dataframe import
-    DataFrame,
-    DataFrames,
-    LocalBoundedDataFrame,
-    PandasDataFrame,
-)
+from fugue.dataframe import DataFrame, DataFrames, LocalBoundedDataFrame
 from fugue.dataframe.utils import get_join_schemas
 
 from ._io import DuckDBIO
@@ -34,9 +28,10 @@ from ._utils import (
     encode_schema_names,
     encode_value_to_expr,
 )
-from .dataframe import DuckDataFrame
+from .dataframe import DuckDataFrame, _duck_as_arrow
 
 _FUGUE_DUCKDB_PRAGMA_CONFIG_PREFIX = "fugue.duckdb.pragma."
+_FUGUE_DUCKDB_EXTENSIONS = "fugue.duckdb.extensions"
 
 
 class DuckDBEngine(SQLEngine):
@@ -113,8 +108,8 @@ class DuckDBEngine(SQLEngine):
         conn = duckdb.connect()
         try:
             for k, v in dfs.items():
-                duckdb.
-            return ArrowDataFrame(conn.execute(statement)
+                duckdb.from_arrow(v.as_arrow(), connection=conn).create_view(k)
+            return ArrowDataFrame(_duck_as_arrow(conn.execute(statement)))
         finally:
             conn.close()
 
@@ -161,6 +156,12 @@ class DuckExecutionEngine(ExecutionEngine):
         try:
             for pg in list(self._get_pragmas()):  # transactional
                 self._con.execute(pg)
+
+            for ext in self.conf.get(_FUGUE_DUCKDB_EXTENSIONS, "").split(","):
+                _ext = ext.strip()
+                if _ext != "":
+                    self._con.install_extension(_ext)
+                    self._con.load_extension(_ext)
         except Exception:
             self.stop()
             raise
@@ -228,7 +229,7 @@ class DuckExecutionEngine(ExecutionEngine):
         # TODO: we should create DuckDB table, but it has bugs, so can't use by 0.3.1
         if isinstance(df, DuckDataFrame):
             # materialize
-            res: DataFrame = ArrowDataFrame(df.
+            res: DataFrame = ArrowDataFrame(df.as_arrow())
         else:
             res = self.to_df(df)
         res.reset_metadata(df.metadata)
@@ -538,19 +539,15 @@ def _to_duck_df(
             )
             if isinstance(df, DuckDataFrame):
                 return df
-
-
-
-            ):
-                rdf = DuckDataFrame(engine.connection.from_df(df.as_pandas()))
-            else:
-                rdf = DuckDataFrame(
-                    duckdb.arrow(df.as_arrow(), connection=engine.connection)
-                )
+            rdf = DuckDataFrame(
+                duckdb.from_arrow(df.as_arrow(), connection=engine.connection)
+            )
             rdf.reset_metadata(df.metadata if df.has_metadata else None)
             return rdf
         tdf = ArrowDataFrame(df, schema)
-        return DuckDataFrame(
+        return DuckDataFrame(
+            duckdb.from_arrow(tdf.native, connection=engine.connection)
+        )
 
     res = _gen_duck()
     if create_view:
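The new `fugue.duckdb.extensions` config is read at engine startup: each comma-separated name is stripped and passed to `install_extension` and `load_extension` on the connection. A hedged usage sketch (the extension names are examples; availability depends on your DuckDB build):

```python
from fugue import make_execution_engine

# Assuming the config key added in the diff above, this would install and
# load the httpfs and json extensions when the engine starts.
engine = make_execution_engine(
    "duckdb", {"fugue.duckdb.extensions": "httpfs, json"}
)
```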
fugue_duckdb/registry.py
CHANGED
@@ -1,5 +1,4 @@
-import
-from typing import Any, Optional
+from typing import Any
 
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import run_at_def
@@ -7,15 +6,14 @@ from triad import run_at_def
 from fugue import (
     DataFrame,
     ExecutionEngine,
-    is_pandas_or,
     register_execution_engine,
     register_sql_engine,
 )
-from fugue.
+from fugue.dev import (
     DataFrameParam,
     ExecutionEngineParam,
-
-
+    fugue_annotated_param,
+    is_pandas_or,
 )
 from fugue.plugins import infer_execution_engine
 from fugue_duckdb.dataframe import DuckDataFrame
@@ -69,40 +67,20 @@ def _register_engines() -> None:
     register_sql_engine("duckdb", lambda engine: DuckDBEngine(engine))
 
 
-
-
-
-        SimpleAnnotationConverter(
-            DuckDBPyConnection,
-            lambda param: _DuckDBPyConnectionParam(param),
-        ),
-    )
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            DuckDBPyRelation,
-            lambda param: _DuckDBPyRelationParam(param),
-        ),
-    )
+@fugue_annotated_param(DuckExecutionEngine)
+class _DuckExecutionEngineParam(ExecutionEngineParam):
+    pass
 
 
+@fugue_annotated_param(DuckDBPyConnection)
 class _DuckDBPyConnectionParam(ExecutionEngineParam):
-    def __init__(
-        self,
-        param: Optional[inspect.Parameter],
-    ):
-        super().__init__(
-            param, annotation="DuckDBPyConnection", engine_type=DuckExecutionEngine
-        )
-
     def to_input(self, engine: ExecutionEngine) -> Any:
-
+        assert isinstance(engine, DuckExecutionEngine)
+        return engine.connection  # type:ignore
 
 
+@fugue_annotated_param(DuckDBPyRelation)
 class _DuckDBPyRelationParam(DataFrameParam):
-    def __init__(self, param: Optional[inspect.Parameter]):
-        super().__init__(param, annotation="DuckDBPyRelation")
-
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
         assert isinstance(ctx, DuckExecutionEngine)
         return ctx.to_df(df).native  # type: ignore
@@ -127,4 +105,3 @@ def _register() -> None:
     >>> import fugue_duckdb
     """
     _register_engines()
-    _register_annotation_converters()
fugue_ibis/dataframe.py
CHANGED
@@ -5,13 +5,7 @@ import pandas as pd
 import pyarrow as pa
 from triad import Schema, assert_or_throw
 
-from fugue import
-    DataFrame,
-    IterableDataFrame,
-    LocalBoundedDataFrame,
-    LocalDataFrame,
-    to_local_bounded_df,
-)
+from fugue import DataFrame, IterableDataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import drop_columns, get_column_names, is_df, rename
@@ -50,7 +44,9 @@ class IbisDataFrame(DataFrame):
     def _to_schema(self, schema: IbisSchema) -> Schema:
         return to_schema(schema)
 
-    def _to_local_df(
+    def _to_local_df(
+        self, table: IbisTable, schema: Any = None
+    ) -> LocalBoundedDataFrame:
         raise NotImplementedError  # pragma: no cover
 
     def _to_iterable_df(
@@ -124,7 +120,7 @@ class IbisDataFrame(DataFrame):
     def as_pandas(self) -> pd.DataFrame:
         return self.as_local().as_pandas()
 
-    def
+    def as_local_bounded(self) -> LocalBoundedDataFrame:
         res = self._to_local_df(self._table, schema=self.schema)
         if res is not self and self.has_metadata:
             res.reset_metadata(self.metadata)
@@ -152,7 +148,7 @@ class IbisDataFrame(DataFrame):
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return
+        return self._to_local_df(self._table.head(n)).as_local_bounded()
 
     def _alter_table_columns(self, table: IbisTable, new_schema: Schema) -> IbisTable:
         fields: Dict[str, Any] = {}
fugue_ibis/execution_engine.py
CHANGED
@@ -324,10 +324,16 @@ class IbisMapEngine(MapEngine):
         output_schema: Any,
         partition_spec: PartitionSpec,
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
+        map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
         _df = self._ibis_engine._to_non_ibis_dataframe(df)
         return self._ibis_engine.non_ibis_engine.map_engine.map_dataframe(
-            _df,
+            _df,
+            map_func=map_func,
+            output_schema=output_schema,
+            partition_spec=partition_spec,
+            on_init=on_init,
+            map_func_format_hint=map_func_format_hint,
         )
 
     def map_bag(
fugue_notebook/env.py
CHANGED
@@ -3,21 +3,16 @@ import html
 import json
 from typing import Any, Dict, List, Optional
 
-from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
 from IPython import get_ipython
+from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
 from IPython.display import HTML, display
 from triad import ParamDict
 from triad.utils.convert import to_instance
 from triad.utils.pyarrow import _field_to_expression
 
-import
-from fugue import
-
-    DataFrameDisplay,
-    ExecutionEngine,
-    get_dataset_display,
-    make_execution_engine,
-)
+from fugue import DataFrame, DataFrameDisplay, ExecutionEngine
+from fugue import fsql as fugue_sql
+from fugue import get_dataset_display, make_execution_engine
 from fugue.dataframe import YieldedDataFrame
 from fugue.exceptions import FugueSQLSyntaxError
 
@@ -58,7 +53,7 @@ class _FugueSQLMagics(Magics):
     @cell_magic("fsql")
     def fsql(self, line: str, cell: str, local_ns: Any = None) -> None:
         try:
-            dag = fugue_sql
+            dag = fugue_sql(
                 "\n" + cell, local_ns, fsql_ignore_case=self._fsql_ignore_case
            )
        except FugueSQLSyntaxError as ex:
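The magic now calls the public `fsql` entry point (imported as `fugue_sql`) with the notebook's local namespace. Outside a notebook, roughly the same thing can be done directly; a sketch (the tiny FugueSQL snippet is illustrative):

```python
from fugue import fsql

# What %%fsql does under the hood, minus the local-variable injection
# and the ignore-case option:
fsql("""
CREATE [[0], [1]] SCHEMA a:int
PRINT
""").run()
```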
fugue_polars/__init__.py
ADDED