fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. fugue/__init__.py +9 -5
  2. fugue/_utils/interfaceless.py +1 -558
  3. fugue/_utils/io.py +2 -91
  4. fugue/_utils/registry.py +3 -2
  5. fugue/api.py +1 -0
  6. fugue/bag/bag.py +8 -4
  7. fugue/collections/__init__.py +0 -7
  8. fugue/collections/partition.py +21 -9
  9. fugue/constants.py +3 -1
  10. fugue/dataframe/__init__.py +7 -8
  11. fugue/dataframe/arrow_dataframe.py +1 -2
  12. fugue/dataframe/dataframe.py +17 -18
  13. fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
  14. fugue/dataframe/function_wrapper.py +432 -0
  15. fugue/dataframe/iterable_dataframe.py +3 -0
  16. fugue/dataframe/utils.py +11 -79
  17. fugue/dataset/api.py +0 -4
  18. fugue/dev.py +47 -0
  19. fugue/execution/__init__.py +1 -5
  20. fugue/execution/api.py +36 -14
  21. fugue/execution/execution_engine.py +30 -4
  22. fugue/execution/factory.py +0 -6
  23. fugue/execution/native_execution_engine.py +44 -67
  24. fugue/extensions/_builtins/creators.py +4 -2
  25. fugue/extensions/_builtins/outputters.py +4 -3
  26. fugue/extensions/_builtins/processors.py +3 -3
  27. fugue/extensions/creator/convert.py +5 -2
  28. fugue/extensions/outputter/convert.py +2 -2
  29. fugue/extensions/processor/convert.py +3 -2
  30. fugue/extensions/transformer/convert.py +22 -9
  31. fugue/extensions/transformer/transformer.py +15 -1
  32. fugue/plugins.py +2 -0
  33. fugue/registry.py +0 -39
  34. fugue/sql/_utils.py +1 -1
  35. fugue/workflow/_checkpoint.py +1 -1
  36. fugue/workflow/api.py +13 -13
  37. fugue/workflow/module.py +30 -37
  38. fugue/workflow/workflow.py +6 -0
  39. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
  40. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
  41. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
  42. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
  43. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
  44. fugue_contrib/contrib.py +1 -0
  45. fugue_contrib/viz/_ext.py +7 -1
  46. fugue_dask/_io.py +0 -13
  47. fugue_dask/_utils.py +10 -4
  48. fugue_dask/dataframe.py +1 -2
  49. fugue_dask/execution_engine.py +45 -18
  50. fugue_dask/registry.py +8 -33
  51. fugue_duckdb/_io.py +8 -2
  52. fugue_duckdb/_utils.py +7 -2
  53. fugue_duckdb/dask.py +1 -1
  54. fugue_duckdb/dataframe.py +23 -19
  55. fugue_duckdb/execution_engine.py +19 -22
  56. fugue_duckdb/registry.py +11 -34
  57. fugue_ibis/dataframe.py +6 -10
  58. fugue_ibis/execution_engine.py +7 -1
  59. fugue_notebook/env.py +5 -10
  60. fugue_polars/__init__.py +2 -0
  61. fugue_polars/_utils.py +8 -0
  62. fugue_polars/polars_dataframe.py +234 -0
  63. fugue_polars/registry.py +86 -0
  64. fugue_ray/_constants.py +10 -1
  65. fugue_ray/_utils/dataframe.py +36 -9
  66. fugue_ray/_utils/io.py +2 -4
  67. fugue_ray/dataframe.py +16 -12
  68. fugue_ray/execution_engine.py +53 -32
  69. fugue_ray/registry.py +8 -32
  70. fugue_spark/_utils/convert.py +22 -11
  71. fugue_spark/_utils/io.py +0 -13
  72. fugue_spark/_utils/misc.py +27 -0
  73. fugue_spark/_utils/partition.py +11 -18
  74. fugue_spark/dataframe.py +26 -22
  75. fugue_spark/execution_engine.py +136 -54
  76. fugue_spark/registry.py +29 -78
  77. fugue_test/builtin_suite.py +36 -14
  78. fugue_test/dataframe_suite.py +9 -5
  79. fugue_test/execution_suite.py +100 -122
  80. fugue_version/__init__.py +1 -1
  81. tests/fugue/bag/test_array_bag.py +0 -9
  82. tests/fugue/collections/test_partition.py +10 -3
  83. tests/fugue/dataframe/test_function_wrapper.py +293 -0
  84. tests/fugue/dataframe/test_utils.py +2 -34
  85. tests/fugue/execution/test_factory.py +7 -9
  86. tests/fugue/execution/test_naive_execution_engine.py +35 -80
  87. tests/fugue/extensions/test_utils.py +12 -7
  88. tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
  89. tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
  90. tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
  91. tests/fugue/sql/test_workflow.py +1 -1
  92. tests/fugue/sql/test_workflow_parse.py +3 -5
  93. tests/fugue/utils/test_interfaceless.py +1 -325
  94. tests/fugue/utils/test_io.py +0 -80
  95. tests/fugue_dask/test_execution_engine.py +48 -0
  96. tests/fugue_dask/test_io.py +0 -55
  97. tests/fugue_duckdb/test_dataframe.py +2 -2
  98. tests/fugue_duckdb/test_execution_engine.py +16 -1
  99. tests/fugue_duckdb/test_utils.py +1 -1
  100. tests/fugue_ibis/test_dataframe.py +6 -3
  101. tests/fugue_polars/__init__.py +0 -0
  102. tests/fugue_polars/test_api.py +13 -0
  103. tests/fugue_polars/test_dataframe.py +82 -0
  104. tests/fugue_polars/test_transform.py +100 -0
  105. tests/fugue_ray/test_execution_engine.py +40 -4
  106. tests/fugue_spark/test_dataframe.py +0 -8
  107. tests/fugue_spark/test_execution_engine.py +50 -11
  108. tests/fugue_spark/test_importless.py +4 -4
  109. tests/fugue_spark/test_spark_connect.py +82 -0
  110. tests/fugue_spark/utils/test_convert.py +6 -8
  111. tests/fugue_spark/utils/test_io.py +0 -17
  112. fugue/_utils/register.py +0 -3
  113. fugue_test/_utils.py +0 -13
  114. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue_dask/execution_engine.py CHANGED
@@ -3,6 +3,7 @@ import os
 from typing import Any, Callable, Dict, List, Optional, Type, Union
 
 import dask.dataframe as dd
+import pandas as pd
 from distributed import Client
 from qpd_dask import run_sql_on_dask
 from triad.collections import Schema
@@ -18,7 +19,7 @@ from fugue.collections.partition import (
     PartitionSpec,
     parse_presort_exp,
 )
-from fugue.constants import KEYWORD_CORECOUNT, KEYWORD_ROWCOUNT
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from fugue.dataframe import (
     AnyDataFrame,
     DataFrame,
@@ -34,6 +35,8 @@ from fugue_dask._io import load_df, save_df
 from fugue_dask._utils import DASK_UTILS, DaskUtils
 from fugue_dask.dataframe import DaskDataFrame
 
+_DASK_PARTITION_KEY = "__dask_partition_key__"
+
 
 class QPDDaskEngine(SQLEngine):
     """QPD execution implementation."""
@@ -72,12 +75,15 @@ class DaskMapEngine(MapEngine):
         output_schema: Any,
         partition_spec: PartitionSpec,
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
+        map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
-        presort = partition_spec.presort
+        is_coarse = partition_spec.algo == "coarse"
+        presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
         presort_keys = list(presort.keys())
         presort_asc = list(presort.values())
         output_schema = Schema(output_schema)
         input_schema = df.schema
+        cursor = partition_spec.get_cursor(input_schema, 0)
         on_init_once: Any = (
             None
             if on_init is None
@@ -86,20 +92,21 @@ class DaskMapEngine(MapEngine):
             )
         )
 
-        def _map(pdf: Any) -> dd.DataFrame:
+        def _map(pdf: Any) -> pd.DataFrame:
             if pdf.shape[0] == 0:
                 return PandasDataFrame([], output_schema).as_pandas()
-            if len(presort_keys) > 0:
+            if is_coarse:
+                pdf = pdf.drop(columns=[_DASK_PARTITION_KEY])
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
             input_df = PandasDataFrame(
                 pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
             )
             if on_init_once is not None:
                 on_init_once(0, input_df)
-            cursor = partition_spec.get_cursor(input_schema, 0)
-            cursor.set(input_df.peek_array(), 0, 0)
+            cursor.set(lambda: input_df.peek_array(), 0, 0)
             output_df = map_func(cursor, input_df)
-            return output_df.as_pandas()
+            return output_df.as_pandas()[output_schema.names]
 
         df = self.to_df(df)
         meta = self.execution_engine.pl_utils.safe_to_pandas_dtype(  # type: ignore
@@ -112,8 +119,28 @@ class DaskMapEngine(MapEngine):
         df = self.execution_engine.repartition(
             df, PartitionSpec(num=partition_spec.num_partitions)
         )
+        if is_coarse:
+            input_num_partitions = df.num_partitions
+            _utils = self.execution_engine.pl_utils  # type: ignore
+            input_meta = _utils.safe_to_pandas_dtype(
+                (input_schema + (_DASK_PARTITION_KEY, "uint64")).pa_schema
+            )
+            tddf = df.native.map_partitions(
+                lambda pdf: pdf.assign(
+                    **{
+                        _DASK_PARTITION_KEY: pd.util.hash_pandas_object(
+                            pdf[partition_spec.partition_by], index=False
+                        ).mod(input_num_partitions)
+                    }
+                ),
+                meta=input_meta,
+            )
+            keys = [_DASK_PARTITION_KEY]
+        else:
+            tddf = df.native
+            keys = partition_spec.partition_by
         result = self.execution_engine.pl_utils.safe_groupby_apply(  # type: ignore
-            df.native, partition_spec.partition_by, _map, meta=meta  # type: ignore
+            tddf, keys, _map, meta=meta  # type: ignore
        )
        return DaskDataFrame(result, output_schema)

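The "coarse" algorithm added above avoids grouping on the raw partition keys: the keys are hashed and folded into at most input_num_partitions buckets, capping the number of groups. A minimal pandas sketch of that bucketing step (toy column names, not part of this diff):

    import pandas as pd

    pdf = pd.DataFrame({"key": ["a", "b", "c", "a"], "v": [1, 2, 3, 4]})
    n_buckets = 2  # stands in for the input partition count

    # same trick as the hunk above: hash the partition key columns,
    # then mod into a bounded number of buckets
    bucket = pd.util.hash_pandas_object(pdf[["key"]], index=False).mod(n_buckets)
    print(pdf.assign(bucket=bucket).groupby("bucket").size())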
@@ -213,7 +240,7 @@ class DaskExecutionEngine(ExecutionEngine):
         p = partition_spec.get_num_partitions(
             **{
                 KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
-                KEYWORD_CORECOUNT: lambda: self.get_current_parallelism(),
+                KEYWORD_PARALLELISM: lambda: self.get_current_parallelism(),
             }
         )
         if p > 0:
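KEYWORD_CORECOUNT is replaced by KEYWORD_PARALLELISM as the placeholder that gets resolved lazily inside a PartitionSpec num expression. A hedged sketch of such an expression (keyword spelling assumed per this release; verify against your fugue version):

    from fugue.collections.partition import PartitionSpec

    # "ROWCOUNT" and "PARALLELISM" are substituted by the engine via
    # get_num_partitions, as in the hunk above
    spec = PartitionSpec(num="ROWCOUNT/4")
    print(spec.num_partitions)  # the unevaluated expression: "ROWCOUNT/4"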
@@ -252,7 +279,7 @@ class DaskExecutionEngine(ExecutionEngine):
             join_type=how,
             on=key_schema.names,
         )
-        return DaskDataFrame(d, output_schema)
+        return DaskDataFrame(d, output_schema, type_safe=False)
 
     def union(
         self,
@@ -267,7 +294,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.union(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def subtract(
         self,
@@ -285,7 +312,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.except_df(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def intersect(
         self,
@@ -303,11 +330,11 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.pl_utils.intersect(
             self.to_df(df1).native, self.to_df(df2).native, unique=distinct
         )
-        return DaskDataFrame(d, df1.schema)
+        return DaskDataFrame(d, df1.schema, type_safe=False)
 
     def distinct(self, df: DataFrame) -> DataFrame:
         d = self.pl_utils.drop_duplicates(self.to_df(df).native)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def dropna(
         self,
@@ -324,7 +351,7 @@ class DaskExecutionEngine(ExecutionEngine):
         if how == "any" and thresh is not None:
             del kw["how"]  # to deal with a dask logic flaw
         d = self.to_df(df).native.dropna(**kw)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFrame:
         assert_or_throw(
@@ -344,7 +371,7 @@ class DaskExecutionEngine(ExecutionEngine):
         subset = subset or df.columns
         mapping = {col: value for col in subset}
         d = self.to_df(df).native.fillna(mapping)
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def sample(
         self,
@@ -362,7 +389,7 @@ class DaskExecutionEngine(ExecutionEngine):
         d = self.to_df(df).native.sample(
             n=n, frac=frac, replace=replace, random_state=seed
         )
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def take(
         self,
@@ -418,7 +445,7 @@ class DaskExecutionEngine(ExecutionEngine):
             .reset_index(drop=True)
         )
 
-        return DaskDataFrame(d, df.schema)
+        return DaskDataFrame(d, df.schema, type_safe=False)
 
     def load_df(
         self,
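The repeated change in this file passes type_safe=False because the schema of these results is already known, so DaskDataFrame can skip re-validating and coercing the dask frame. A hedged construction sketch (toy data, assuming the 0.8.4 constructor signature):

    import dask.dataframe as dd
    import pandas as pd
    from fugue_dask.dataframe import DaskDataFrame

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)
    # the caller vouches that ddf already matches the declared schema,
    # so the constructor skips the type-enforcement pass
    fdf = DaskDataFrame(ddf, "a:long", type_safe=False)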
fugue_dask/registry.py CHANGED
@@ -1,16 +1,15 @@
-import inspect
-from typing import Any, Optional
+from typing import Any
 
 import dask.dataframe as dd
 from dask.distributed import Client
 from triad import run_at_def
 
-from fugue import DataFrame, is_pandas_or, register_execution_engine
-from fugue._utils.interfaceless import (
+from fugue import DataFrame, register_execution_engine
+from fugue.dev import (
     DataFrameParam,
     ExecutionEngineParam,
-    SimpleAnnotationConverter,
-    register_annotation_converter,
+    fugue_annotated_param,
+    is_pandas_or,
 )
 from fugue.plugins import as_fugue_dataset, infer_execution_engine
 from fugue_dask._utils import DASK_UTILS
@@ -45,36 +44,13 @@ def _register_engines() -> None:
     )
 
 
-def _register_annotation_converters() -> None:
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            DaskExecutionEngine,
-            lambda param: _DaskExecutionEngineParam(param),
-        ),
-    )
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            dd.DataFrame, lambda param: _DaskDataFrameParam(param)
-        ),
-    )
-
-
+@fugue_annotated_param(DaskExecutionEngine)
 class _DaskExecutionEngineParam(ExecutionEngineParam):
-    def __init__(
-        self,
-        param: Optional[inspect.Parameter],
-    ):
-        super().__init__(
-            param, annotation="DaskExecutionEngine", engine_type=DaskExecutionEngine
-        )
+    pass
 
 
+@fugue_annotated_param(dd.DataFrame)
 class _DaskDataFrameParam(DataFrameParam):
-    def __init__(self, param: Optional[inspect.Parameter]):
-        super().__init__(param, annotation="dask.dataframe.DataFrame")
-
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
         assert isinstance(ctx, DaskExecutionEngine)
         return ctx.to_df(df).native
@@ -99,4 +75,3 @@ def _register() -> None:
     >>> import fugue_dask
     """
     _register_engines()
-    _register_annotation_converters()
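With the converters replaced by fugue_annotated_param, plain type annotations are enough for Fugue to hand native dask objects to user functions. A hedged usage sketch (toy transformer, not part of this diff):

    import dask.dataframe as dd
    import pandas as pd
    import fugue.api as fa

    # because dd.DataFrame is a registered annotation, this transformer
    # receives and returns native dask DataFrames on the dask engine
    def add_one(df: dd.DataFrame) -> dd.DataFrame:
        return df.assign(b=df["a"] + 1)

    result = fa.transform(pd.DataFrame({"a": [1, 2, 3]}), add_one,
                          schema="*,b:long", engine="dask")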
fugue_duckdb/_io.py CHANGED
@@ -21,8 +21,14 @@ from fugue_duckdb.dataframe import DuckDataFrame
 def _get_single_files(
     fp: Iterable[FileParser], fs: FileSystem, fmt: str
 ) -> Iterable[FileParser]:
+    def _isdir(d: str) -> bool:
+        try:
+            return fs.isdir(d)
+        except Exception:  # pragma: no cover
+            return False
+
     for f in fp:
-        if f.glob_pattern == "" and fs.isdir(f.uri):
+        if f.glob_pattern == "" and _isdir(f.uri):
             yield f.with_glob("*." + fmt, fmt)
         else:
             yield f
@@ -211,7 +217,7 @@ class DuckDBIO:
         # for k, v in kw.items():
         #     params.append(f"{k}=" + encode_value_to_expr(v))
         pm = ", ".join(params)
-        query = f"SELECT {cols} FROM parquet_scan({pm})"
+        query = f"SELECT {cols} FROM parquet_scan([{pm}])"
         res = DuckDataFrame(self._con.from_query(query))
         return (
             res  # type: ignore
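The added brackets matter because DuckDB's parquet_scan expects multiple files as a single list literal, not as separate arguments. A hedged sketch with made-up paths (the files must exist for the query to run):

    import duckdb

    con = duckdb.connect()
    # multiple parquet files go inside one list literal, matching the
    # query string generated in the hunk above
    con.execute("SELECT * FROM parquet_scan(['part1.parquet', 'part2.parquet'])")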
fugue_duckdb/_utils.py CHANGED
@@ -27,7 +27,11 @@ _DUCK_TYPES_TO_PA: Dict[str, pa.DataType] = {
     "TIME": pa.time32("ms"),
 }
 
-_PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {v: k for k, v in _DUCK_TYPES_TO_PA.items()}
+_PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {
+    v: k
+    for k, v in list(_DUCK_TYPES_TO_PA.items())
+    + [("VARCHAR", pa.large_string()), ("BLOB", pa.large_binary())]
+}
 
 
 def encode_column_name(name: str) -> str:
@@ -94,8 +98,9 @@ def to_duck_type(tp: pa.DataType) -> str:
     raise ValueError(f"can't convert {tp} to DuckDB data type")
 
 
-def to_pa_type(duck_type: str) -> pa.DataType:
+def to_pa_type(duck_type_raw: Any) -> pa.DataType:
     try:
+        duck_type = str(duck_type_raw)  # for duckdb >= 0.8.0
         if duck_type.endswith("[]"):
            return pa.list_(to_pa_type(duck_type[:-2]))
        p = duck_type.find("(")
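The str() call exists because DuckDB around 0.8.0 started returning DuckDBPyType objects from rel.types where older versions returned plain strings; stringifying first keeps the parser working on both. A hedged sketch (exact type names may vary by DuckDB version):

    import duckdb

    rel = duckdb.connect().sql("SELECT 1 AS a, 'x' AS b")
    # DuckDBPyType objects on duckdb >= 0.8, strings before; str()
    # normalizes both to names like "INTEGER" and "VARCHAR"
    print([str(t) for t in rel.types])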
fugue_duckdb/dask.py CHANGED
@@ -50,7 +50,7 @@ class DuckDaskExecutionEngine(DuckExecutionEngine):
             res = DuckDataFrame(self.connection.from_df(ddf.as_pandas()))
         else:
             res = DuckDataFrame(
-                duckdb.arrow(ddf.as_arrow(), connection=self.connection)
+                duckdb.from_arrow(ddf.as_arrow(), connection=self.connection)
             )
         if ddf.has_metadata:  # pragma: no cover
             res.reset_metadata(ddf.metadata)
fugue_duckdb/dataframe.py CHANGED
@@ -4,20 +4,17 @@ import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
 from triad import Schema
+from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
-from fugue import (
-    ArrayDataFrame,
-    ArrowDataFrame,
-    DataFrame,
-    LocalBoundedDataFrame,
-    LocalDataFrame,
-)
+from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_arrow,
     as_fugue_dataset,
     as_local_bounded,
     get_column_names,
     get_num_partitions,
+    get_schema,
     is_df,
 )
@@ -32,15 +29,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
 
     def __init__(self, rel: DuckDBPyRelation):
         self._rel = rel
-        super().__init__(schema=self._get_schema)
-
-    def _get_schema(self) -> Schema:
-        return Schema(
-            [
-                pa.field(x, to_pa_type(y))
-                for x, y in zip(self._rel.columns, self._rel.types)
-            ]
-        )
+        super().__init__(schema=lambda: _duck_get_schema(self._rel))
 
     @property
     def alias(self) -> str:
@@ -104,7 +93,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return DuckDataFrame(self._rel.project(", ".join(fields)))
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return self._rel.arrow()
+        return _duck_as_arrow(self._rel)
 
     def as_pandas(self) -> pd.DataFrame:
         if any(pa.types.is_nested(f.type) for f in self.schema.fields):
@@ -112,8 +101,11 @@ class DuckDataFrame(LocalBoundedDataFrame):
             return ArrowDataFrame(self.as_arrow()).as_pandas()
         return self._rel.to_df()
 
-    def as_local(self) -> LocalDataFrame:
-        return ArrowDataFrame(self.as_arrow())
+    def as_local_bounded(self) -> LocalBoundedDataFrame:
+        res = ArrowDataFrame(self.as_arrow())
+        if self.has_metadata:
+            res.reset_metadata(self.metadata)
+        return res
 
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
@@ -172,6 +164,18 @@ def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation:
     return df
 
 
+@as_arrow.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+def _duck_as_arrow(df: DuckDBPyRelation) -> pa.Table:
+    _df = df.arrow()
+    _df = replace_types_in_table(_df, LARGE_TYPES_REPLACEMENT, recursive=True)
+    return _df
+
+
+@get_schema.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+def _duck_get_schema(df: DuckDBPyRelation) -> Schema:
+    return Schema([pa.field(x, to_pa_type(y)) for x, y in zip(df.columns, df.types)])
+
+
 @get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation))
 def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
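With the as_arrow and get_schema candidates registered, the generic accessors can dispatch on a native DuckDBPyRelation, and DuckDB's large_string/large_binary Arrow types are folded back to their plain counterparts. A hedged sketch, assuming these accessors are exposed through fugue.api in this version:

    import duckdb
    import fugue.api as fa

    rel = duckdb.connect().sql("SELECT 1 AS a, 'x' AS b")
    # dispatched to the candidates above for DuckDBPyRelation
    print(fa.get_schema(rel))  # e.g. a:int,b:str
    tbl = fa.as_arrow(rel)     # strings come back as pa.string(), not pa.large_string()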
fugue_duckdb/execution_engine.py CHANGED
@@ -2,12 +2,11 @@ import logging
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 import duckdb
-import pyarrow as pa
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import SerializableRLock
 from triad.collections.fs import FileSystem
-from triad.utils.schema import quote_name
 from triad.utils.assertion import assert_or_throw
+from triad.utils.schema import quote_name
 
 from fugue import (
     ArrowDataFrame,
@@ -19,12 +18,7 @@ from fugue import (
 )
 from fugue.collections.partition import PartitionSpec, parse_presort_exp
 from fugue.collections.sql import StructuredRawSQL, TempTableName
-from fugue.dataframe import (
-    DataFrame,
-    DataFrames,
-    LocalBoundedDataFrame,
-    PandasDataFrame,
-)
+from fugue.dataframe import DataFrame, DataFrames, LocalBoundedDataFrame
 from fugue.dataframe.utils import get_join_schemas
 
 from ._io import DuckDBIO
@@ -34,9 +28,10 @@ from ._utils import (
     encode_schema_names,
     encode_value_to_expr,
 )
-from .dataframe import DuckDataFrame
+from .dataframe import DuckDataFrame, _duck_as_arrow
 
 _FUGUE_DUCKDB_PRAGMA_CONFIG_PREFIX = "fugue.duckdb.pragma."
+_FUGUE_DUCKDB_EXTENSIONS = "fugue.duckdb.extensions"
 
 
 class DuckDBEngine(SQLEngine):
@@ -113,8 +108,8 @@ class DuckDBEngine(SQLEngine):
         conn = duckdb.connect()
         try:
             for k, v in dfs.items():
-                duckdb.arrow(v.as_arrow(), connection=conn).create_view(k)
-            return ArrowDataFrame(conn.execute(statement).arrow())
+                duckdb.from_arrow(v.as_arrow(), connection=conn).create_view(k)
+            return ArrowDataFrame(_duck_as_arrow(conn.execute(statement)))
         finally:
             conn.close()
 
@@ -161,6 +156,12 @@ class DuckExecutionEngine(ExecutionEngine):
         try:
             for pg in list(self._get_pragmas()):  # transactional
                 self._con.execute(pg)
+
+            for ext in self.conf.get(_FUGUE_DUCKDB_EXTENSIONS, "").split(","):
+                _ext = ext.strip()
+                if _ext != "":
+                    self._con.install_extension(_ext)
+                    self._con.load_extension(_ext)
         except Exception:
             self.stop()
             raise
@@ -228,7 +229,7 @@ class DuckExecutionEngine(ExecutionEngine):
         # TODO: we should create DuckDB table, but it has bugs, so can't use by 0.3.1
         if isinstance(df, DuckDataFrame):
             # materialize
-            res: DataFrame = ArrowDataFrame(df.native.arrow())
+            res: DataFrame = ArrowDataFrame(df.as_arrow())
         else:
             res = self.to_df(df)
         res.reset_metadata(df.metadata)
@@ -538,19 +539,15 @@ def _to_duck_df(
             )
             if isinstance(df, DuckDataFrame):
                 return df
-
-            if isinstance(df, PandasDataFrame) and all(
-                not pa.types.is_nested(f.type) for f in df.schema.fields
-            ):
-                rdf = DuckDataFrame(engine.connection.from_df(df.as_pandas()))
-            else:
-                rdf = DuckDataFrame(
-                    duckdb.arrow(df.as_arrow(), connection=engine.connection)
-                )
+            rdf = DuckDataFrame(
+                duckdb.from_arrow(df.as_arrow(), connection=engine.connection)
+            )
             rdf.reset_metadata(df.metadata if df.has_metadata else None)
             return rdf
         tdf = ArrowDataFrame(df, schema)
-        return DuckDataFrame(duckdb.arrow(tdf.native, connection=engine.connection))
+        return DuckDataFrame(
+            duckdb.from_arrow(tdf.native, connection=engine.connection)
+        )
 
     res = _gen_duck()
     if create_view:
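The new fugue.duckdb.extensions config installs and loads DuckDB extensions when the engine starts (see the hunk above). A hedged sketch (extension names are illustrative, and installation needs network access):

    from fugue_duckdb import DuckExecutionEngine

    # comma-separated names; each is installed then loaded at engine start
    engine = DuckExecutionEngine(conf={"fugue.duckdb.extensions": "httpfs, json"})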
fugue_duckdb/registry.py CHANGED
@@ -1,5 +1,4 @@
-import inspect
-from typing import Any, Optional
+from typing import Any
 
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import run_at_def
@@ -7,15 +6,14 @@ from triad import run_at_def
 from fugue import (
     DataFrame,
     ExecutionEngine,
-    is_pandas_or,
     register_execution_engine,
     register_sql_engine,
 )
-from fugue._utils.interfaceless import (
+from fugue.dev import (
     DataFrameParam,
     ExecutionEngineParam,
-    SimpleAnnotationConverter,
-    register_annotation_converter,
+    fugue_annotated_param,
+    is_pandas_or,
 )
 from fugue.plugins import infer_execution_engine
 from fugue_duckdb.dataframe import DuckDataFrame
@@ -69,40 +67,20 @@ def _register_engines() -> None:
     register_sql_engine("duckdb", lambda engine: DuckDBEngine(engine))
 
 
-def _register_annotation_converters() -> None:
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            DuckDBPyConnection,
-            lambda param: _DuckDBPyConnectionParam(param),
-        ),
-    )
-    register_annotation_converter(
-        0.8,
-        SimpleAnnotationConverter(
-            DuckDBPyRelation,
-            lambda param: _DuckDBPyRelationParam(param),
-        ),
-    )
+@fugue_annotated_param(DuckExecutionEngine)
+class _DuckExecutionEngineParam(ExecutionEngineParam):
+    pass
 
 
+@fugue_annotated_param(DuckDBPyConnection)
 class _DuckDBPyConnectionParam(ExecutionEngineParam):
-    def __init__(
-        self,
-        param: Optional[inspect.Parameter],
-    ):
-        super().__init__(
-            param, annotation="DuckDBPyConnection", engine_type=DuckExecutionEngine
-        )
-
     def to_input(self, engine: ExecutionEngine) -> Any:
-        return super().to_input(engine).connection  # type:ignore
+        assert isinstance(engine, DuckExecutionEngine)
+        return engine.connection  # type:ignore
 
 
+@fugue_annotated_param(DuckDBPyRelation)
 class _DuckDBPyRelationParam(DataFrameParam):
-    def __init__(self, param: Optional[inspect.Parameter]):
-        super().__init__(param, annotation="DuckDBPyRelation")
-
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
         assert isinstance(ctx, DuckExecutionEngine)
         return ctx.to_df(df).native  # type: ignore
@@ -127,4 +105,3 @@ def _register() -> None:
     >>> import fugue_duckdb
     """
     _register_engines()
-    _register_annotation_converters()
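As in the dask registry, the decorator form means a DuckDBPyRelation annotation alone routes native relations into user functions. A hedged sketch (toy filter; assumes relation input and output annotations are both supported in this release):

    import pandas as pd
    from duckdb import DuckDBPyRelation
    import fugue.api as fa

    # the DuckDBPyRelation annotation is resolved via _DuckDBPyRelationParam
    def keep_positive(rel: DuckDBPyRelation) -> DuckDBPyRelation:
        return rel.filter("a > 0")

    res = fa.transform(pd.DataFrame({"a": [-1, 0, 1, 2]}), keep_positive,
                       schema="*", engine="duckdb")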
fugue_ibis/dataframe.py CHANGED
@@ -5,13 +5,7 @@ import pandas as pd
 import pandas as pd
 import pyarrow as pa
 from triad import Schema, assert_or_throw
 
-from fugue import (
-    DataFrame,
-    IterableDataFrame,
-    LocalBoundedDataFrame,
-    LocalDataFrame,
-    to_local_bounded_df,
-)
+from fugue import DataFrame, IterableDataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import drop_columns, get_column_names, is_df, rename
@@ -50,7 +44,9 @@ class IbisDataFrame(DataFrame):
     def _to_schema(self, schema: IbisSchema) -> Schema:
         return to_schema(schema)
 
-    def _to_local_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame:
+    def _to_local_df(
+        self, table: IbisTable, schema: Any = None
+    ) -> LocalBoundedDataFrame:
         raise NotImplementedError  # pragma: no cover
 
     def _to_iterable_df(
@@ -124,7 +120,7 @@ class IbisDataFrame(DataFrame):
     def as_pandas(self) -> pd.DataFrame:
         return self.as_local().as_pandas()
 
-    def as_local(self) -> LocalDataFrame:
+    def as_local_bounded(self) -> LocalBoundedDataFrame:
         res = self._to_local_df(self._table, schema=self.schema)
         if res is not self and self.has_metadata:
             res.reset_metadata(self.metadata)
@@ -152,7 +148,7 @@ class IbisDataFrame(DataFrame):
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return to_local_bounded_df(self._to_local_df(self._table.head(n)))
+        return self._to_local_df(self._table.head(n)).as_local_bounded()
 
     def _alter_table_columns(self, table: IbisTable, new_schema: Schema) -> IbisTable:
         fields: Dict[str, Any] = {}
fugue_ibis/execution_engine.py CHANGED
@@ -324,10 +324,16 @@ class IbisMapEngine(MapEngine):
         output_schema: Any,
         partition_spec: PartitionSpec,
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
+        map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
         _df = self._ibis_engine._to_non_ibis_dataframe(df)
         return self._ibis_engine.non_ibis_engine.map_engine.map_dataframe(
-            _df, map_func, output_schema, partition_spec, on_init
+            _df,
+            map_func=map_func,
+            output_schema=output_schema,
+            partition_spec=partition_spec,
+            on_init=on_init,
+            map_func_format_hint=map_func_format_hint,
         )
 
     def map_bag(
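map_func_format_hint, threaded through every MapEngine signature in this release, tells the engine which local format the wrapped function prefers so a conversion can be skipped. A hedged sketch of the idea only, not fugue's actual dispatch:

    from typing import Optional

    def pick_local_format(map_func_format_hint: Optional[str]) -> str:
        # an engine that already holds arrow data can pass it through
        # unchanged when the map function prefers arrow, and convert to
        # pandas only when necessary
        if map_func_format_hint == "pyarrow":
            return "hand over an arrow table"
        return "convert to pandas (default)"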
fugue_notebook/env.py CHANGED
@@ -3,21 +3,16 @@ import html
 import json
 from typing import Any, Dict, List, Optional
 
-from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
 from IPython import get_ipython
+from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
 from IPython.display import HTML, display
 from triad import ParamDict
 from triad.utils.convert import to_instance
 from triad.utils.pyarrow import _field_to_expression
 
-import fugue_sql
-from fugue import (
-    DataFrame,
-    DataFrameDisplay,
-    ExecutionEngine,
-    get_dataset_display,
-    make_execution_engine,
-)
+from fugue import DataFrame, DataFrameDisplay, ExecutionEngine
+from fugue import fsql as fugue_sql
+from fugue import get_dataset_display, make_execution_engine
 from fugue.dataframe import YieldedDataFrame
 from fugue.exceptions import FugueSQLSyntaxError
 
@@ -58,7 +53,7 @@ class _FugueSQLMagics(Magics):
     @cell_magic("fsql")
     def fsql(self, line: str, cell: str, local_ns: Any = None) -> None:
         try:
-            dag = fugue_sql.fsql(
+            dag = fugue_sql(
                 "\n" + cell, local_ns, fsql_ignore_case=self._fsql_ignore_case
             )
         except FugueSQLSyntaxError as ex:
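The magic now routes through fugue.fsql rather than the standalone fugue_sql package; notebook usage is unchanged. A hedged programmatic equivalent of a %%fsql cell body:

    from fugue import fsql

    # same FugueSQL the cell magic would run, executed directly
    fsql("""
    CREATE [[0], [1]] SCHEMA a:int
    PRINT
    """).run()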
fugue_polars/__init__.py ADDED
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .polars_dataframe import PolarsDataFrame
fugue_polars/_utils.py ADDED
@@ -0,0 +1,8 @@
+import polars as pl
+from triad import Schema
+
+from fugue.dataframe.arrow_dataframe import _build_empty_arrow
+
+
+def build_empty_pl(schema: Schema) -> pl.DataFrame:
+    return pl.from_arrow(_build_empty_arrow(schema))
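A hedged sketch of what the new helper yields: zero rows, with column names and dtypes carried over from the Fugue schema (schema string is illustrative):

    from triad import Schema

    # empty polars frame, but the columns and dtypes are preserved
    empty = build_empty_pl(Schema("a:long,b:str"))
    assert empty.shape == (0, 2)
    assert empty.columns == ["a", "b"]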