fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +84 -89
  3. fugue/api.py +1 -0
  4. fugue/dataframe/api.py +19 -2
  5. fugue/dataframe/arrow_dataframe.py +48 -11
  6. fugue/dataframe/dataframe.py +20 -2
  7. fugue/dataframe/function_wrapper.py +1 -1
  8. fugue/dataframe/iterable_dataframe.py +3 -0
  9. fugue/dataframe/pandas_dataframe.py +73 -0
  10. fugue/dataframe/utils.py +78 -25
  11. fugue/execution/execution_engine.py +1 -8
  12. fugue/execution/native_execution_engine.py +5 -11
  13. fugue/plugins.py +1 -0
  14. fugue/workflow/_checkpoint.py +9 -9
  15. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +4 -4
  16. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +40 -38
  17. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
  18. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +3 -2
  19. fugue_dask/_io.py +22 -29
  20. fugue_dask/_utils.py +15 -2
  21. fugue_dask/dataframe.py +105 -18
  22. fugue_dask/execution_engine.py +5 -12
  23. fugue_duckdb/_io.py +21 -37
  24. fugue_duckdb/dataframe.py +87 -29
  25. fugue_duckdb/execution_engine.py +2 -7
  26. fugue_ibis/dataframe.py +13 -0
  27. fugue_ibis/execution_engine.py +1 -5
  28. fugue_polars/polars_dataframe.py +53 -16
  29. fugue_ray/_utils/io.py +15 -17
  30. fugue_ray/dataframe.py +71 -19
  31. fugue_spark/_utils/io.py +3 -5
  32. fugue_spark/dataframe.py +69 -13
  33. fugue_spark/execution_engine.py +2 -7
  34. fugue_test/builtin_suite.py +12 -12
  35. fugue_test/dataframe_suite.py +14 -0
  36. fugue_test/execution_suite.py +13 -18
  37. fugue_test/plugins/misc/__init__.py +2 -0
  38. fugue_test/plugins/misc/fixtures.py +18 -0
  39. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
  40. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue_dask/_io.py CHANGED
@@ -1,13 +1,12 @@
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union

- import fsspec
- import fs as pfs
  import pandas as pd
  from dask import dataframe as dd
+ from fsspec import AbstractFileSystem
  from triad.collections.dict import ParamDict
- from triad.collections.fs import FileSystem
  from triad.collections.schema import Schema
  from triad.utils.assertion import assert_or_throw
+ from triad.utils.io import join, makedirs, url_to_fs

  from fugue._utils.io import FileParser, _get_single_files
  from fugue_dask.dataframe import DaskDataFrame
@@ -19,7 +18,7 @@ def load_df(
  uri: Union[str, List[str]],
  format_hint: Optional[str] = None,
  columns: Any = None,
- fs: Optional[FileSystem] = None,
+ fs: Optional[AbstractFileSystem] = None,
  **kwargs: Any,
  ) -> DaskDataFrame:
  if isinstance(uri, str):
@@ -39,7 +38,7 @@ def save_df(
  uri: str,
  format_hint: Optional[str] = None,
  mode: str = "overwrite",
- fs: Optional[FileSystem] = None,
+ fs: Optional[AbstractFileSystem] = None,
  **kwargs: Any,
  ) -> None:
  assert_or_throw(
@@ -48,16 +47,13 @@ def save_df(
  )
  p = FileParser(uri, format_hint).assert_no_glob()
  if fs is None:
- fs = FileSystem()
+ fs, _ = url_to_fs(uri)
  if fs.exists(uri):
  assert_or_throw(mode == "overwrite", FileExistsError(uri))
  try:
- fs.remove(uri)
- except Exception:
- try:
- fs.removetree(uri)
- except Exception: # pragma: no cover
- pass
+ fs.rm(uri, recursive=True)
+ except Exception: # pragma: no cover
+ pass
  _FORMAT_SAVE[p.file_format](df, p, **kwargs)


@@ -67,7 +63,7 @@ def _save_parquet(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
  "write_index": False,
  **kwargs,
  }
- DASK_UTILS.to_parquet_friendly(df.native).to_parquet(p.uri, **params)
+ DASK_UTILS.to_parquet_friendly(df.native).to_parquet(p.path, **params)


  def _load_parquet(
@@ -80,27 +76,26 @@ def _load_parquet(
  if pd.__version__ >= "1.5":
  dtype_backend = "pyarrow"
  if columns is None:
- pdf = dd.read_parquet(p.uri, dtype_backend=dtype_backend, **params)
+ pdf = dd.read_parquet(p.path, dtype_backend=dtype_backend, **params)
  schema = Schema(pdf.head(1))
  return pdf, schema
  if isinstance(columns, list): # column names
  pdf = dd.read_parquet(
- p.uri, columns=columns, dtype_backend=dtype_backend, **params
+ p.path, columns=columns, dtype_backend=dtype_backend, **params
  )
  schema = Schema(pdf.head(1))
  return pdf, schema
  schema = Schema(columns)
  pdf = dd.read_parquet(
- p.uri, columns=schema.names, dtype_backend=dtype_backend, **params
+ p.path, columns=schema.names, dtype_backend=dtype_backend, **params
  )
  return pdf, schema


  def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
- fs, path = fsspec.core.url_to_fs(p.uri)
- fs.makedirs(path, exist_ok=True)
+ makedirs(p.path, exist_ok=True)
  df.native.to_csv(
- pfs.path.combine(p.uri, "*.csv"), **{"index": False, "header": False, **kwargs}
+ p.join("*.csv").path, **{"index": False, "header": False, **kwargs}
  )


@@ -108,7 +103,7 @@ def _safe_load_csv(path: str, **kwargs: Any) -> dd.DataFrame:
  try:
  return dd.read_csv(path, **kwargs)
  except (IsADirectoryError, PermissionError):
- return dd.read_csv(pfs.path.combine(path, "*.csv"), **kwargs)
+ return dd.read_csv(join(path, "*.csv"), **kwargs)


  def _load_csv( # noqa: C901
@@ -127,7 +122,7 @@ def _load_csv( # noqa: C901
  header = kw["header"]
  del kw["header"]
  if str(header) in ["True", "0"]:
- pdf = _safe_load_csv(p.uri, **{"header": 0, **kw})
+ pdf = _safe_load_csv(p.path, **{"header": 0, **kw})
  if columns is None:
  return pdf, None
  if isinstance(columns, list): # column names
@@ -138,34 +133,32 @@ def _load_csv( # noqa: C901
  if columns is None:
  raise ValueError("columns must be set if without header")
  if isinstance(columns, list): # column names
- pdf = _safe_load_csv(p.uri, **{"header": None, "names": columns, **kw})
+ pdf = _safe_load_csv(p.path, **{"header": None, "names": columns, **kw})
  return pdf, None
  schema = Schema(columns)
- pdf = _safe_load_csv(p.uri, **{"header": None, "names": schema.names, **kw})
+ pdf = _safe_load_csv(p.path, **{"header": None, "names": schema.names, **kw})
  return pdf, schema
  else:
  raise NotImplementedError(f"{header} is not supported")


  def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
- fs, path = fsspec.core.url_to_fs(p.uri)
- fs.makedirs(path, exist_ok=True)
- df.native.to_json(pfs.path.combine(p.uri, "*.json"), **kwargs)
+ makedirs(p.path, exist_ok=True)
+ df.native.to_json(p.join("*.json").path, **kwargs)


  def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
  try:
  return dd.read_json(path, **kwargs)
  except (IsADirectoryError, PermissionError):
- x = dd.read_json(pfs.path.combine(path, "*.json"), **kwargs)
- print(x.compute())
+ x = dd.read_json(join(path, "*.json"), **kwargs)
  return x


  def _load_json(
  p: FileParser, columns: Any = None, **kwargs: Any
  ) -> Tuple[dd.DataFrame, Any]:
- pdf = _safe_load_json(p.uri, **kwargs).reset_index(drop=True)
+ pdf = _safe_load_json(p.path, **kwargs).reset_index(drop=True)
  if columns is None:
  return pdf, None
  if isinstance(columns, list): # column names
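The changes above swap the pyfilesystem2-based `FileSystem` for fsspec: the target filesystem is now resolved from the URI itself, and a single recursive `rm` replaces the old remove/removetree fallback. Below is a minimal standalone sketch of that pattern using fsspec directly; `overwrite_target` is an illustrative helper, not part of fugue.

```python
import fsspec

def overwrite_target(uri: str, mode: str = "overwrite") -> None:
    # Resolve the filesystem implementation from the URI scheme (local, s3, memory, ...)
    fs, path = fsspec.core.url_to_fs(uri)
    if fs.exists(path):
        if mode != "overwrite":
            raise FileExistsError(uri)
        try:
            # One recursive rm covers both single files and directories
            fs.rm(path, recursive=True)
        except Exception:  # best-effort cleanup, mirroring the diff
            pass

overwrite_target("memory://tmp/output.parquet")
```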
fugue_dask/_utils.py CHANGED
@@ -1,13 +1,14 @@
  import math
- from typing import Any, List, Optional, Tuple
+ from typing import Any, Callable, List, Optional, Tuple, TypeVar

  import dask.dataframe as dd
  import numpy as np
  import pandas as pd
  import pyarrow as pa
  from dask.dataframe.core import DataFrame
+ from dask.delayed import delayed
  from dask.distributed import Client, get_client
- from triad.utils.pandas_like import PandasLikeUtils, PD_UTILS
+ from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
  from triad.utils.pyarrow import to_pandas_dtype

  import fugue.api as fa
@@ -16,6 +17,7 @@ from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS
  from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS

  _FUGUE_DASK_TEMP_IDX_COLUMN = "_fugue_dask_temp_index"
+ T = TypeVar("T")


  def get_default_partitions() -> int:
@@ -28,6 +30,17 @@ def get_default_partitions() -> int:
  return n if n > 0 else fa.get_current_parallelism() * 2


+ def collect(df: dd.DataFrame, func: Callable[[pd.DataFrame], T]) -> Tuple[T]:
+ """Compute each partition in parallel and collect the results
+
+ :param df: dask dataframe
+ :return: the collected result
+ """
+ dfs = df.to_delayed()
+ objs = [delayed(func)(df) for df in dfs]
+ return dd.compute(*objs)
+
+
  def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFrame:
  """Repartition the dataframe by hashing the given columns

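The new `collect` helper turns each partition into a delayed task and computes them together. The sketch below shows the same pattern using only public dask APIs; `collect_per_partition` and the toy dataframe are illustrative, not part of fugue.

```python
import dask.dataframe as dd
import pandas as pd
from dask.delayed import delayed

def collect_per_partition(df: dd.DataFrame, func):
    parts = df.to_delayed()                    # one delayed object per partition
    tasks = [delayed(func)(p) for p in parts]  # apply func lazily to each partition
    return dd.compute(*tasks)                  # evaluate all partitions in parallel

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=3)
print(collect_per_partition(ddf, len))  # one row count per partition, e.g. (4, 3, 3)
```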
fugue_dask/dataframe.py CHANGED
@@ -3,20 +3,21 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple
  import dask.dataframe as dd
  import pandas as pd
  import pyarrow as pa
+ from triad import assert_or_throw
  from triad.collections.schema import Schema
  from triad.utils.assertion import assert_arg_not_none
  from triad.utils.pandas_like import PD_UTILS
  from triad.utils.pyarrow import cast_pa_table

- from fugue.dataframe import (
- ArrowDataFrame,
- DataFrame,
- LocalBoundedDataFrame,
- PandasDataFrame,
- )
+ from fugue.dataframe import DataFrame, LocalBoundedDataFrame, PandasDataFrame
  from fugue.dataframe.dataframe import _input_schema
+ from fugue.dataframe.pandas_dataframe import _pd_as_dicts
  from fugue.exceptions import FugueDataFrameOperationError
  from fugue.plugins import (
+ as_array,
+ as_array_iterable,
+ as_dict_iterable,
+ as_dicts,
  as_local_bounded,
  count,
  drop_columns,
@@ -32,7 +33,7 @@ from fugue.plugins import (
  )

  from ._constants import FUGUE_DASK_USE_ARROW
- from ._utils import DASK_UTILS, get_default_partitions
+ from ._utils import DASK_UTILS, collect, get_default_partitions


  class DaskDataFrame(DataFrame):
@@ -150,8 +151,16 @@ class DaskDataFrame(DataFrame):
  )

  def as_arrow(self, type_safe: bool = False) -> pa.Table:
- adf = pa.Table.from_pandas(self.native.compute().reset_index(drop=True))
- return cast_pa_table(adf, self.schema.pa_schema)
+ schema = self.schema.pa_schema
+ return pa.concat_tables(
+ collect(
+ self.native,
+ lambda df: cast_pa_table(
+ pa.Table.from_pandas(df.reset_index(drop=True), schema=schema),
+ schema=schema,
+ ),
+ )
+ )

  def rename(self, columns: Dict[str, str]) -> DataFrame:
  try:
@@ -170,17 +179,28 @@ class DaskDataFrame(DataFrame):
  def as_array(
  self, columns: Optional[List[str]] = None, type_safe: bool = False
  ) -> List[Any]:
- df: DataFrame = self
- if columns is not None:
- df = df[columns]
- return ArrowDataFrame(df.as_pandas(), schema=df.schema).as_array(
- type_safe=type_safe
- )
+ chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+ res: List[List[Any]] = []
+ for x in chunks:
+ res += x
+ return res

  def as_array_iterable(
  self, columns: Optional[List[str]] = None, type_safe: bool = False
  ) -> Iterable[Any]:
- yield from self.as_array(columns=columns, type_safe=type_safe)
+ chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+ for x in chunks:
+ yield from x
+
+ def as_dicts(
+ self, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> List[Dict[str, Any]]:
+ return _dd_as_dicts(self.native, columns)
+
+ def as_dict_iterable(
+ self, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> Iterable[Dict[str, Any]]:
+ yield from _dd_as_dict_iterable(self.native, columns)

  def head(
  self, n: int, columns: Optional[List[str]] = None
@@ -197,8 +217,11 @@ class DaskDataFrame(DataFrame):
  assert_arg_not_none(schema, "schema")
  return pdf, schema
  DASK_UTILS.ensure_compatible(pdf)
- pschema = Schema(DASK_UTILS.to_schema(pdf))
- if schema is None or pschema == schema:
+ # when pdf contains bytes, or any object types, and schema contains str
+ # there is no way to get the real schema of the pdf, (pschema will contain
+ # strs instead of the real types) so we have to force cast it to the schema
+ if schema is None:
+ pschema = Schema(DASK_UTILS.to_schema(pdf))
  return pdf, pschema.assert_not_empty()
  pdf = pdf[schema.assert_not_empty().names]
  return (
@@ -295,6 +318,48 @@ def _dd_head(
  return PandasDataFrame(res) if as_fugue else res


+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+ def _dd_as_array(
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> List[Any]:
+ chunks = _to_array_chunks(df, columns, type_safe)
+ res: List[List[Any]] = []
+ for x in chunks:
+ res += x
+ return res
+
+
+ @as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+ def _dd_as_array_iterable(
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> Iterable[Any]:
+ chunks = _to_array_chunks(df, columns, type_safe)
+ for x in chunks:
+ yield from x
+
+
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+ def _dd_as_dicts(
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> List[Dict[str, Any]]:
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+ _df = df if columns is None or len(columns) == 0 else df[columns]
+ res: List[Dict[str, Any]] = []
+ for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+ res += x
+ return res
+
+
+ @as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+ def _dd_as_dict_iterable(
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> Iterable[Dict[str, Any]]:
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+ _df = df if columns is None or len(columns) == 0 else df[columns]
+ for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+ yield from x
+
+
  def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
  missing = set(columns) - set(df.columns)
  if len(missing) > 0:
@@ -303,3 +368,25 @@ def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:

  def _adjust_df(res: dd.DataFrame, as_fugue: bool):
  return res if not as_fugue else DaskDataFrame(res)
+
+
+ def _to_array_chunks(
+ df: dd.DataFrame,
+ columns: Optional[List[str]] = None,
+ type_safe: bool = False,
+ schema: Optional[Schema] = None,
+ ) -> Tuple[List[Any]]:
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+ _df = df if columns is None or len(columns) == 0 else df[columns]
+
+ def _to_list(pdf: pd.DataFrame) -> List[Any]:
+ return list(
+ PD_UTILS.as_array_iterable(
+ pdf,
+ schema=None if schema is None else schema.pa_schema,
+ columns=columns,
+ type_safe=type_safe,
+ )
+ )
+
+ return collect(_df, _to_list)
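`as_arrow`, `as_array` and the new dict conversions above all avoid a single whole-frame `compute()`: each partition is converted locally and the pieces are stitched back together. A self-contained sketch of that idea for Arrow conversion follows; the function name is illustrative and the schema is assumed known for the toy data.

```python
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa
from dask.delayed import delayed

def dask_to_arrow(ddf: dd.DataFrame, schema: pa.Schema) -> pa.Table:
    # Convert each pandas partition to an Arrow table with a shared schema
    to_table = lambda pdf: pa.Table.from_pandas(pdf, schema=schema, preserve_index=False)
    parts = [delayed(to_table)(p) for p in ddf.to_delayed()]  # one task per partition
    return pa.concat_tables(dd.compute(*parts))  # identical schemas, so concat is safe

pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
ddf = dd.from_pandas(pdf, npartitions=2)
tbl = dask_to_arrow(ddf, pa.schema([("a", pa.int64()), ("b", pa.string())]))
print(tbl.num_rows)  # 3
```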
fugue_dask/execution_engine.py CHANGED
@@ -7,18 +7,17 @@ import pandas as pd
  from distributed import Client
  from triad.collections import Schema
  from triad.collections.dict import IndexedOrderedDict, ParamDict
- from triad.collections.fs import FileSystem
  from triad.utils.assertion import assert_or_throw
  from triad.utils.hash import to_uuid
  from triad.utils.pandas_like import PandasUtils
  from triad.utils.threading import RunOnce
+ from triad.utils.io import makedirs
  from fugue import StructuredRawSQL
  from fugue.collections.partition import (
  PartitionCursor,
  PartitionSpec,
  parse_presort_exp,
  )
- from fugue.exceptions import FugueBug
  from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
  from fugue.dataframe import (
  AnyDataFrame,
@@ -28,6 +27,7 @@ from fugue.dataframe import (
  PandasDataFrame,
  )
  from fugue.dataframe.utils import get_join_schemas
+ from fugue.exceptions import FugueBug
  from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine
  from fugue.execution.native_execution_engine import NativeExecutionEngine
  from fugue_dask._constants import FUGUE_DASK_DEFAULT_CONF
@@ -206,7 +206,6 @@ class DaskExecutionEngine(ExecutionEngine):
  p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
  p.update(ParamDict(conf))
  super().__init__(p)
- self._fs = FileSystem()
  self._log = logging.getLogger()
  self._client = DASK_UTILS.get_or_create_client(dask_client)
  self._native = NativeExecutionEngine(conf=conf)
@@ -227,10 +226,6 @@ class DaskExecutionEngine(ExecutionEngine):
  def log(self) -> logging.Logger:
  return self._log

- @property
- def fs(self) -> FileSystem:
- return self._fs
-
  def create_default_sql_engine(self) -> SQLEngine:
  return DaskSQLEngine(self)

@@ -527,9 +522,7 @@ class DaskExecutionEngine(ExecutionEngine):
  **kwargs: Any,
  ) -> DaskDataFrame:
  return self.to_df(
- load_df(
- path, format_hint=format_hint, columns=columns, fs=self.fs, **kwargs
- )
+ load_df(path, format_hint=format_hint, columns=columns, **kwargs)
  )

  def save_df(
@@ -556,9 +549,9 @@ class DaskExecutionEngine(ExecutionEngine):
  else:
  if not partition_spec.empty:
  kwargs["partition_on"] = partition_spec.partition_by
- self.fs.makedirs(os.path.dirname(path), recreate=True)
+ makedirs(os.path.dirname(path), exist_ok=True)
  df = self.to_df(df)
- save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
+ save_df(df, path, format_hint=format_hint, mode=mode, **kwargs)


  def to_dask_engine_df(df: Any, schema: Any = None) -> DaskDataFrame:
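With the `fs` property removed from `DaskExecutionEngine`, the module-level `load_df`/`save_df` shown earlier are called without a filesystem object and resolve paths through fsspec. A hedged usage sketch against those private helpers, assuming a parquet file already exists at the input path:

```python
from fugue_dask._io import load_df, save_df  # private helpers from the diff above

df = load_df("/tmp/fugue_demo/in.parquet", format_hint="parquet")  # fs inferred from the path
save_df(df, "/tmp/fugue_demo/out.parquet", format_hint="parquet", mode="overwrite")
```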
fugue_duckdb/_io.py CHANGED
@@ -3,9 +3,9 @@ from typing import Any, Iterable, List, Optional, Union

  from duckdb import DuckDBPyConnection
  from triad import ParamDict, Schema
- from triad.collections.fs import FileSystem
- from triad.utils.assertion import assert_or_throw

+ from triad.utils.assertion import assert_or_throw
+ from triad.utils.io import isdir, makedirs, rm, exists
  from fugue._utils.io import FileParser, load_df, save_df
  from fugue.collections.sql import TempTableName
  from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame
@@ -18,26 +18,17 @@ from fugue_duckdb._utils import (
  from fugue_duckdb.dataframe import DuckDataFrame


- def _get_single_files(
- fp: Iterable[FileParser], fs: FileSystem, fmt: str
- ) -> Iterable[FileParser]:
- def _isdir(d: str) -> bool:
- try:
- return fs.isdir(d)
- except Exception: # pragma: no cover
- return False
-
+ def _get_files(fp: Iterable[FileParser], fmt: str) -> Iterable[FileParser]:
  for f in fp:
- if f.glob_pattern == "" and _isdir(f.uri):
- yield f.with_glob("*." + fmt, fmt)
+ if not f.has_glob and isdir(f.path):
+ yield from f.join("*." + fmt, fmt).find_all()
  else:
  yield f


  class DuckDBIO:
- def __init__(self, fs: FileSystem, con: DuckDBPyConnection) -> None:
+ def __init__(self, con: DuckDBPyConnection) -> None:
  self._con = con
- self._fs = fs
  self._format_load = {"csv": self._load_csv, "parquet": self._load_parquet}
  self._format_save = {"csv": self._save_csv, "parquet": self._save_parquet}

@@ -55,11 +46,9 @@ class DuckDBIO:
  else:
  fp = [FileParser(u, format_hint) for u in uri]
  if fp[0].file_format not in self._format_load:
- return load_df(
- uri, format_hint=format_hint, columns=columns, fs=self._fs, **kwargs
- )
+ return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
  dfs: List[DuckDataFrame] = []
- for f in _get_single_files(fp, self._fs, fp[0].file_format):
+ for f in _get_files(fp, fp[0].file_format):
  df = self._format_load[f.file_format](f, columns, **kwargs)
  dfs.append(df)
  rel = dfs[0].native
@@ -83,26 +72,20 @@ class DuckDBIO:
  )
  p = FileParser(uri, format_hint).assert_no_glob()
  if (p.file_format not in self._format_save) or ("partition_cols" in kwargs):
- self._fs.makedirs(os.path.dirname(uri), recreate=True)
+ makedirs(os.path.dirname(uri), exist_ok=True)
  ldf = ArrowDataFrame(df.as_arrow())
- return save_df(
- ldf, uri=uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs
- )
- fs = self._fs
- if fs.exists(uri):
+ return save_df(ldf, uri=uri, format_hint=format_hint, mode=mode, **kwargs)
+ if exists(uri):
  assert_or_throw(mode == "overwrite", FileExistsError(uri))
  try:
- fs.remove(uri)
- except Exception:
- try:
- fs.removetree(uri)
- except Exception: # pragma: no cover
- pass
- if not fs.exists(p.parent):
- fs.makedirs(p.parent, recreate=True)
+ rm(uri, recursive=True)
+ except Exception: # pragma: no cover
+ pass
+ p.make_parent_dirs()
  self._format_save[p.file_format](df, p, **kwargs)

  def _save_csv(self, df: DuckDataFrame, p: FileParser, **kwargs: Any):
+ p.assert_no_glob()
  dn = TempTableName()
  df.native.create_view(dn.key)
  kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
@@ -111,7 +94,7 @@
  for k, v in kw.items():
  params.append(f"{k.upper()} " + encode_value_to_expr(v))
  pm = ", ".join(params)
- query = f"COPY {dn.key} TO {encode_value_to_expr(p.uri)} WITH ({pm})"
+ query = f"COPY {dn.key} TO {encode_value_to_expr(p.path)} WITH ({pm})"
  self._con.execute(query)

  def _load_csv( # noqa: C901
@@ -125,7 +108,7 @@
  ValueError("when csv has no header, columns must be specified"),
  )
  kw.pop("auto_detect", None)
- params: List[str] = [encode_value_to_expr(p.uri_with_glob)]
+ params: List[str] = [encode_value_to_expr(p.path)]
  kw["header"] = 1 if header else 0
  kw["auto_detect"] = 1 if infer_schema else 0
  if infer_schema:
@@ -188,6 +171,7 @@
  return DuckDataFrame(self._con.from_query(query))

  def _save_parquet(self, df: DuckDataFrame, p: FileParser, **kwargs: Any):
+ p.assert_no_glob()
  dn = TempTableName()
  df.native.create_view(dn.key)
  kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
@@ -196,7 +180,7 @@
  for k, v in kw.items():
  params.append(f"{k.upper()} " + encode_value_to_expr(v))
  pm = ", ".join(params)
- query = f"COPY {dn.key} TO {encode_value_to_expr(p.uri)}"
+ query = f"COPY {dn.key} TO {encode_value_to_expr(p.path)}"
  if len(params) > 0:
  query += f" WITH ({pm})"
  self._con.execute(query)
@@ -205,7 +189,7 @@
  self, p: FileParser, columns: Any = None, **kwargs: Any
  ) -> DuckDataFrame:
  kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
- params: List[str] = [encode_value_to_expr(p.uri_with_glob)]
+ params: List[str] = [encode_value_to_expr(p.path)]
  if isinstance(columns, list):
  cols = ", ".join(encode_column_names(columns))
  else:
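Both `_save_csv` and `_save_parquet` above build a DuckDB `COPY <view> TO <path> WITH (...)` statement from the keyword arguments. The snippet below shows the same statement shape against a throwaway in-memory connection; the table name and output path are examples only, not values from the diff.

```python
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE demo AS SELECT * FROM range(5) t(i)")
# Same shape as the generated query: COPY <name> TO '<path>' WITH (FORMAT PARQUET)
con.execute("COPY demo TO '/tmp/demo.parquet' WITH (FORMAT PARQUET)")
print(con.execute("SELECT COUNT(*) FROM '/tmp/demo.parquet'").fetchone())  # (5,)
```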