fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- fugue/__init__.py +9 -5
- fugue/_utils/interfaceless.py +1 -558
- fugue/_utils/io.py +2 -91
- fugue/_utils/registry.py +3 -2
- fugue/api.py +1 -0
- fugue/bag/bag.py +8 -4
- fugue/collections/__init__.py +0 -7
- fugue/collections/partition.py +21 -9
- fugue/constants.py +3 -1
- fugue/dataframe/__init__.py +7 -8
- fugue/dataframe/arrow_dataframe.py +1 -2
- fugue/dataframe/dataframe.py +17 -18
- fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
- fugue/dataframe/function_wrapper.py +432 -0
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/utils.py +11 -79
- fugue/dataset/api.py +0 -4
- fugue/dev.py +47 -0
- fugue/execution/__init__.py +1 -5
- fugue/execution/api.py +36 -14
- fugue/execution/execution_engine.py +30 -4
- fugue/execution/factory.py +0 -6
- fugue/execution/native_execution_engine.py +44 -67
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +4 -3
- fugue/extensions/_builtins/processors.py +3 -3
- fugue/extensions/creator/convert.py +5 -2
- fugue/extensions/outputter/convert.py +2 -2
- fugue/extensions/processor/convert.py +3 -2
- fugue/extensions/transformer/convert.py +22 -9
- fugue/extensions/transformer/transformer.py +15 -1
- fugue/plugins.py +2 -0
- fugue/registry.py +0 -39
- fugue/sql/_utils.py +1 -1
- fugue/workflow/_checkpoint.py +1 -1
- fugue/workflow/api.py +13 -13
- fugue/workflow/module.py +30 -37
- fugue/workflow/workflow.py +6 -0
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
- fugue_contrib/contrib.py +1 -0
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/dataframe.py +1 -2
- fugue_dask/execution_engine.py +45 -18
- fugue_dask/registry.py +8 -33
- fugue_duckdb/_io.py +8 -2
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +23 -19
- fugue_duckdb/execution_engine.py +19 -22
- fugue_duckdb/registry.py +11 -34
- fugue_ibis/dataframe.py +6 -10
- fugue_ibis/execution_engine.py +7 -1
- fugue_notebook/env.py +5 -10
- fugue_polars/__init__.py +2 -0
- fugue_polars/_utils.py +8 -0
- fugue_polars/polars_dataframe.py +234 -0
- fugue_polars/registry.py +86 -0
- fugue_ray/_constants.py +10 -1
- fugue_ray/_utils/dataframe.py +36 -9
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +16 -12
- fugue_ray/execution_engine.py +53 -32
- fugue_ray/registry.py +8 -32
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +26 -22
- fugue_spark/execution_engine.py +136 -54
- fugue_spark/registry.py +29 -78
- fugue_test/builtin_suite.py +36 -14
- fugue_test/dataframe_suite.py +9 -5
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/bag/test_array_bag.py +0 -9
- tests/fugue/collections/test_partition.py +10 -3
- tests/fugue/dataframe/test_function_wrapper.py +293 -0
- tests/fugue/dataframe/test_utils.py +2 -34
- tests/fugue/execution/test_factory.py +7 -9
- tests/fugue/execution/test_naive_execution_engine.py +35 -80
- tests/fugue/extensions/test_utils.py +12 -7
- tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
- tests/fugue/sql/test_workflow.py +1 -1
- tests/fugue/sql/test_workflow_parse.py +3 -5
- tests/fugue/utils/test_interfaceless.py +1 -325
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +48 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_execution_engine.py +16 -1
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_ibis/test_dataframe.py +6 -3
- tests/fugue_polars/__init__.py +0 -0
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_dataframe.py +82 -0
- tests/fugue_polars/test_transform.py +100 -0
- tests/fugue_ray/test_execution_engine.py +40 -4
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +50 -11
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue/_utils/register.py +0 -3
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue/_utils/io.py
CHANGED
```diff
@@ -5,13 +5,13 @@ from urllib.parse import urlparse
 
 import fs as pfs
 import pandas as pd
-from fs.errors import FileExpected
-from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
 from triad.collections.dict import ParamDict
 from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw
 
+from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
+
 
 class FileParser(object):
     def __init__(self, path: str, format_hint: Optional[str] = None):
@@ -271,111 +271,22 @@ def _load_json(
     return pdf[schema.names], schema
 
 
-def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
-    """Save pandas dataframe as avro.
-    If providing your own schema, the usage of schema argument is preferred
-
-    :param schema: Avro Schema determines dtypes saved
-    """
-    import pandavro as pdx
-
-    kw = ParamDict(kwargs)
-
-    # pandavro defaults
-    schema = None
-    append = False
-    times_as_micros = True
-
-    if "schema" in kw:
-        schema = kw["schema"]
-        del kw["schema"]
-
-    if "append" in kw:
-        append = kw["append"]  # default is overwrite (False) instead of append (True)
-        del kw["append"]
-
-    if "times_as_micros" in kw:
-        times_as_micros = kw["times_as_micros"]
-        del kw["times_as_micros"]
-
-    pdf = df.as_pandas()
-    pdx.to_avro(
-        p.uri, pdf, schema=schema, append=append, times_as_micros=times_as_micros, **kw
-    )
-
-
-def _load_avro(
-    p: FileParser, columns: Any = None, **kwargs: Any
-) -> Tuple[pd.DataFrame, Any]:
-    path = p.uri
-    try:
-        pdf = _load_single_avro(path, **kwargs)
-    except (IsADirectoryError, PermissionError, FileExpected):
-        fs = FileSystem()
-        pdf = pd.concat(
-            [
-                _load_single_avro(
-                    pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs
-                )
-                for x in fs.opendir(path).glob("*.avro")
-            ]
-        )
-
-    if columns is None:
-        return pdf, None
-    if isinstance(columns, list):  # column names
-        return pdf[columns], None
-
-    schema = Schema(columns)
-
-    # Return created DataFrame
-    return pdf[schema.names], schema
-
-
-def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
-    from fastavro import reader
-
-    kw = ParamDict(kwargs)
-    process_record = None
-    if "process_record" in kw:
-        process_record = kw["process_record"]
-        del kw["process_record"]
-
-    fs = FileSystem()
-    with fs.openbin(path) as fp:
-        # Configure Avro reader
-        avro_reader = reader(fp)
-        # Load records in memory
-        if process_record:
-            records = [process_record(r) for r in avro_reader]
-
-        else:
-            records = list(avro_reader)
-
-        # Populate pandas.DataFrame with records
-        return pd.DataFrame.from_records(records)
-
-
 _FORMAT_MAP: Dict[str, str] = {
     ".csv": "csv",
     ".csv.gz": "csv",
     ".parquet": "parquet",
     ".json": "json",
     ".json.gz": "json",
-    ".avro": "avro",
-    ".avro.gz": "avro",
 }
 
 _FORMAT_LOAD: Dict[str, Callable[..., Tuple[pd.DataFrame, Any]]] = {
     "csv": _load_csv,
     "parquet": _load_parquet,
     "json": _load_json,
-    "avro": _load_avro,
 }
 
 _FORMAT_SAVE: Dict[str, Callable] = {
     "csv": _save_csv,
     "parquet": _save_parquet,
     "json": _save_json,
-    "avro": _save_avro,
 }
```
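The net effect of this change is that built-in Avro support (via `pandavro`/`fastavro`) is dropped from the core IO utilities, leaving csv, parquet and json. For code that relied on it, a rough stand-in for the deleted `_load_single_avro` helper is sketched below using `fastavro` directly; this is not part of the fugue API, and `"data.avro"` is a hypothetical local file path.

```python
# A minimal sketch replicating the removed _load_single_avro behavior with
# fastavro directly; not a fugue API.
import pandas as pd
from fastavro import reader


def load_avro_to_pandas(path: str) -> pd.DataFrame:
    with open(path, "rb") as fp:
        records = list(reader(fp))  # materialize all Avro records in memory
    return pd.DataFrame.from_records(records)


# df = load_avro_to_pandas("data.avro")  # hypothetical usage
```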
fugue/_utils/registry.py
CHANGED
```diff
@@ -1,9 +1,10 @@
 from typing import Callable
+
 from triad import conditional_dispatcher
 from triad.utils.dispatcher import ConditionalDispatcher
 
-
+from ..constants import FUGUE_ENTRYPOINT
 
 
 def fugue_plugin(func: Callable) -> ConditionalDispatcher:
-    return conditional_dispatcher(entry_point=
+    return conditional_dispatcher(entry_point=FUGUE_ENTRYPOINT)(func)  # type: ignore
```
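`fugue_plugin` now takes the entry point name from the shared `FUGUE_ENTRYPOINT` constant instead of an inline value. The dispatch pattern it enables is the same one visible in the `dataframe.py` changes further down (`@as_local.candidate(...)`). A minimal sketch follows; `get_num_rows` and its pandas candidate are hypothetical examples, not fugue APIs.

```python
# A minimal sketch of the fugue_plugin dispatch pattern; get_num_rows and
# _pandas_num_rows are hypothetical names, not part of fugue.
from typing import Any

import pandas as pd

from fugue._utils.registry import fugue_plugin


@fugue_plugin
def get_num_rows(df: Any) -> int:
    # fallback when no registered candidate matches the input
    raise NotImplementedError(f"no plugin registered for {type(df)}")


@get_num_rows.candidate(lambda df: isinstance(df, pd.DataFrame))
def _pandas_num_rows(df: pd.DataFrame) -> int:
    return df.shape[0]


assert get_num_rows(pd.DataFrame({"a": [1, 2]})) == 2
```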
fugue/api.py
CHANGED
fugue/bag/bag.py
CHANGED
```diff
@@ -9,9 +9,13 @@ class Bag(Dataset):
     unordered objects.
     """
 
-    @abstractmethod
     def as_local(self) -> "LocalBag":  # pragma: no cover
         """Convert this bag to a :class:`.LocalBag`"""
+        return self.as_local_bounded()
+
+    @abstractmethod
+    def as_local_bounded(self) -> "LocalBoundedBag":  # pragma: no cover
+        """Convert this bag to a :class:`.LocalBoundedBag`"""
         raise NotImplementedError
 
     @abstractmethod
@@ -50,9 +54,6 @@ class LocalBag(Bag):
     def is_local(self) -> bool:
         return True
 
-    def as_local(self) -> "LocalBag":
-        return self
-
     @property
     def num_partitions(self) -> int:
         return 1
@@ -63,6 +64,9 @@ class LocalBoundedBag(LocalBag):
     def is_bounded(self) -> bool:
         return True
 
+    def as_local_bounded(self) -> "LocalBoundedBag":
+        return self
+
 
 class BagDisplay(DatasetDisplay):
     """:class:`~.Bag` plain display class"""
```
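The shape of this change: `as_local` stops being abstract and delegates to a new abstract `as_local_bounded`, so subclasses only implement the bounded conversion. A stand-alone sketch of the pattern, using stand-in classes rather than the real Bag API:

```python
# Stand-in classes illustrating the delegation introduced here; not the
# actual fugue Bag hierarchy.
from abc import ABC, abstractmethod


class _Bag(ABC):
    def as_local(self) -> "_Bag":
        return self.as_local_bounded()  # concrete default, as in this diff

    @abstractmethod
    def as_local_bounded(self) -> "_LocalBoundedBag":
        raise NotImplementedError


class _LocalBoundedBag(_Bag):
    def as_local_bounded(self) -> "_LocalBoundedBag":
        return self  # already local and bounded


assert isinstance(_LocalBoundedBag().as_local(), _LocalBoundedBag)
```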
fugue/collections/__init__.py
CHANGED
fugue/collections/partition.py
CHANGED
```diff
@@ -98,7 +98,7 @@ class PartitionSpec(object):
 
     Partition consists for these specs:
 
-    * **algo**: can be one of ``hash`` (default), ``rand``
+    * **algo**: can be one of ``hash`` (default), ``rand``, ``even`` or ``coarse``
     * **num** or **num_partitions**: number of physical partitions, it can be an
       expression or integer numbers, e.g ``(ROWCOUNT+4) / 3``
     * **by** or **partition_by**: keys to partition on
@@ -208,7 +208,9 @@ class PartitionSpec(object):
 
     @property
     def algo(self) -> str:
-        """Get algo of the spec, one of ``hash`` (default),
+        """Get algo of the spec, one of ``hash`` (default),
+        ``rand`` ``even`` or ``coarse``
+        """
         return self._algo if self._algo != "" else "hash"
 
     @property
@@ -258,11 +260,14 @@ class PartitionSpec(object):
         """Get deterministic unique id of this object"""
         return to_uuid(self.jsondict)
 
-    def get_sorts(
+    def get_sorts(
+        self, schema: Schema, with_partition_keys: bool = True
+    ) -> IndexedOrderedDict[str, bool]:
         """Get keys for sorting in a partition, it's the combination of partition
         keys plus the presort keys
 
         :param schema: the dataframe schema this partition spec to operate on
+        :param with_partition_keys: whether to include partition keys
         :return: an ordered dictionary of key, order pairs
 
         .. admonition:: Examples
@@ -272,9 +277,10 @@ class PartitionSpec(object):
             >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
         """
         d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
-        for p in self.partition_by:
-            aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
-            d[p] = True
+        if with_partition_keys:
+            for p in self.partition_by:
+                aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
+                d[p] = True
         for p, v in self.presort.items():
             aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
             d[p] = v
@@ -348,7 +354,7 @@ class DatasetPartitionCursor:
         """reset the cursor to a row (which should be the first row of a
         new logical partition)
 
-        :param item: an item of the dataset
+        :param item: an item of the dataset, or an function generating the item
         :param partition_no: logical partition number
         :param slice_no: slice number inside the logical partition (to be deprecated)
         """
@@ -359,6 +365,8 @@ class DatasetPartitionCursor:
     @property
     def item(self) -> Any:
        """Get current item"""
+        if callable(self._item):
+            self._item = self._item()
         return self._item
 
     @property
@@ -417,11 +425,15 @@ class PartitionCursor(DatasetPartitionCursor):
         """reset the cursor to a row (which should be the first row of a
         new logical partition)
 
-        :param row: list-like row data
+        :param row: list-like row data or a function generating a list-like row
         :param partition_no: logical partition number
         :param slice_no: slice number inside the logical partition (to be deprecated)
         """
-        super().set(
+        super().set(
+            list(row) if not callable(row) else lambda: list(row()),
+            partition_no=partition_no,
+            slice_no=slice_no,
+        )
 
     @property
     def row(self) -> List[Any]:
```
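Two user-visible changes here: the documented `algo` values now include `coarse`, and `get_sorts` gained a `with_partition_keys` flag. A usage sketch based on the docstring example above, assuming fugue 0.8.4 accepts these values as documented:

```python
# A minimal sketch based on the docstring in this diff; assumes 0.8.4 behavior.
from triad import Schema

from fugue.collections.partition import PartitionSpec

spec = PartitionSpec(algo="coarse", num=4, by=["a"], presort="b, c desc")
schema = Schema("a:int,b:int,c:int")

assert spec.algo == "coarse"
# partition keys sort ascending, then presort keys with their own order
assert spec.get_sorts(schema) == {"a": True, "b": True, "c": False}
# new in this diff: exclude the partition keys from the sort keys
assert spec.get_sorts(schema, with_partition_keys=False) == {"b": True, "c": False}
```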
fugue/constants.py
CHANGED
fugue/dataframe/__init__.py
CHANGED
```diff
@@ -9,14 +9,13 @@ from .dataframe import (
     LocalDataFrame,
     YieldedDataFrame,
 )
-from .dataframe_iterable_dataframe import
+from .dataframe_iterable_dataframe import (
+    IterableArrowDataFrame,
+    IterablePandasDataFrame,
+    LocalDataFrameIterableDataFrame,
+)
 from .dataframes import DataFrames
+from .function_wrapper import DataFrameFunctionWrapper, fugue_annotated_param
 from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
-from .utils import
-    get_column_names,
-    normalize_dataframe_column_names,
-    rename,
-    to_local_bounded_df,
-    to_local_df,
-)
+from .utils import get_column_names, normalize_dataframe_column_names, rename
```
fugue/dataframe/arrow_dataframe.py
CHANGED
```diff
@@ -49,7 +49,6 @@ class ArrowDataFrame(LocalBoundedDataFrame):
         self,
         df: Any = None,
         schema: Any = None,
-        pandas_df_wrapper: bool = False,
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
@@ -142,7 +141,7 @@ class ArrowDataFrame(LocalBoundedDataFrame):
         return self.native.shape[0]
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.native.to_pandas()
+        return self.native.to_pandas(use_threads=False, date_as_object=False)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
```
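The `as_pandas` change pins two options of pyarrow's `Table.to_pandas`: conversion runs single-threaded, and `date32` columns come back as datetime64 values instead of Python `date` objects. The `date_as_object` difference is easy to see with pyarrow alone:

```python
# Plain pyarrow illustration of the date_as_object flag (not fugue code).
import datetime

import pyarrow as pa

tbl = pa.table({"d": [datetime.date(2020, 1, 1)]})
assert tbl.to_pandas(date_as_object=True)["d"].dtype == object
assert tbl.to_pandas(date_as_object=False)["d"].dtype.kind == "M"  # datetime64
```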
fugue/dataframe/dataframe.py
CHANGED
```diff
@@ -85,9 +85,13 @@ class DataFrame(Dataset):
         """
         raise NotImplementedError
 
-    @abstractmethod
     def as_local(self) -> "LocalDataFrame":  # pragma: no cover
         """Convert this dataframe to a :class:`.LocalDataFrame`"""
+        return self.as_local_bounded()
+
+    @abstractmethod
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":  # pragma: no cover
+        """Convert this dataframe to a :class:`.LocalBoundedDataFrame`"""
         raise NotImplementedError
 
     @abstractmethod
@@ -317,10 +321,6 @@ class LocalDataFrame(DataFrame):
         """Always True because it's a LocalDataFrame"""
         return True
 
-    def as_local(self) -> "LocalDataFrame":
-        """Always return self, because it's a LocalDataFrame"""
-        return self
-
     @property
     def num_partitions(self) -> int:  # pragma: no cover
         """Always 1 because it's a LocalDataFrame"""
@@ -346,6 +346,10 @@ class LocalBoundedDataFrame(LocalDataFrame):
         """Always True because it's a bounded dataframe"""
         return True
 
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":
+        """Always True because it's a bounded dataframe"""
+        return self
+
 
 class LocalUnboundedDataFrame(LocalDataFrame):
     """Base class of all local unbounded dataframes. Read
@@ -367,6 +371,9 @@ class LocalUnboundedDataFrame(LocalDataFrame):
         """Always False because it's an unbounded dataframe"""
         return False
 
+    def as_local(self) -> "LocalDataFrame":
+        return self
+
     def count(self) -> int:
         """
         :raises InvalidOperationError: You can't count an unbounded dataframe
@@ -458,22 +465,14 @@ def _get_dataframe_display(ds: DataFrame):
     return DataFrameDisplay(ds)
 
 
-@as_local.candidate(lambda df: isinstance(df, DataFrame)
-def _df_to_local(df: DataFrame) ->
+@as_local.candidate(lambda df: isinstance(df, DataFrame))
+def _df_to_local(df: DataFrame) -> LocalDataFrame:
     return df.as_local()
 
 
-@as_local_bounded.candidate(
-
-
-)
-def _df_to_local_bounded(df: DataFrame) -> DataFrame:
-    res: DataFrame = df.as_local()
-    if not res.is_bounded:
-        res = as_fugue_df(res.as_array(), schema=df.schema)
-    if res is not df and df.has_metadata:
-        res.reset_metadata(df.metadata)
-    return res
+@as_local_bounded.candidate(lambda df: isinstance(df, DataFrame))
+def _df_to_local_bounded(df: DataFrame) -> LocalBoundedDataFrame:
+    return df.as_local_bounded()
 
 
 def _get_schema_change(
```
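As with `Bag`, `DataFrame.as_local` is now a concrete method routing through the abstract `as_local_bounded`, and the module-level dispatch candidates simply forward to those methods. Assuming the 0.8.4 semantics shown in this diff, the round trip on an already-local bounded frame is a no-op:

```python
# A minimal usage sketch assuming the 0.8.4 semantics from this diff.
import pandas as pd

from fugue.dataframe import PandasDataFrame

pdf = PandasDataFrame(pd.DataFrame({"a": [1, 2]}), schema="a:long")
assert pdf.as_local_bounded() is pdf  # LocalBoundedDataFrame returns itself
assert pdf.as_local() is pdf          # as_local delegates to as_local_bounded
```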
fugue/dataframe/dataframe_iterable_dataframe.py
CHANGED
```diff
@@ -2,16 +2,20 @@ from typing import Any, Dict, Iterable, List, Optional
 
 import pandas as pd
 import pyarrow as pa
-from
-from
+from triad import Schema, assert_or_throw
+from triad.utils.iter import EmptyAwareIterable, make_empty_aware
+
+from fugue.exceptions import FugueDataFrameInitError
+
+from .array_dataframe import ArrayDataFrame
+from .arrow_dataframe import ArrowDataFrame
+from .dataframe import (
     DataFrame,
+    LocalBoundedDataFrame,
     LocalDataFrame,
     LocalUnboundedDataFrame,
-    LocalBoundedDataFrame,
 )
-from
-from triad import Schema, assert_or_throw
-from triad.utils.iter import EmptyAwareIterable, make_empty_aware
+from .pandas_dataframe import PandasDataFrame
 
 
 class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
@@ -142,6 +146,9 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
 
         return LocalDataFrameIterableDataFrame(_transform())
 
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":
+        return ArrowDataFrame(self.as_arrow())
+
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
@@ -190,3 +197,12 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
             yield df._drop_cols(cols)
 
         return LocalDataFrameIterableDataFrame(_transform())
+
+
+class IterablePandasDataFrame(LocalDataFrameIterableDataFrame):
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":
+        return PandasDataFrame(self.as_pandas(), schema=self.schema)
+
+
+class IterableArrowDataFrame(LocalDataFrameIterableDataFrame):
+    pass
```