fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- fugue/__init__.py +9 -5
- fugue/_utils/interfaceless.py +1 -558
- fugue/_utils/io.py +2 -91
- fugue/_utils/registry.py +3 -2
- fugue/api.py +1 -0
- fugue/bag/bag.py +8 -4
- fugue/collections/__init__.py +0 -7
- fugue/collections/partition.py +21 -9
- fugue/constants.py +3 -1
- fugue/dataframe/__init__.py +7 -8
- fugue/dataframe/arrow_dataframe.py +1 -2
- fugue/dataframe/dataframe.py +17 -18
- fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
- fugue/dataframe/function_wrapper.py +432 -0
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/utils.py +11 -79
- fugue/dataset/api.py +0 -4
- fugue/dev.py +47 -0
- fugue/execution/__init__.py +1 -5
- fugue/execution/api.py +36 -14
- fugue/execution/execution_engine.py +30 -4
- fugue/execution/factory.py +0 -6
- fugue/execution/native_execution_engine.py +44 -67
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +4 -3
- fugue/extensions/_builtins/processors.py +3 -3
- fugue/extensions/creator/convert.py +5 -2
- fugue/extensions/outputter/convert.py +2 -2
- fugue/extensions/processor/convert.py +3 -2
- fugue/extensions/transformer/convert.py +22 -9
- fugue/extensions/transformer/transformer.py +15 -1
- fugue/plugins.py +2 -0
- fugue/registry.py +0 -39
- fugue/sql/_utils.py +1 -1
- fugue/workflow/_checkpoint.py +1 -1
- fugue/workflow/api.py +13 -13
- fugue/workflow/module.py +30 -37
- fugue/workflow/workflow.py +6 -0
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
- fugue_contrib/contrib.py +1 -0
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/dataframe.py +1 -2
- fugue_dask/execution_engine.py +45 -18
- fugue_dask/registry.py +8 -33
- fugue_duckdb/_io.py +8 -2
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +23 -19
- fugue_duckdb/execution_engine.py +19 -22
- fugue_duckdb/registry.py +11 -34
- fugue_ibis/dataframe.py +6 -10
- fugue_ibis/execution_engine.py +7 -1
- fugue_notebook/env.py +5 -10
- fugue_polars/__init__.py +2 -0
- fugue_polars/_utils.py +8 -0
- fugue_polars/polars_dataframe.py +234 -0
- fugue_polars/registry.py +86 -0
- fugue_ray/_constants.py +10 -1
- fugue_ray/_utils/dataframe.py +36 -9
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +16 -12
- fugue_ray/execution_engine.py +53 -32
- fugue_ray/registry.py +8 -32
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +26 -22
- fugue_spark/execution_engine.py +136 -54
- fugue_spark/registry.py +29 -78
- fugue_test/builtin_suite.py +36 -14
- fugue_test/dataframe_suite.py +9 -5
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/bag/test_array_bag.py +0 -9
- tests/fugue/collections/test_partition.py +10 -3
- tests/fugue/dataframe/test_function_wrapper.py +293 -0
- tests/fugue/dataframe/test_utils.py +2 -34
- tests/fugue/execution/test_factory.py +7 -9
- tests/fugue/execution/test_naive_execution_engine.py +35 -80
- tests/fugue/extensions/test_utils.py +12 -7
- tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
- tests/fugue/sql/test_workflow.py +1 -1
- tests/fugue/sql/test_workflow_parse.py +3 -5
- tests/fugue/utils/test_interfaceless.py +1 -325
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +48 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_execution_engine.py +16 -1
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_ibis/test_dataframe.py +6 -3
- tests/fugue_polars/__init__.py +0 -0
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_dataframe.py +82 -0
- tests/fugue_polars/test_transform.py +100 -0
- tests/fugue_ray/test_execution_engine.py +40 -4
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +50 -11
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue/_utils/register.py +0 -3
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue/_utils/io.py
CHANGED
```diff
@@ -5,13 +5,13 @@ from urllib.parse import urlparse
 
 import fs as pfs
 import pandas as pd
-from fs.errors import FileExpected
-from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
 from triad.collections.dict import ParamDict
 from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw
 
+from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
+
 
 class FileParser(object):
     def __init__(self, path: str, format_hint: Optional[str] = None):
@@ -271,111 +271,22 @@ def _load_json(
     return pdf[schema.names], schema
 
 
-def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
-    """Save pandas dataframe as avro.
-    If providing your own schema, the usage of schema argument is preferred
-
-    :param schema: Avro Schema determines dtypes saved
-    """
-    import pandavro as pdx
-
-    kw = ParamDict(kwargs)
-
-    # pandavro defaults
-    schema = None
-    append = False
-    times_as_micros = True
-
-    if "schema" in kw:
-        schema = kw["schema"]
-        del kw["schema"]
-
-    if "append" in kw:
-        append = kw["append"]  # default is overwrite (False) instead of append (True)
-        del kw["append"]
-
-    if "times_as_micros" in kw:
-        times_as_micros = kw["times_as_micros"]
-        del kw["times_as_micros"]
-
-    pdf = df.as_pandas()
-    pdx.to_avro(
-        p.uri, pdf, schema=schema, append=append, times_as_micros=times_as_micros, **kw
-    )
-
-
-def _load_avro(
-    p: FileParser, columns: Any = None, **kwargs: Any
-) -> Tuple[pd.DataFrame, Any]:
-    path = p.uri
-    try:
-        pdf = _load_single_avro(path, **kwargs)
-    except (IsADirectoryError, PermissionError, FileExpected):
-        fs = FileSystem()
-        pdf = pd.concat(
-            [
-                _load_single_avro(
-                    pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs
-                )
-                for x in fs.opendir(path).glob("*.avro")
-            ]
-        )
-
-    if columns is None:
-        return pdf, None
-    if isinstance(columns, list):  # column names
-        return pdf[columns], None
-
-    schema = Schema(columns)
-
-    # Return created DataFrame
-    return pdf[schema.names], schema
-
-
-def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
-    from fastavro import reader
-
-    kw = ParamDict(kwargs)
-    process_record = None
-    if "process_record" in kw:
-        process_record = kw["process_record"]
-        del kw["process_record"]
-
-    fs = FileSystem()
-    with fs.openbin(path) as fp:
-        # Configure Avro reader
-        avro_reader = reader(fp)
-        # Load records in memory
-        if process_record:
-            records = [process_record(r) for r in avro_reader]
-
-        else:
-            records = list(avro_reader)
-
-        # Populate pandas.DataFrame with records
-        return pd.DataFrame.from_records(records)
-
-
 _FORMAT_MAP: Dict[str, str] = {
     ".csv": "csv",
     ".csv.gz": "csv",
     ".parquet": "parquet",
     ".json": "json",
     ".json.gz": "json",
-    ".avro": "avro",
-    ".avro.gz": "avro",
 }
 
 _FORMAT_LOAD: Dict[str, Callable[..., Tuple[pd.DataFrame, Any]]] = {
     "csv": _load_csv,
     "parquet": _load_parquet,
     "json": _load_json,
-    "avro": _load_avro,
 }
 
 _FORMAT_SAVE: Dict[str, Callable] = {
     "csv": _save_csv,
     "parquet": _save_parquet,
     "json": _save_json,
-    "avro": _save_avro,
 }
```
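The net effect of this change is that built-in Avro support (via `pandavro`/`fastavro`) is dropped from the core IO utilities, leaving csv, parquet and json. For code that relied on it, a rough stand-in for the deleted `_load_single_avro` helper is sketched below using `fastavro` directly; this is not part of the fugue API, and `"data.avro"` is a hypothetical local file path.

```python
# A minimal sketch replicating the removed _load_single_avro behavior with
# fastavro directly; not a fugue API.
import pandas as pd
from fastavro import reader


def load_avro_to_pandas(path: str) -> pd.DataFrame:
    with open(path, "rb") as fp:
        records = list(reader(fp))  # materialize all Avro records in memory
    return pd.DataFrame.from_records(records)


# df = load_avro_to_pandas("data.avro")  # hypothetical usage
```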
fugue/_utils/registry.py
CHANGED
```diff
@@ -1,9 +1,10 @@
 from typing import Callable
+
 from triad import conditional_dispatcher
 from triad.utils.dispatcher import ConditionalDispatcher
 
-
+from ..constants import FUGUE_ENTRYPOINT
 
 
 def fugue_plugin(func: Callable) -> ConditionalDispatcher:
-    return conditional_dispatcher(entry_point=
+    return conditional_dispatcher(entry_point=FUGUE_ENTRYPOINT)(func)  # type: ignore
```
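`fugue_plugin` now takes the entry point name from the shared `FUGUE_ENTRYPOINT` constant instead of an inline value. The dispatch pattern it enables is the same one visible in the `dataframe.py` changes further down (`@as_local.candidate(...)`). A minimal sketch follows; `get_num_rows` and its pandas candidate are hypothetical examples, not fugue APIs.

```python
# A minimal sketch of the fugue_plugin dispatch pattern; get_num_rows and
# _pandas_num_rows are hypothetical names, not part of fugue.
from typing import Any

import pandas as pd

from fugue._utils.registry import fugue_plugin


@fugue_plugin
def get_num_rows(df: Any) -> int:
    # fallback when no registered candidate matches the input
    raise NotImplementedError(f"no plugin registered for {type(df)}")


@get_num_rows.candidate(lambda df: isinstance(df, pd.DataFrame))
def _pandas_num_rows(df: pd.DataFrame) -> int:
    return df.shape[0]


assert get_num_rows(pd.DataFrame({"a": [1, 2]})) == 2
```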
fugue/api.py
CHANGED
fugue/bag/bag.py
CHANGED
```diff
@@ -9,9 +9,13 @@ class Bag(Dataset):
     unordered objects.
     """
 
-    @abstractmethod
     def as_local(self) -> "LocalBag":  # pragma: no cover
         """Convert this bag to a :class:`.LocalBag`"""
+        return self.as_local_bounded()
+
+    @abstractmethod
+    def as_local_bounded(self) -> "LocalBoundedBag":  # pragma: no cover
+        """Convert this bag to a :class:`.LocalBoundedBag`"""
         raise NotImplementedError
 
     @abstractmethod
@@ -50,9 +54,6 @@ class LocalBag(Bag):
     def is_local(self) -> bool:
         return True
 
-    def as_local(self) -> "LocalBag":
-        return self
-
     @property
     def num_partitions(self) -> int:
         return 1
@@ -63,6 +64,9 @@ class LocalBoundedBag(LocalBag):
     def is_bounded(self) -> bool:
         return True
 
+    def as_local_bounded(self) -> "LocalBoundedBag":
+        return self
+
 
 class BagDisplay(DatasetDisplay):
     """:class:`~.Bag` plain display class"""
```
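The shape of this change: `as_local` stops being abstract and delegates to a new abstract `as_local_bounded`, so subclasses only implement the bounded conversion. A stand-alone sketch of the pattern, using stand-in classes rather than the real Bag API:

```python
# Stand-in classes illustrating the delegation introduced here; not the
# actual fugue Bag hierarchy.
from abc import ABC, abstractmethod


class _Bag(ABC):
    def as_local(self) -> "_Bag":
        return self.as_local_bounded()  # concrete default, as in this diff

    @abstractmethod
    def as_local_bounded(self) -> "_LocalBoundedBag":
        raise NotImplementedError


class _LocalBoundedBag(_Bag):
    def as_local_bounded(self) -> "_LocalBoundedBag":
        return self  # already local and bounded


assert isinstance(_LocalBoundedBag().as_local(), _LocalBoundedBag)
```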
fugue/collections/__init__.py
CHANGED
fugue/collections/partition.py
CHANGED
```diff
@@ -98,7 +98,7 @@ class PartitionSpec(object):
 
     Partition consists for these specs:
 
-    * **algo**: can be one of ``hash`` (default), ``rand``
+    * **algo**: can be one of ``hash`` (default), ``rand``, ``even`` or ``coarse``
     * **num** or **num_partitions**: number of physical partitions, it can be an
       expression or integer numbers, e.g ``(ROWCOUNT+4) / 3``
     * **by** or **partition_by**: keys to partition on
@@ -208,7 +208,9 @@ class PartitionSpec(object):
 
     @property
     def algo(self) -> str:
-        """Get algo of the spec, one of ``hash`` (default),
+        """Get algo of the spec, one of ``hash`` (default),
+        ``rand`` ``even`` or ``coarse``
+        """
         return self._algo if self._algo != "" else "hash"
 
     @property
@@ -258,11 +260,14 @@ class PartitionSpec(object):
         """Get deterministic unique id of this object"""
         return to_uuid(self.jsondict)
 
-    def get_sorts(
+    def get_sorts(
+        self, schema: Schema, with_partition_keys: bool = True
+    ) -> IndexedOrderedDict[str, bool]:
         """Get keys for sorting in a partition, it's the combination of partition
         keys plus the presort keys
 
         :param schema: the dataframe schema this partition spec to operate on
+        :param with_partition_keys: whether to include partition keys
         :return: an ordered dictionary of key, order pairs
 
         .. admonition:: Examples
@@ -272,9 +277,10 @@ class PartitionSpec(object):
             >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
         """
         d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
-        for p in self.partition_by:
-            aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
-            d[p] = True
+        if with_partition_keys:
+            for p in self.partition_by:
+                aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
+                d[p] = True
         for p, v in self.presort.items():
             aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
             d[p] = v
@@ -348,7 +354,7 @@ class DatasetPartitionCursor:
         """reset the cursor to a row (which should be the first row of a
         new logical partition)
 
-        :param item: an item of the dataset
+        :param item: an item of the dataset, or an function generating the item
         :param partition_no: logical partition number
         :param slice_no: slice number inside the logical partition (to be deprecated)
         """
@@ -359,6 +365,8 @@ class DatasetPartitionCursor:
     @property
     def item(self) -> Any:
        """Get current item"""
+        if callable(self._item):
+            self._item = self._item()
         return self._item
 
     @property
@@ -417,11 +425,15 @@ class PartitionCursor(DatasetPartitionCursor):
         """reset the cursor to a row (which should be the first row of a
         new logical partition)
 
-        :param row: list-like row data
+        :param row: list-like row data or a function generating a list-like row
         :param partition_no: logical partition number
         :param slice_no: slice number inside the logical partition (to be deprecated)
         """
-        super().set(
+        super().set(
+            list(row) if not callable(row) else lambda: list(row()),
+            partition_no=partition_no,
+            slice_no=slice_no,
+        )
 
     @property
     def row(self) -> List[Any]:
```
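Two user-visible changes here: the documented `algo` values now include `coarse`, and `get_sorts` gained a `with_partition_keys` flag. A usage sketch based on the docstring example above, assuming fugue 0.8.4 accepts these values as documented:

```python
# A minimal sketch based on the docstring in this diff; assumes 0.8.4 behavior.
from triad import Schema

from fugue.collections.partition import PartitionSpec

spec = PartitionSpec(algo="coarse", num=4, by=["a"], presort="b, c desc")
schema = Schema("a:int,b:int,c:int")

assert spec.algo == "coarse"
# partition keys sort ascending, then presort keys with their own order
assert spec.get_sorts(schema) == {"a": True, "b": True, "c": False}
# new in this diff: exclude the partition keys from the sort keys
assert spec.get_sorts(schema, with_partition_keys=False) == {"b": True, "c": False}
```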
fugue/constants.py
CHANGED
fugue/dataframe/__init__.py
CHANGED
```diff
@@ -9,14 +9,13 @@ from .dataframe import (
     LocalDataFrame,
     YieldedDataFrame,
 )
-from .dataframe_iterable_dataframe import
+from .dataframe_iterable_dataframe import (
+    IterableArrowDataFrame,
+    IterablePandasDataFrame,
+    LocalDataFrameIterableDataFrame,
+)
 from .dataframes import DataFrames
+from .function_wrapper import DataFrameFunctionWrapper, fugue_annotated_param
 from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
-from .utils import
-    get_column_names,
-    normalize_dataframe_column_names,
-    rename,
-    to_local_bounded_df,
-    to_local_df,
-)
+from .utils import get_column_names, normalize_dataframe_column_names, rename
```
fugue/dataframe/arrow_dataframe.py
CHANGED
```diff
@@ -49,7 +49,6 @@ class ArrowDataFrame(LocalBoundedDataFrame):
         self,
         df: Any = None,
         schema: Any = None,
-        pandas_df_wrapper: bool = False,
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
@@ -142,7 +141,7 @@ class ArrowDataFrame(LocalBoundedDataFrame):
         return self.native.shape[0]
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.native.to_pandas()
+        return self.native.to_pandas(use_threads=False, date_as_object=False)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
```
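The `as_pandas` change pins two options of pyarrow's `Table.to_pandas`: conversion runs single-threaded, and `date32` columns come back as datetime64 values instead of Python `date` objects. The `date_as_object` difference is easy to see with pyarrow alone:

```python
# Plain pyarrow illustration of the date_as_object flag (not fugue code).
import datetime

import pyarrow as pa

tbl = pa.table({"d": [datetime.date(2020, 1, 1)]})
assert tbl.to_pandas(date_as_object=True)["d"].dtype == object
assert tbl.to_pandas(date_as_object=False)["d"].dtype.kind == "M"  # datetime64
```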
fugue/dataframe/dataframe.py
CHANGED
```diff
@@ -85,9 +85,13 @@ class DataFrame(Dataset):
         """
         raise NotImplementedError
 
-    @abstractmethod
     def as_local(self) -> "LocalDataFrame":  # pragma: no cover
         """Convert this dataframe to a :class:`.LocalDataFrame`"""
+        return self.as_local_bounded()
+
+    @abstractmethod
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":  # pragma: no cover
+        """Convert this dataframe to a :class:`.LocalBoundedDataFrame`"""
         raise NotImplementedError
 
     @abstractmethod
@@ -317,10 +321,6 @@ class LocalDataFrame(DataFrame):
         """Always True because it's a LocalDataFrame"""
         return True
 
-    def as_local(self) -> "LocalDataFrame":
-        """Always return self, because it's a LocalDataFrame"""
-        return self
-
     @property
     def num_partitions(self) -> int:  # pragma: no cover
         """Always 1 because it's a LocalDataFrame"""
@@ -346,6 +346,10 @@ class LocalBoundedDataFrame(LocalDataFrame):
         """Always True because it's a bounded dataframe"""
         return True
 
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":
+        """Always True because it's a bounded dataframe"""
+        return self
+
 
 class LocalUnboundedDataFrame(LocalDataFrame):
     """Base class of all local unbounded dataframes. Read
@@ -367,6 +371,9 @@ class LocalUnboundedDataFrame(LocalDataFrame):
         """Always False because it's an unbounded dataframe"""
         return False
 
+    def as_local(self) -> "LocalDataFrame":
+        return self
+
     def count(self) -> int:
         """
         :raises InvalidOperationError: You can't count an unbounded dataframe
@@ -458,22 +465,14 @@ def _get_dataframe_display(ds: DataFrame):
     return DataFrameDisplay(ds)
 
 
-@as_local.candidate(lambda df: isinstance(df, DataFrame)
-def _df_to_local(df: DataFrame) ->
+@as_local.candidate(lambda df: isinstance(df, DataFrame))
+def _df_to_local(df: DataFrame) -> LocalDataFrame:
     return df.as_local()
 
 
-@as_local_bounded.candidate(
-
-
-)
-def _df_to_local_bounded(df: DataFrame) -> DataFrame:
-    res: DataFrame = df.as_local()
-    if not res.is_bounded:
-        res = as_fugue_df(res.as_array(), schema=df.schema)
-    if res is not df and df.has_metadata:
-        res.reset_metadata(df.metadata)
-    return res
+@as_local_bounded.candidate(lambda df: isinstance(df, DataFrame))
+def _df_to_local_bounded(df: DataFrame) -> LocalBoundedDataFrame:
+    return df.as_local_bounded()
 
 
 def _get_schema_change(
```
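As with `Bag`, `DataFrame.as_local` is now a concrete method routing through the abstract `as_local_bounded`, and the module-level dispatch candidates simply forward to those methods. Assuming the 0.8.4 semantics shown in this diff, the round trip on an already-local bounded frame is a no-op:

```python
# A minimal usage sketch assuming the 0.8.4 semantics from this diff.
import pandas as pd

from fugue.dataframe import PandasDataFrame

pdf = PandasDataFrame(pd.DataFrame({"a": [1, 2]}), schema="a:long")
assert pdf.as_local_bounded() is pdf  # LocalBoundedDataFrame returns itself
assert pdf.as_local() is pdf          # as_local delegates to as_local_bounded
```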
fugue/dataframe/dataframe_iterable_dataframe.py
CHANGED
```diff
@@ -2,16 +2,20 @@ from typing import Any, Dict, Iterable, List, Optional
 
 import pandas as pd
 import pyarrow as pa
-from
-from
+from triad import Schema, assert_or_throw
+from triad.utils.iter import EmptyAwareIterable, make_empty_aware
+
+from fugue.exceptions import FugueDataFrameInitError
+
+from .array_dataframe import ArrayDataFrame
+from .arrow_dataframe import ArrowDataFrame
+from .dataframe import (
     DataFrame,
+    LocalBoundedDataFrame,
     LocalDataFrame,
     LocalUnboundedDataFrame,
-    LocalBoundedDataFrame,
 )
-from
-from triad import Schema, assert_or_throw
-from triad.utils.iter import EmptyAwareIterable, make_empty_aware
+from .pandas_dataframe import PandasDataFrame
 
 
 class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
@@ -142,6 +146,9 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
 
         return LocalDataFrameIterableDataFrame(_transform())
 
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":
+        return ArrowDataFrame(self.as_arrow())
+
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
@@ -190,3 +197,12 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
             yield df._drop_cols(cols)
 
         return LocalDataFrameIterableDataFrame(_transform())
+
+
+class IterablePandasDataFrame(LocalDataFrameIterableDataFrame):
+    def as_local_bounded(self) -> "LocalBoundedDataFrame":
+        return PandasDataFrame(self.as_pandas(), schema=self.schema)
+
+
+class IterableArrowDataFrame(LocalDataFrameIterableDataFrame):
+    pass
```