fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue/__init__.py
CHANGED
@@ -26,7 +26,6 @@ from fugue.dataframe.dataframe_iterable_dataframe import (
 from fugue.dataframe.dataframes import DataFrames
 from fugue.dataframe.iterable_dataframe import IterableDataFrame
 from fugue.dataframe.pandas_dataframe import PandasDataFrame
-from fugue.dataframe.utils import to_local_bounded_df, to_local_df
 from fugue.dataset import (
     AnyDataset,
     Dataset,
fugue/_utils/io.py
CHANGED
@@ -5,13 +5,13 @@ from urllib.parse import urlparse

 import fs as pfs
 import pandas as pd
-from fs.errors import FileExpected
-from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
 from triad.collections.dict import ParamDict
 from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw

+from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
+

 class FileParser(object):
     def __init__(self, path: str, format_hint: Optional[str] = None):
@@ -271,111 +271,22 @@ def _load_json(
     return pdf[schema.names], schema


-def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
-    """Save pandas dataframe as avro.
-    If providing your own schema, the usage of schema argument is preferred
-
-    :param schema: Avro Schema determines dtypes saved
-    """
-    import pandavro as pdx
-
-    kw = ParamDict(kwargs)
-
-    # pandavro defaults
-    schema = None
-    append = False
-    times_as_micros = True
-
-    if "schema" in kw:
-        schema = kw["schema"]
-        del kw["schema"]
-
-    if "append" in kw:
-        append = kw["append"]  # default is overwrite (False) instead of append (True)
-        del kw["append"]
-
-    if "times_as_micros" in kw:
-        times_as_micros = kw["times_as_micros"]
-        del kw["times_as_micros"]
-
-    pdf = df.as_pandas()
-    pdx.to_avro(
-        p.uri, pdf, schema=schema, append=append, times_as_micros=times_as_micros, **kw
-    )
-
-
-def _load_avro(
-    p: FileParser, columns: Any = None, **kwargs: Any
-) -> Tuple[pd.DataFrame, Any]:
-    path = p.uri
-    try:
-        pdf = _load_single_avro(path, **kwargs)
-    except (IsADirectoryError, PermissionError, FileExpected):
-        fs = FileSystem()
-        pdf = pd.concat(
-            [
-                _load_single_avro(
-                    pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs
-                )
-                for x in fs.opendir(path).glob("*.avro")
-            ]
-        )
-
-    if columns is None:
-        return pdf, None
-    if isinstance(columns, list):  # column names
-        return pdf[columns], None
-
-    schema = Schema(columns)
-
-    # Return created DataFrame
-    return pdf[schema.names], schema
-
-
-def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
-    from fastavro import reader
-
-    kw = ParamDict(kwargs)
-    process_record = None
-    if "process_record" in kw:
-        process_record = kw["process_record"]
-        del kw["process_record"]
-
-    fs = FileSystem()
-    with fs.openbin(path) as fp:
-        # Configure Avro reader
-        avro_reader = reader(fp)
-        # Load records in memory
-        if process_record:
-            records = [process_record(r) for r in avro_reader]
-
-        else:
-            records = list(avro_reader)
-
-    # Populate pandas.DataFrame with records
-    return pd.DataFrame.from_records(records)
-
-
 _FORMAT_MAP: Dict[str, str] = {
     ".csv": "csv",
     ".csv.gz": "csv",
     ".parquet": "parquet",
     ".json": "json",
     ".json.gz": "json",
-    ".avro": "avro",
-    ".avro.gz": "avro",
 }

 _FORMAT_LOAD: Dict[str, Callable[..., Tuple[pd.DataFrame, Any]]] = {
     "csv": _load_csv,
     "parquet": _load_parquet,
     "json": _load_json,
-    "avro": _load_avro,
 }

 _FORMAT_SAVE: Dict[str, Callable] = {
     "csv": _save_csv,
     "parquet": _save_parquet,
     "json": _save_json,
-    "avro": _save_avro,
 }
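Note: avro load/save support is removed from the built-in io registry in 0.8.4. A hedged migration sketch, assuming pandavro (the library the removed code wrapped) is still installed, is to load into pandas outside of Fugue and hand the result over; the file path below is illustrative:

    import pandavro as pdx
    import fugue.api as fa

    pdf = pdx.from_avro("/tmp/data.avro")  # plain pandas DataFrame
    fa.show(pdf)  # any Fugue API accepting a dataframe-like object works from here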
fugue/api.py
CHANGED
fugue/collections/partition.py
CHANGED
@@ -98,7 +98,7 @@ class PartitionSpec(object):

     Partition consists for these specs:

-    * **algo**: can be one of ``hash`` (default), ``rand``
+    * **algo**: can be one of ``hash`` (default), ``rand``, ``even`` or ``coarse``
     * **num** or **num_partitions**: number of physical partitions, it can be an
       expression or integer numbers, e.g ``(ROWCOUNT+4) / 3``
     * **by** or **partition_by**: keys to partition on
@@ -208,7 +208,9 @@ class PartitionSpec(object):

     @property
     def algo(self) -> str:
-        """Get algo of the spec, one of ``hash`` (default), ``rand``"""
+        """Get algo of the spec, one of ``hash`` (default),
+        ``rand`` ``even`` or ``coarse``
+        """
         return self._algo if self._algo != "" else "hash"

     @property
@@ -258,11 +260,14 @@ class PartitionSpec(object):
         """Get deterministic unique id of this object"""
         return to_uuid(self.jsondict)

-    def get_sorts(self, schema: Schema) -> IndexedOrderedDict[str, bool]:
+    def get_sorts(
+        self, schema: Schema, with_partition_keys: bool = True
+    ) -> IndexedOrderedDict[str, bool]:
         """Get keys for sorting in a partition, it's the combination of partition
         keys plus the presort keys

         :param schema: the dataframe schema this partition spec to operate on
+        :param with_partition_keys: whether to include partition keys
         :return: an ordered dictionary of key, order pairs

         .. admonition:: Examples
@@ -272,9 +277,10 @@ class PartitionSpec(object):
         >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
         """
         d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
-        for p in self.partition_by:
-            aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
-            d[p] = True
+        if with_partition_keys:
+            for p in self.partition_by:
+                aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
+                d[p] = True
         for p, v in self.presort.items():
             aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
             d[p] = v
fugue/constants.py
CHANGED
fugue/dataframe/__init__.py
CHANGED
@@ -18,10 +18,4 @@ from .dataframes import DataFrames
 from .function_wrapper import DataFrameFunctionWrapper, fugue_annotated_param
 from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
-from .utils import (
-    get_column_names,
-    normalize_dataframe_column_names,
-    rename,
-    to_local_bounded_df,
-    to_local_df,
-)
+from .utils import get_column_names, normalize_dataframe_column_names, rename
fugue/dataframe/arrow_dataframe.py
CHANGED
@@ -141,7 +141,7 @@ class ArrowDataFrame(LocalBoundedDataFrame):
         return self.native.shape[0]

     def as_pandas(self) -> pd.DataFrame:
-        return self.native.to_pandas()
+        return self.native.to_pandas(use_threads=False, date_as_object=False)

     def head(
         self, n: int, columns: Optional[List[str]] = None
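Note on the as_pandas change: use_threads=False makes the pyarrow conversion single-threaded (deterministic ordering), and date_as_object=False keeps date columns as datetime64 instead of Python date objects. A quick sketch (the schema string is illustrative):

    import pyarrow as pa
    from fugue.dataframe.arrow_dataframe import ArrowDataFrame

    adf = ArrowDataFrame(pa.table({"a": [1, 2]}), "a:long")
    pdf = adf.as_pandas()  # now to_pandas(use_threads=False, date_as_object=False)
    assert list(pdf["a"]) == [1, 2]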
fugue/dataframe/function_wrapper.py
CHANGED
@@ -34,7 +34,6 @@ from .dataframe_iterable_dataframe import (
 from .dataframes import DataFrames
 from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
-from .utils import to_local_df


 @function_wrapper(FUGUE_ENTRYPOINT)
@@ -176,7 +175,7 @@ class DataFrameParam(_DataFrameParamBase):
 @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
 class LocalDataFrameParam(DataFrameParam):
     def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
-        return to_local_df(df)
+        return df.as_local()

     def to_output_df(self, output: LocalDataFrame, schema: Any, ctx: Any) -> DataFrame:
         assert_or_throw(
@@ -256,7 +255,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
 class _ListDictParam(_LocalNoSchemaDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
-        return list(to_local_df(df).as_dict_iterable())
+        return list(df.as_local().as_dict_iterable())

     @no_type_check
     def to_output_df(
fugue/dataframe/utils.py
CHANGED
@@ -13,11 +13,9 @@ from triad.exceptions import InvalidOperationError
 from triad.utils.assertion import assert_arg_not_none
 from triad.utils.assertion import assert_or_throw as aot

-from .api import get_column_names, normalize_column_names, rename
+from .api import get_column_names, normalize_column_names, rename, as_fugue_df
 from .array_dataframe import ArrayDataFrame
-from .arrow_dataframe import ArrowDataFrame
-from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame
-from .iterable_dataframe import IterableDataFrame
+from .dataframe import DataFrame, LocalBoundedDataFrame
 from .pandas_dataframe import PandasDataFrame

 # For backward compatibility, TODO: remove!
@@ -74,8 +72,11 @@ def _df_eq(
     :param throw: if to throw error if not equal, defaults to False
     :return: if they equal
     """
-    df1 = to_local_bounded_df(df)
-    df2 = to_local_bounded_df(data, schema)
+    df1 = as_fugue_df(df).as_local_bounded()
+    if schema is not None:
+        df2 = as_fugue_df(data, schema=schema).as_local_bounded()
+    else:
+        df2 = as_fugue_df(data).as_local_bounded()
     try:
         assert (
             df1.count() == df2.count()
@@ -99,7 +100,7 @@ def _df_eq(
             d1 = d1.reset_index(drop=True)
             d2 = d2.reset_index(drop=True)
             pd.testing.assert_frame_equal(
-                d1, d2, check_less_precise=digits, check_dtype=False
+                d1, d2, rtol=0, atol=10 ** (-digits), check_dtype=False, check_exact=False
             )
             return True
         except AssertionError:
@@ -108,84 +109,9 @@ def _df_eq(
         return False


-def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame:
-    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`
-
-    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
-        list or iterable of arrays
-    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
-        :class:`~fugue.dataframe.dataframe.DataFrame` type
-    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
-        but you set ``schema``
-    :raises TypeError: if ``df`` is not compatible
-    :return: the dataframe itself if it's
-        :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one
-
-    .. admonition:: Examples
-
-        >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
-        >>> assert to_local_df(a) is a
-        >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
-    """
-    assert_arg_not_none(df, "df")
-    if isinstance(df, DataFrame):
-        aot(
-            schema is None,
-            ValueError("schema and metadata must be None when df is a DataFrame"),
-        )
-        return df.as_local()
-    if isinstance(df, pd.DataFrame):
-        return PandasDataFrame(df, schema)
-    if isinstance(df, pa.Table):
-        return ArrowDataFrame(df, schema)
-    if isinstance(df, List):
-        return ArrayDataFrame(df, schema)
-    if isinstance(df, Iterable):
-        return IterableDataFrame(df, schema)
-    raise ValueError(f"{df} cannot convert to a LocalDataFrame")
-
-
-def to_local_bounded_df(df: Any, schema: Any = None) -> LocalBoundedDataFrame:
-    """Convert a data structure to
-    :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`
-
-    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
-        list or iterable of arrays
-    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
-        :class:`~fugue.dataframe.dataframe.DataFrame` type
-    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
-        but you set ``schema``
-    :raises TypeError: if ``df`` is not compatible
-    :return: the dataframe itself if it's
-        :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` else a converted one
-
-    .. admonition:: Examples
-
-        >>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
-        >>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
-        >>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
-
-    .. note::
-
-        Compared to :func:`.to_local_df`, this function makes sure the dataframe is also
-        bounded, so :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame` will
-        be converted although it's local.
-    """
-    if isinstance(df, DataFrame):
-        aot(
-            schema is None,
-            ValueError("schema and metadata must be None when df is a DataFrame"),
-        )
-        return df.as_local_bounded()
-    df = to_local_df(df, schema)
-    if isinstance(df, LocalBoundedDataFrame):
-        return df
-    raise ValueError(f"{df} cannot convert to a LocalBoundedDataFrame")
-
-
 def pickle_df(df: DataFrame) -> bytes:
     """Pickles a dataframe to bytes array. It firstly converts the dataframe
-    using :func:`.to_local_bounded_df`, and then serialize the underlying data.
+    local bounded, and then serialize the underlying data.

     :param df: input DataFrame
     :return: pickled binary data
@@ -195,7 +121,7 @@ def pickle_df(df: DataFrame) -> bytes:
     Be careful to use on large dataframes or non-local, un-materialized dataframes,
     it can be slow. You should always use :func:`.unpickle_df` to deserialize.
     """
-    df = to_local_bounded_df(df)
+    df = df.as_local_bounded()
     o: List[Any] = [df.schema]
     if isinstance(df, PandasDataFrame):
         o.append("p")
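Note: to_local_df and to_local_bounded_df are gone from the public surface. A rough replacement sketch using the calls the new code relies on:

    import pandas as pd
    import fugue.api as fa

    fdf = fa.as_fugue_df(pd.DataFrame({"a": [0, 1]}))  # roughly replaces to_local_df
    local = fdf.as_local_bounded()  # roughly replaces to_local_bounded_df
    assert local.count() == 2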
fugue/execution/api.py
CHANGED
@@ -15,6 +15,7 @@ from .execution_engine import (
     ExecutionEngine,
 )
 from .factory import make_execution_engine, try_get_context_execution_engine
+from .._utils.registry import fugue_plugin


 @contextmanager
@@ -120,6 +121,27 @@ def get_current_parallelism() -> int:
     return make_execution_engine().get_current_parallelism()


+@fugue_plugin
+def as_fugue_engine_df(
+    engine: ExecutionEngine, df: AnyDataFrame, schema: Any = None
+) -> DataFrame:
+    """Convert a dataframe to a Fugue engine dependent DataFrame.
+    This function is used internally by Fugue. It is not recommended
+    to use
+
+    :param engine: the ExecutionEngine to use, must not be None
+    :param df: a dataframe like object
+    :param schema: the schema of the dataframe, defaults to None
+
+    :return: the engine dependent DataFrame
+    """
+    if schema is None:
+        fdf = as_fugue_df(df)
+    else:
+        fdf = as_fugue_df(df, schema=schema)
+    return engine.to_df(fdf)
+
+
 def run_engine_function(
     func: Callable[[ExecutionEngine], Any],
     engine: AnyExecutionEngine = None,
@@ -549,11 +571,11 @@ def join(
     """

     def _join(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.join(edf1, edf2, how=how, on=on)
         for odf in dfs:
-            res = e.join(res, e.to_df(odf), how=how, on=on)
+            res = e.join(res, as_fugue_engine_df(e, odf), how=how, on=on)
         return res

     return run_engine_function(
@@ -837,11 +859,11 @@ def union(
     """

     def _union(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.union(edf1, edf2, distinct=distinct)
         for odf in dfs:
-            res = e.union(res, e.to_df(odf), distinct=distinct)
+            res = e.union(res, as_fugue_engine_df(e, odf), distinct=distinct)
         return res

     return run_engine_function(
@@ -885,11 +907,11 @@ def subtract(
     """

     def _subtract(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.subtract(edf1, edf2, distinct=distinct)
         for odf in dfs:
-            res = e.subtract(res, e.to_df(odf), distinct=distinct)
+            res = e.subtract(res, as_fugue_engine_df(e, odf), distinct=distinct)
         return res

     return run_engine_function(
@@ -933,11 +955,11 @@ def intersect(
     """

     def _intersect(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.intersect(edf1, edf2, distinct=distinct)
         for odf in dfs:
-            res = e.intersect(res, e.to_df(odf), distinct=distinct)
+            res = e.intersect(res, as_fugue_engine_df(e, odf), distinct=distinct)
         return res

     return run_engine_function(
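Note: a short usage sketch of the new as_fugue_engine_df plugin defined above (the engine choice and data are made up):

    import pandas as pd
    from fugue import NativeExecutionEngine
    from fugue.execution.api import as_fugue_engine_df

    engine = NativeExecutionEngine()
    fdf = as_fugue_engine_df(engine, pd.DataFrame({"a": [1, 2]}))
    assert fdf.count() == 2  # an engine-dependent (here local) Fugue DataFrame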
fugue/execution/native_execution_engine.py
CHANGED
@@ -25,9 +25,9 @@ from fugue.dataframe import (
     LocalDataFrame,
     PandasDataFrame,
     fugue_annotated_param,
-    to_local_bounded_df,
 )
-from fugue.dataframe.utils import get_join_schemas, to_local_df
+from fugue.dataframe.dataframe import as_fugue_df
+from fugue.dataframe.utils import get_join_schemas

 from .execution_engine import (
     ExecutionEngine,
@@ -83,19 +83,36 @@ class PandasMapEngine(MapEngine):
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
         map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
-        if partition_spec.num_partitions != "0":
-            self.log.warning(
-                "%s doesn't respect num_partitions %s",
-                self,
-                partition_spec.num_partitions,
-            )
+        # if partition_spec.num_partitions != "0":
+        #     self.log.warning(
+        #         "%s doesn't respect num_partitions %s",
+        #         self,
+        #         partition_spec.num_partitions,
+        #     )
+        is_coarse = partition_spec.algo == "coarse"
+        presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
+        presort_keys = list(presort.keys())
+        presort_asc = list(presort.values())
+        output_schema = Schema(output_schema)
         cursor = partition_spec.get_cursor(df.schema, 0)
         if on_init is not None:
             on_init(0, df)
-        if len(partition_spec.partition_by) == 0:  # no partition
-            df = to_local_df(df)
-            cursor.set(lambda: df.peek_array(), 0, 0)
-            output_df = map_func(cursor, df)
+        if (
+            len(partition_spec.partition_by) == 0 or partition_spec.algo == "coarse"
+        ):  # no partition
+            if len(partition_spec.presort) > 0:
+                pdf = (
+                    df.as_pandas()
+                    .sort_values(presort_keys, ascending=presort_asc)
+                    .reset_index(drop=True)
+                )
+                input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
+                cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
+                output_df = map_func(cursor, input_df)
+            else:
+                df = df.as_local()
+                cursor.set(lambda: df.peek_array(), 0, 0)
+                output_df = map_func(cursor, df)
             if (
                 isinstance(output_df, PandasDataFrame)
                 and output_df.schema != output_schema
@@ -107,13 +124,9 @@ class PandasMapEngine(MapEngine):
                 f"mismatches given {output_schema}",
             )
             return self.to_df(output_df)  # type: ignore
-        presort = partition_spec.presort
-        presort_keys = list(presort.keys())
-        presort_asc = list(presort.values())
-        output_schema = Schema(output_schema)

         def _map(pdf: pd.DataFrame) -> pd.DataFrame:
-            if len(presort) > 0:
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc).reset_index(
                     drop=True
                 )
@@ -177,7 +190,7 @@ class NativeExecutionEngine(ExecutionEngine):
     def repartition(
         self, df: DataFrame, partition_spec: PartitionSpec
     ) -> DataFrame:  # pragma: no cover
-        self.log.warning("%s doesn't respect repartition", self)
+        # self.log.warning("%s doesn't respect repartition", self)
         return df

     def broadcast(self, df: DataFrame) -> DataFrame:
@@ -384,4 +397,5 @@ class _NativeExecutionEngineParam(ExecutionEngineParam):


 def _to_native_execution_engine_df(df: AnyDataFrame, schema: Any = None) -> DataFrame:
-    return to_local_bounded_df(df, schema)
+    fdf = as_fugue_df(df) if schema is None else as_fugue_df(df, schema=schema)
+    return fdf.as_local_bounded()
fugue/extensions/_builtins/creators.py
CHANGED
@@ -1,10 +1,12 @@
 from typing import Any, Callable, Optional

+from triad import Schema, assert_or_throw, to_uuid
+
 from fugue.collections.yielded import Yielded
 from fugue.dataframe import DataFrame
 from fugue.exceptions import FugueWorkflowCompileError
+from fugue.execution.api import as_fugue_engine_df
 from fugue.extensions.creator import Creator
-from triad import Schema, assert_or_throw, to_uuid


 class Load(Creator):
@@ -39,7 +41,7 @@ class CreateData(Creator):
     def create(self) -> DataFrame:
         if isinstance(self._df, Yielded):
             return self.execution_engine.load_yielded(self._df)
-        return self.execution_engine.to_df(self._df, schema=self._schema)
+        return as_fugue_engine_df(self.execution_engine, self._df, schema=self._schema)

     def _df_uid(self):
         if self._data_determiner is not None:
fugue/extensions/_builtins/outputters.py
CHANGED
@@ -6,7 +6,7 @@ from triad.utils.convert import to_type
 from fugue.collections.partition import PartitionCursor
 from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame
 from fugue.dataframe.array_dataframe import ArrayDataFrame
-from fugue.dataframe.utils import _df_eq, to_local_bounded_df
+from fugue.dataframe.utils import _df_eq
 from fugue.exceptions import FugueWorkflowError
 from fugue.execution.execution_engine import _generate_comap_empty_dfs
 from fugue.rpc import EmptyRPCHandler, to_rpc_handler
@@ -136,7 +136,7 @@ class _TransformerRunner(object):
     def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
         self.transformer._cursor = cursor  # type: ignore
         try:
-            to_local_bounded_df(self.transformer.transform(df))
+            self.transformer.transform(df).as_local_bounded()
             return ArrayDataFrame([], self.transformer.output_schema)
         except self.ignore_errors:  # type: ignore
             return ArrayDataFrame([], self.transformer.output_schema)
@@ -160,7 +160,7 @@ class _CoTransformerRunner(object):
     def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
         self.transformer._cursor = cursor  # type: ignore
         try:
-            to_local_bounded_df(self.transformer.transform(dfs))
+            self.transformer.transform(dfs).as_local_bounded()
             return ArrayDataFrame([], self.transformer.output_schema)
         except self.ignore_errors:  # type: ignore
             return ArrayDataFrame([], self.transformer.output_schema)
fugue/extensions/_builtins/processors.py
CHANGED
@@ -6,7 +6,6 @@ from fugue.dataframe import (
     DataFrame,
     DataFrames,
     LocalDataFrame,
-    to_local_bounded_df,
 )
 from fugue.column import ColumnExpr, SelectColumns as ColumnsSelect
 from fugue.exceptions import FugueWorkflowError
@@ -334,7 +333,7 @@ class _TransformerRunner(object):
             return self.transformer.transform(df)
         else:
             try:
-                return to_local_bounded_df(self.transformer.transform(df))
+                return self.transformer.transform(df).as_local_bounded()
             except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
                 return ArrayDataFrame([], self.transformer.output_schema)

@@ -364,7 +363,7 @@ class _CoTransformerRunner(object):

         else:
             try:
-                return to_local_bounded_df(self.transformer.transform(dfs))
+                return self.transformer.transform(dfs).as_local_bounded()
             except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
                 return ArrayDataFrame([], self.transformer.output_schema)
fugue/plugins.py
CHANGED
fugue/workflow/_checkpoint.py
CHANGED
@@ -166,7 +166,7 @@ class CheckpointPath(object):

     def get_table_name(self, obj_id: str, permanent: bool) -> str:
         path = self._path if permanent else self._temp_path
-        return to_uuid(path, obj_id)[:5]
+        return "temp_" + to_uuid(path, obj_id)[:5]

     def temp_file_exists(self, path: str) -> bool:
         try:
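Note: a guess at the motivation for the "temp_" prefix: to_uuid returns a hex string that can begin with a digit, which is not a valid first character for a table identifier in most SQL backends. A tiny check (inputs are illustrative):

    from triad import to_uuid

    name = "temp_" + to_uuid("/tmp/checkpoints", "obj-1")[:5]
    assert name.startswith("temp_") and not name[0].isdigit()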