fugue 0.8.6.dev1__tar.gz → 0.8.6.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/PKG-INFO +1 -1
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe.py +7 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe_iterable_dataframe.py +1 -1
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/function_wrapper.py +38 -7
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/PKG-INFO +1 -1
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/entry_points.txt +1 -1
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/requires.txt +2 -2
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/execution_engine.py +14 -4
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_test/builtin_suite.py +55 -5
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/setup.py +2 -2
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/LICENSE +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/README.md +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/display.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/exception.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/interfaceless.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/io.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/misc.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/_utils/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/api.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/bag/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/bag/array_bag.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/bag/bag.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/collections/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/collections/partition.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/collections/sql.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/collections/yielded.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/column/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/column/expressions.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/column/functions.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/column/sql.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/constants.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/api.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/array_dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/arrow_dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframes.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/iterable_dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/pandas_dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataset/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataset/api.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dataset/dataset.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/dev.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/exceptions.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/execution/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/execution/api.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/execution/execution_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/execution/factory.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/execution/native_execution_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/creators.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/outputters.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/processors.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/context.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/convert.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/creator.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/convert.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/outputter.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/convert.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/processor.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/constants.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/convert.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/transformer.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/plugins.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/py.typed +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/rpc/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/rpc/base.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/rpc/flask.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/sql/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/sql/_utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/sql/_visitors.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/sql/api.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/sql/workflow.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_checkpoint.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_tasks.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_workflow_context.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/api.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/input.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/module.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue/workflow/workflow.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/SOURCES.txt +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/dependency_links.txt +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/top_level.txt +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_contrib/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_contrib/contrib.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_contrib/seaborn/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_contrib/viz/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_contrib/viz/_ext.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/_constants.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/_io.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/_utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/execution_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/ibis_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_dask/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/_io.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/_utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/dask.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/execution_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/ibis_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/_compat.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/_utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/ibis_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/pandas_backend.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ibis/extensions.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_notebook/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_notebook/env.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/README.md +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/description.yaml +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/main.js +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_polars/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_polars/_utils.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_polars/polars_dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_polars/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/_constants.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/cluster.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/io.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/execution_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_ray/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/_constants.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/convert.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/io.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/misc.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/partition.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/dataframe.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/ibis_engine.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_spark/registry.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_sql/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_sql/exceptions.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_test/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_test/bag_suite.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_test/dataframe_suite.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_test/execution_suite.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_test/ibis_suite.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/fugue_version/__init__.py +0 -0
- {fugue-0.8.6.dev1 → fugue-0.8.6.dev2}/setup.cfg +0 -0
|
@@ -113,6 +113,13 @@ class DataFrame(Dataset):
|
|
|
113
113
|
def as_pandas(self) -> pd.DataFrame:
|
|
114
114
|
"""Convert to pandas DataFrame"""
|
|
115
115
|
pdf = pd.DataFrame(self.as_array(), columns=self.columns)
|
|
116
|
+
if len(pdf) == 0: # TODO: move to triad
|
|
117
|
+
return pd.DataFrame(
|
|
118
|
+
{
|
|
119
|
+
k: pd.Series(dtype=v.type.to_pandas_dtype())
|
|
120
|
+
for k, v in self.schema.items()
|
|
121
|
+
}
|
|
122
|
+
)
|
|
116
123
|
return PD_UTILS.enforce_type(pdf, self.schema.pa_schema, null_safe=True)
|
|
117
124
|
|
|
118
125
|
def as_arrow(self, type_safe: bool = False) -> pa.Table:
|
|
@@ -165,7 +165,7 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
|
|
|
165
165
|
|
|
166
166
|
def as_pandas(self) -> pd.DataFrame:
|
|
167
167
|
if self.empty:
|
|
168
|
-
return
|
|
168
|
+
return PandasDataFrame(schema=self.schema).as_pandas()
|
|
169
169
|
|
|
170
170
|
return pd.concat(df.as_pandas() for df in self.native)
|
|
171
171
|
|
|
@@ -23,9 +23,10 @@ from triad.collections.function_wrapper import (
|
|
|
23
23
|
from triad.utils.iter import EmptyAwareIterable, make_empty_aware
|
|
24
24
|
|
|
25
25
|
from ..constants import FUGUE_ENTRYPOINT
|
|
26
|
+
from ..dataset.api import count as df_count
|
|
26
27
|
from .array_dataframe import ArrayDataFrame
|
|
27
28
|
from .arrow_dataframe import ArrowDataFrame
|
|
28
|
-
from .dataframe import DataFrame, LocalDataFrame
|
|
29
|
+
from .dataframe import AnyDataFrame, DataFrame, LocalDataFrame, as_fugue_df
|
|
29
30
|
from .dataframe_iterable_dataframe import (
|
|
30
31
|
IterableArrowDataFrame,
|
|
31
32
|
IterablePandasDataFrame,
|
|
@@ -172,6 +173,19 @@ class DataFrameParam(_DataFrameParamBase):
|
|
|
172
173
|
return sum(1 for _ in df.as_array_iterable())
|
|
173
174
|
|
|
174
175
|
|
|
176
|
+
@fugue_annotated_param(AnyDataFrame)
|
|
177
|
+
class _AnyDataFrameParam(DataFrameParam):
|
|
178
|
+
def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
|
|
179
|
+
return (
|
|
180
|
+
as_fugue_df(output)
|
|
181
|
+
if schema is None
|
|
182
|
+
else as_fugue_df(output, schema=schema)
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
def count(self, df: Any) -> int:
|
|
186
|
+
return df_count(df)
|
|
187
|
+
|
|
188
|
+
|
|
175
189
|
@fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
|
|
176
190
|
class LocalDataFrameParam(DataFrameParam):
|
|
177
191
|
def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
|
|
@@ -333,6 +347,9 @@ class _PandasParam(LocalDataFrameParam):
|
|
|
333
347
|
|
|
334
348
|
@no_type_check
|
|
335
349
|
def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
|
|
350
|
+
_schema: Optional[Schema] = None if schema is None else Schema(schema)
|
|
351
|
+
if _schema is not None and _schema.names != list(output.columns):
|
|
352
|
+
output = output[_schema.names]
|
|
336
353
|
return PandasDataFrame(output, schema)
|
|
337
354
|
|
|
338
355
|
@no_type_check
|
|
@@ -361,8 +378,15 @@ class _IterablePandasParam(LocalDataFrameParam):
|
|
|
361
378
|
self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
|
|
362
379
|
) -> DataFrame:
|
|
363
380
|
def dfs():
|
|
381
|
+
_schema: Optional[Schema] = None if schema is None else Schema(schema)
|
|
382
|
+
has_return = False
|
|
364
383
|
for df in output:
|
|
365
|
-
|
|
384
|
+
if _schema is not None and _schema.names != list(df.columns):
|
|
385
|
+
df = df[_schema.names]
|
|
386
|
+
yield PandasDataFrame(df, _schema)
|
|
387
|
+
has_return = True
|
|
388
|
+
if not has_return and _schema is not None:
|
|
389
|
+
yield PandasDataFrame(schema=_schema)
|
|
366
390
|
|
|
367
391
|
return IterablePandasDataFrame(dfs())
|
|
368
392
|
|
|
@@ -381,7 +405,12 @@ class _PyArrowTableParam(LocalDataFrameParam):
|
|
|
381
405
|
|
|
382
406
|
def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
|
|
383
407
|
assert isinstance(output, pa.Table)
|
|
384
|
-
|
|
408
|
+
adf: DataFrame = ArrowDataFrame(output)
|
|
409
|
+
if schema is not None:
|
|
410
|
+
_schema = Schema(schema)
|
|
411
|
+
if adf.schema != _schema:
|
|
412
|
+
adf = adf[_schema.names].alter_columns(_schema)
|
|
413
|
+
return adf
|
|
385
414
|
|
|
386
415
|
def count(self, df: Any) -> int: # pragma: no cover
|
|
387
416
|
return df.count()
|
|
@@ -409,13 +438,15 @@ class _IterableArrowParam(LocalDataFrameParam):
|
|
|
409
438
|
) -> DataFrame:
|
|
410
439
|
def dfs():
|
|
411
440
|
_schema: Optional[Schema] = None if schema is None else Schema(schema)
|
|
441
|
+
has_return = False
|
|
412
442
|
for df in output:
|
|
413
|
-
adf = ArrowDataFrame(df)
|
|
414
|
-
if _schema is not None and
|
|
415
|
-
adf.schema == schema
|
|
416
|
-
):
|
|
443
|
+
adf: DataFrame = ArrowDataFrame(df)
|
|
444
|
+
if _schema is not None and adf.schema != _schema:
|
|
417
445
|
adf = adf[_schema.names].alter_columns(_schema)
|
|
418
446
|
yield adf
|
|
447
|
+
has_return = True
|
|
448
|
+
if not has_return and _schema is not None:
|
|
449
|
+
yield ArrowDataFrame(schema=_schema)
|
|
419
450
|
|
|
420
451
|
return IterableArrowDataFrame(dfs())
|
|
421
452
|
|
|
@@ -26,7 +26,7 @@ dask[dataframe,distributed]
|
|
|
26
26
|
ibis-framework>=2.1.1
|
|
27
27
|
|
|
28
28
|
[all:python_version >= "3.8"]
|
|
29
|
-
dask[dataframe,distributed]
|
|
29
|
+
dask[dataframe,distributed]<2023.7.1,>=2022.9.0
|
|
30
30
|
ibis-framework<6,>=3.2.0
|
|
31
31
|
|
|
32
32
|
[cpp_sql_parser]
|
|
@@ -39,7 +39,7 @@ qpd[dask]>=0.4.4
|
|
|
39
39
|
dask[dataframe,distributed]
|
|
40
40
|
|
|
41
41
|
[dask:python_version >= "3.8"]
|
|
42
|
-
dask[dataframe,distributed]
|
|
42
|
+
dask[dataframe,distributed]<2023.7.1,>=2022.9.0
|
|
43
43
|
|
|
44
44
|
[duckdb]
|
|
45
45
|
duckdb>=0.5.0
|
|
@@ -134,9 +134,8 @@ class SparkMapEngine(MapEngine):
|
|
|
134
134
|
) -> DataFrame:
|
|
135
135
|
output_schema = Schema(output_schema)
|
|
136
136
|
if self._should_use_pandas_udf(output_schema):
|
|
137
|
-
# pandas udf can only be used for pyspark > 3
|
|
138
137
|
if len(partition_spec.partition_by) > 0:
|
|
139
|
-
if partition_spec.algo
|
|
138
|
+
if partition_spec.algo in ["coarse", "even"]:
|
|
140
139
|
return self._map_by_pandas_udf(
|
|
141
140
|
df,
|
|
142
141
|
map_func=map_func,
|
|
@@ -145,7 +144,18 @@ class SparkMapEngine(MapEngine):
|
|
|
145
144
|
on_init=on_init,
|
|
146
145
|
map_func_format_hint=map_func_format_hint,
|
|
147
146
|
)
|
|
148
|
-
|
|
147
|
+
else:
|
|
148
|
+
if ( # not simple partitioning
|
|
149
|
+
partition_spec.algo != "hash"
|
|
150
|
+
or partition_spec.num_partitions != "0"
|
|
151
|
+
):
|
|
152
|
+
# TODO: not sure if presort should be done
|
|
153
|
+
# on physical partition level
|
|
154
|
+
df = self.to_df(
|
|
155
|
+
self.execution_engine.repartition(
|
|
156
|
+
df, PartitionSpec(partition_spec, presort=[])
|
|
157
|
+
)
|
|
158
|
+
)
|
|
149
159
|
return self._group_map_by_pandas_udf(
|
|
150
160
|
df,
|
|
151
161
|
map_func=map_func,
|
|
@@ -154,7 +164,7 @@ class SparkMapEngine(MapEngine):
|
|
|
154
164
|
on_init=on_init,
|
|
155
165
|
map_func_format_hint=map_func_format_hint,
|
|
156
166
|
)
|
|
157
|
-
|
|
167
|
+
else:
|
|
158
168
|
return self._map_by_pandas_udf(
|
|
159
169
|
df,
|
|
160
170
|
map_func=map_func,
|
|
@@ -22,6 +22,7 @@ from triad import SerializableRLock
|
|
|
22
22
|
|
|
23
23
|
import fugue.api as fa
|
|
24
24
|
from fugue import (
|
|
25
|
+
AnyDataFrame,
|
|
25
26
|
ArrayDataFrame,
|
|
26
27
|
CoTransformer,
|
|
27
28
|
DataFrame,
|
|
@@ -365,6 +366,12 @@ class BuiltInTests(object):
|
|
|
365
366
|
dag.output(dict(df=a), using=mock_outputter2)
|
|
366
367
|
a.partition(num=3).output(MockOutputter3)
|
|
367
368
|
dag.output(dict(aa=a, bb=b), using=MockOutputter4)
|
|
369
|
+
|
|
370
|
+
a = dag.create(mock_creator2, params=dict(p=2))
|
|
371
|
+
b = dag.create(mock_creator2, params=dict(p=2))
|
|
372
|
+
c = dag.process(a, b, using=mock_processor4)
|
|
373
|
+
c.assert_eq(ArrayDataFrame([[2]], "a:int"))
|
|
374
|
+
dag.output(a, b, using=mock_outputter4)
|
|
368
375
|
dag.run(self.engine)
|
|
369
376
|
|
|
370
377
|
def test_zip(self):
|
|
@@ -435,9 +442,14 @@ class BuiltInTests(object):
|
|
|
435
442
|
# this test is important for using mapInPandas in spark
|
|
436
443
|
|
|
437
444
|
# schema: *,c:int
|
|
438
|
-
def mt_pandas(
|
|
445
|
+
def mt_pandas(
|
|
446
|
+
dfs: Iterable[pd.DataFrame], empty: bool = False
|
|
447
|
+
) -> Iterator[pd.DataFrame]:
|
|
439
448
|
for df in dfs:
|
|
440
|
-
|
|
449
|
+
if not empty:
|
|
450
|
+
df = df.assign(c=2)
|
|
451
|
+
df = df[reversed(list(df.columns))]
|
|
452
|
+
yield df
|
|
441
453
|
|
|
442
454
|
with FugueWorkflow() as dag:
|
|
443
455
|
a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
|
|
@@ -445,10 +457,25 @@ class BuiltInTests(object):
|
|
|
445
457
|
dag.df([[1, 2, 2], [3, 4, 2]], "a:int,b:int,c:int").assert_eq(b)
|
|
446
458
|
dag.run(self.engine)
|
|
447
459
|
|
|
460
|
+
# when iterable returns nothing
|
|
461
|
+
with FugueWorkflow() as dag:
|
|
462
|
+
a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
|
|
463
|
+
# without partitioning
|
|
464
|
+
b = a.transform(mt_pandas, params=dict(empty=True))
|
|
465
|
+
dag.df([], "a:int,b:int,c:int").assert_eq(b)
|
|
466
|
+
# with partitioning
|
|
467
|
+
b = a.partition_by("a").transform(mt_pandas, params=dict(empty=True))
|
|
468
|
+
dag.df([], "a:int,b:int,c:int").assert_eq(b)
|
|
469
|
+
dag.run(self.engine)
|
|
470
|
+
|
|
448
471
|
# schema: *
|
|
449
|
-
def mt_arrow(
|
|
472
|
+
def mt_arrow(
|
|
473
|
+
dfs: Iterable[pa.Table], empty: bool = False
|
|
474
|
+
) -> Iterator[pa.Table]:
|
|
450
475
|
for df in dfs:
|
|
451
|
-
|
|
476
|
+
if not empty:
|
|
477
|
+
df = df.select(reversed(df.schema.names))
|
|
478
|
+
yield df
|
|
452
479
|
|
|
453
480
|
# schema: a:long
|
|
454
481
|
def mt_arrow_2(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:
|
|
@@ -463,6 +490,17 @@ class BuiltInTests(object):
|
|
|
463
490
|
dag.df([[1], [3]], "a:long").assert_eq(b)
|
|
464
491
|
dag.run(self.engine)
|
|
465
492
|
|
|
493
|
+
# when iterable returns nothing
|
|
494
|
+
with FugueWorkflow() as dag:
|
|
495
|
+
a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
|
|
496
|
+
# without partitioning
|
|
497
|
+
b = a.transform(mt_arrow, params=dict(empty=True))
|
|
498
|
+
dag.df([], "a:int,b:int").assert_eq(b)
|
|
499
|
+
# with partitioning
|
|
500
|
+
b = a.partition_by("a").transform(mt_arrow, params=dict(empty=True))
|
|
501
|
+
dag.df([], "a:int,b:int").assert_eq(b)
|
|
502
|
+
dag.run(self.engine)
|
|
503
|
+
|
|
466
504
|
def test_transform_binary(self):
|
|
467
505
|
with FugueWorkflow() as dag:
|
|
468
506
|
a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
|
|
@@ -1829,6 +1867,10 @@ def mock_creator(p: int) -> DataFrame:
|
|
|
1829
1867
|
return ArrayDataFrame([[p]], "a:int")
|
|
1830
1868
|
|
|
1831
1869
|
|
|
1870
|
+
def mock_creator2(p: int) -> AnyDataFrame:
|
|
1871
|
+
return fa.as_fugue_df([[p]], schema="a:int")
|
|
1872
|
+
|
|
1873
|
+
|
|
1832
1874
|
def mock_processor(df1: List[List[Any]], df2: List[List[Any]]) -> DataFrame:
|
|
1833
1875
|
return ArrayDataFrame([[len(df1) + len(df2)]], "a:int")
|
|
1834
1876
|
|
|
@@ -1844,6 +1886,10 @@ class MockProcessor3(Processor):
|
|
|
1844
1886
|
return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")
|
|
1845
1887
|
|
|
1846
1888
|
|
|
1889
|
+
def mock_processor4(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
|
|
1890
|
+
return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")
|
|
1891
|
+
|
|
1892
|
+
|
|
1847
1893
|
def mock_outputter(df1: List[List[Any]], df2: List[List[Any]]) -> None:
|
|
1848
1894
|
assert len(df1) == len(df2)
|
|
1849
1895
|
|
|
@@ -1857,6 +1903,10 @@ class MockOutputter3(Outputter):
|
|
|
1857
1903
|
assert "3" == self.partition_spec.num_partitions
|
|
1858
1904
|
|
|
1859
1905
|
|
|
1906
|
+
def mock_outputter4(df1: AnyDataFrame, df2: AnyDataFrame) -> None:
|
|
1907
|
+
assert fa.count(df1) == fa.count(df2)
|
|
1908
|
+
|
|
1909
|
+
|
|
1860
1910
|
class MockOutputter4(Outputter):
|
|
1861
1911
|
def process(self, dfs):
|
|
1862
1912
|
for k, v in dfs.items():
|
|
@@ -1895,8 +1945,8 @@ def mock_tf0(df: pd.DataFrame, p=1, col="p") -> pd.DataFrame:
|
|
|
1895
1945
|
|
|
1896
1946
|
# schema: *,ct:int,p:int
|
|
1897
1947
|
def mock_tf1(df: pd.DataFrame, p=1) -> pd.DataFrame:
|
|
1898
|
-
df["ct"] = df.shape[0]
|
|
1899
1948
|
df["p"] = p
|
|
1949
|
+
df["ct"] = df.shape[0]
|
|
1900
1950
|
return df
|
|
1901
1951
|
|
|
1902
1952
|
|
|
@@ -52,7 +52,7 @@ setup(
|
|
|
52
52
|
"spark": ["pyspark>=3.1.1"],
|
|
53
53
|
"dask": [
|
|
54
54
|
"dask[distributed,dataframe]; python_version < '3.8'",
|
|
55
|
-
"dask[distributed,dataframe]>=2022.9.0; python_version >= '3.8'",
|
|
55
|
+
"dask[distributed,dataframe]>=2022.9.0,<2023.7.1; python_version >= '3.8'",
|
|
56
56
|
"qpd[dask]>=0.4.4",
|
|
57
57
|
],
|
|
58
58
|
"ray": ["ray[data]>=2.1.0", "duckdb>=0.5.0", "pyarrow>=6.0.1"],
|
|
@@ -73,7 +73,7 @@ setup(
|
|
|
73
73
|
"fugue-sql-antlr[cpp]>=0.1.6",
|
|
74
74
|
"pyspark>=3.1.1",
|
|
75
75
|
"dask[distributed,dataframe]; python_version < '3.8'",
|
|
76
|
-
"dask[distributed,dataframe]>=2022.9.0; python_version >= '3.8'",
|
|
76
|
+
"dask[distributed,dataframe]>=2022.9.0,<2023.7.1; python_version >= '3.8'",
|
|
77
77
|
"ray[data]>=2.1.0",
|
|
78
78
|
"qpd[dask]>=0.4.4",
|
|
79
79
|
"notebook",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|