fugue: fugue-0.8.2.dev4-py3-none-any.whl → fugue-0.8.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +2 -91
  3. fugue/api.py +1 -0
  4. fugue/collections/partition.py +12 -6
  5. fugue/constants.py +1 -1
  6. fugue/dataframe/__init__.py +1 -7
  7. fugue/dataframe/arrow_dataframe.py +1 -1
  8. fugue/dataframe/function_wrapper.py +2 -3
  9. fugue/dataframe/utils.py +10 -84
  10. fugue/execution/api.py +34 -12
  11. fugue/execution/native_execution_engine.py +33 -19
  12. fugue/extensions/_builtins/creators.py +4 -2
  13. fugue/extensions/_builtins/outputters.py +3 -3
  14. fugue/extensions/_builtins/processors.py +2 -3
  15. fugue/plugins.py +1 -0
  16. fugue/workflow/_checkpoint.py +1 -1
  17. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
  18. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
  19. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
  20. fugue_contrib/viz/_ext.py +7 -1
  21. fugue_dask/_io.py +0 -13
  22. fugue_dask/_utils.py +10 -4
  23. fugue_dask/execution_engine.py +42 -16
  24. fugue_duckdb/_utils.py +7 -2
  25. fugue_duckdb/dask.py +1 -1
  26. fugue_duckdb/dataframe.py +17 -10
  27. fugue_duckdb/execution_engine.py +12 -22
  28. fugue_ibis/dataframe.py +2 -7
  29. fugue_notebook/env.py +5 -10
  30. fugue_polars/_utils.py +0 -40
  31. fugue_polars/polars_dataframe.py +22 -7
  32. fugue_ray/_constants.py +8 -1
  33. fugue_ray/_utils/dataframe.py +31 -4
  34. fugue_ray/_utils/io.py +2 -4
  35. fugue_ray/dataframe.py +13 -4
  36. fugue_ray/execution_engine.py +39 -21
  37. fugue_spark/_utils/convert.py +22 -11
  38. fugue_spark/_utils/io.py +0 -13
  39. fugue_spark/_utils/misc.py +27 -0
  40. fugue_spark/_utils/partition.py +11 -18
  41. fugue_spark/dataframe.py +24 -19
  42. fugue_spark/execution_engine.py +61 -35
  43. fugue_spark/registry.py +15 -3
  44. fugue_test/builtin_suite.py +7 -9
  45. fugue_test/dataframe_suite.py +7 -3
  46. fugue_test/execution_suite.py +100 -122
  47. fugue_version/__init__.py +1 -1
  48. tests/fugue/collections/test_partition.py +6 -3
  49. tests/fugue/dataframe/test_utils.py +2 -43
  50. tests/fugue/execution/test_naive_execution_engine.py +33 -0
  51. tests/fugue/utils/test_io.py +0 -80
  52. tests/fugue_dask/test_execution_engine.py +45 -0
  53. tests/fugue_dask/test_io.py +0 -55
  54. tests/fugue_duckdb/test_dataframe.py +2 -2
  55. tests/fugue_duckdb/test_utils.py +1 -1
  56. tests/fugue_polars/test_api.py +13 -0
  57. tests/fugue_polars/test_transform.py +11 -5
  58. tests/fugue_ray/test_execution_engine.py +32 -1
  59. tests/fugue_spark/test_dataframe.py +0 -8
  60. tests/fugue_spark/test_execution_engine.py +48 -10
  61. tests/fugue_spark/test_importless.py +4 -4
  62. tests/fugue_spark/test_spark_connect.py +82 -0
  63. tests/fugue_spark/utils/test_convert.py +6 -8
  64. tests/fugue_spark/utils/test_io.py +0 -17
  65. fugue_test/_utils.py +0 -13
  66. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
  67. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
  68. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue_dask/execution_engine.py CHANGED
@@ -3,6 +3,7 @@ import os
  from typing import Any, Callable, Dict, List, Optional, Type, Union

  import dask.dataframe as dd
+ import pandas as pd
  from distributed import Client
  from qpd_dask import run_sql_on_dask
  from triad.collections import Schema
@@ -18,7 +19,7 @@ from fugue.collections.partition import (
  PartitionSpec,
  parse_presort_exp,
  )
- from fugue.constants import KEYWORD_CORECOUNT, KEYWORD_ROWCOUNT
+ from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
  from fugue.dataframe import (
  AnyDataFrame,
  DataFrame,
@@ -34,6 +35,8 @@ from fugue_dask._io import load_df, save_df
  from fugue_dask._utils import DASK_UTILS, DaskUtils
  from fugue_dask.dataframe import DaskDataFrame

+ _DASK_PARTITION_KEY = "__dask_partition_key__"
+

  class QPDDaskEngine(SQLEngine):
  """QPD execution implementation."""
@@ -74,7 +77,8 @@ class DaskMapEngine(MapEngine):
  on_init: Optional[Callable[[int, DataFrame], Any]] = None,
  map_func_format_hint: Optional[str] = None,
  ) -> DataFrame:
- presort = partition_spec.presort
+ is_coarse = partition_spec.algo == "coarse"
+ presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
  presort_keys = list(presort.keys())
  presort_asc = list(presort.values())
  output_schema = Schema(output_schema)
@@ -88,10 +92,12 @@ class DaskMapEngine(MapEngine):
  )
  )

- def _map(pdf: Any) -> dd.DataFrame:
+ def _map(pdf: Any) -> pd.DataFrame:
  if pdf.shape[0] == 0:
  return PandasDataFrame([], output_schema).as_pandas()
- if len(presort_keys) > 0:
+ if is_coarse:
+ pdf = pdf.drop(columns=[_DASK_PARTITION_KEY])
+ if len(partition_spec.presort) > 0:
  pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
  input_df = PandasDataFrame(
  pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
@@ -100,7 +106,7 @@ class DaskMapEngine(MapEngine):
  on_init_once(0, input_df)
  cursor.set(lambda: input_df.peek_array(), 0, 0)
  output_df = map_func(cursor, input_df)
- return output_df.as_pandas()
+ return output_df.as_pandas()[output_schema.names]

  df = self.to_df(df)
  meta = self.execution_engine.pl_utils.safe_to_pandas_dtype( # type: ignore
@@ -113,8 +119,28 @@ class DaskMapEngine(MapEngine):
  df = self.execution_engine.repartition(
  df, PartitionSpec(num=partition_spec.num_partitions)
  )
+ if is_coarse:
+ input_num_partitions = df.num_partitions
+ _utils = self.execution_engine.pl_utils # type: ignore
+ input_meta = _utils.safe_to_pandas_dtype(
+ (input_schema + (_DASK_PARTITION_KEY, "uint64")).pa_schema
+ )
+ tddf = df.native.map_partitions(
+ lambda pdf: pdf.assign(
+ **{
+ _DASK_PARTITION_KEY: pd.util.hash_pandas_object(
+ pdf[partition_spec.partition_by], index=False
+ ).mod(input_num_partitions)
+ }
+ ),
+ meta=input_meta,
+ )
+ keys = [_DASK_PARTITION_KEY]
+ else:
+ tddf = df.native
+ keys = partition_spec.partition_by
  result = self.execution_engine.pl_utils.safe_groupby_apply( # type: ignore
- df.native, partition_spec.partition_by, _map, meta=meta # type: ignore
+ tddf, keys, _map, meta=meta # type: ignore
  )
  return DaskDataFrame(result, output_schema)

@@ -214,7 +240,7 @@ class DaskExecutionEngine(ExecutionEngine):
  p = partition_spec.get_num_partitions(
  **{
  KEYWORD_ROWCOUNT: lambda: df.persist().count(), # type: ignore
- KEYWORD_CORECOUNT: lambda: self.get_current_parallelism(),
+ KEYWORD_PARALLELISM: lambda: self.get_current_parallelism(),
  }
  )
  if p > 0:
@@ -253,7 +279,7 @@ class DaskExecutionEngine(ExecutionEngine):
  join_type=how,
  on=key_schema.names,
  )
- return DaskDataFrame(d, output_schema)
+ return DaskDataFrame(d, output_schema, type_safe=False)

  def union(
  self,
@@ -268,7 +294,7 @@ class DaskExecutionEngine(ExecutionEngine):
  d = self.pl_utils.union(
  self.to_df(df1).native, self.to_df(df2).native, unique=distinct
  )
- return DaskDataFrame(d, df1.schema)
+ return DaskDataFrame(d, df1.schema, type_safe=False)

  def subtract(
  self,
@@ -286,7 +312,7 @@ class DaskExecutionEngine(ExecutionEngine):
  d = self.pl_utils.except_df(
  self.to_df(df1).native, self.to_df(df2).native, unique=distinct
  )
- return DaskDataFrame(d, df1.schema)
+ return DaskDataFrame(d, df1.schema, type_safe=False)

  def intersect(
  self,
@@ -304,11 +330,11 @@ class DaskExecutionEngine(ExecutionEngine):
  d = self.pl_utils.intersect(
  self.to_df(df1).native, self.to_df(df2).native, unique=distinct
  )
- return DaskDataFrame(d, df1.schema)
+ return DaskDataFrame(d, df1.schema, type_safe=False)

  def distinct(self, df: DataFrame) -> DataFrame:
  d = self.pl_utils.drop_duplicates(self.to_df(df).native)
- return DaskDataFrame(d, df.schema)
+ return DaskDataFrame(d, df.schema, type_safe=False)

  def dropna(
  self,
@@ -325,7 +351,7 @@ class DaskExecutionEngine(ExecutionEngine):
  if how == "any" and thresh is not None:
  del kw["how"] # to deal with a dask logic flaw
  d = self.to_df(df).native.dropna(**kw)
- return DaskDataFrame(d, df.schema)
+ return DaskDataFrame(d, df.schema, type_safe=False)

  def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFrame:
  assert_or_throw(
@@ -345,7 +371,7 @@ class DaskExecutionEngine(ExecutionEngine):
  subset = subset or df.columns
  mapping = {col: value for col in subset}
  d = self.to_df(df).native.fillna(mapping)
- return DaskDataFrame(d, df.schema)
+ return DaskDataFrame(d, df.schema, type_safe=False)

  def sample(
  self,
@@ -363,7 +389,7 @@ class DaskExecutionEngine(ExecutionEngine):
  d = self.to_df(df).native.sample(
  n=n, frac=frac, replace=replace, random_state=seed
  )
- return DaskDataFrame(d, df.schema)
+ return DaskDataFrame(d, df.schema, type_safe=False)

  def take(
  self,
@@ -419,7 +445,7 @@ class DaskExecutionEngine(ExecutionEngine):
  .reset_index(drop=True)
  )

- return DaskDataFrame(d, df.schema)
+ return DaskDataFrame(d, df.schema, type_safe=False)

  def load_df(
  self,
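The notable change in DaskMapEngine is the new "coarse" partitioning path: instead of shuffling by the exact partition key values, each row is bucketed by a hash of those keys modulo the current partition count, and the map function runs once per bucket. A minimal pandas-only sketch of that bucketing idea (illustrative names, not fugue's API):

    import pandas as pd

    # Bucket rows by a hash of the key columns, mimicking the coarse algo above.
    pdf = pd.DataFrame({"k": ["a", "b", "a", "c"], "v": [1, 2, 3, 4]})
    num_buckets = 2
    bucket = pd.util.hash_pandas_object(pdf[["k"]], index=False).mod(num_buckets)
    for _, chunk in pdf.assign(__bucket__=bucket).groupby("__bucket__"):
        print(chunk.drop(columns="__bucket__"))  # one map call per bucket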
fugue_duckdb/_utils.py CHANGED
@@ -27,7 +27,11 @@ _DUCK_TYPES_TO_PA: Dict[str, pa.DataType] = {
  "TIME": pa.time32("ms"),
  }

- _PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {v: k for k, v in _DUCK_TYPES_TO_PA.items()}
+ _PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {
+ v: k
+ for k, v in list(_DUCK_TYPES_TO_PA.items())
+ + [("VARCHAR", pa.large_string()), ("BLOB", pa.large_binary())]
+ }


  def encode_column_name(name: str) -> str:
@@ -94,8 +98,9 @@ def to_duck_type(tp: pa.DataType) -> str:
  raise ValueError(f"can't convert {tp} to DuckDB data type")


- def to_pa_type(duck_type: str) -> pa.DataType:
+ def to_pa_type(duck_type_raw: Any) -> pa.DataType:
  try:
+ duck_type = str(duck_type_raw) # for duckdb >= 0.8.0
  if duck_type.endswith("[]"):
  return pa.list_(to_pa_type(duck_type[:-2]))
  p = duck_type.find("(")
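The reverse type map now also covers Arrow's large string/binary variants, resolving them to the same DuckDB type names as the plain variants. A toy sketch of the inversion pattern (not fugue's full table):

    import pyarrow as pa

    # Append (duck_type, large Arrow type) pairs before inverting, so both the
    # plain and the "large" Arrow variants map to the same DuckDB type name.
    _TO_PA = {"VARCHAR": pa.string(), "BLOB": pa.binary()}
    _TO_DUCK = {
        v: k
        for k, v in list(_TO_PA.items())
        + [("VARCHAR", pa.large_string()), ("BLOB", pa.large_binary())]
    }
    assert _TO_DUCK[pa.string()] == _TO_DUCK[pa.large_string()] == "VARCHAR"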
fugue_duckdb/dask.py CHANGED
@@ -50,7 +50,7 @@ class DuckDaskExecutionEngine(DuckExecutionEngine):
  res = DuckDataFrame(self.connection.from_df(ddf.as_pandas()))
  else:
  res = DuckDataFrame(
- duckdb.arrow(ddf.as_arrow(), connection=self.connection)
+ duckdb.from_arrow(ddf.as_arrow(), connection=self.connection)
  )
  if ddf.has_metadata: # pragma: no cover
  res.reset_metadata(ddf.metadata)
fugue_duckdb/dataframe.py CHANGED
@@ -4,14 +4,17 @@ import pandas as pd
  import pyarrow as pa
  from duckdb import DuckDBPyRelation
  from triad import Schema
+ from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table

  from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
  from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
  from fugue.plugins import (
+ as_arrow,
  as_fugue_dataset,
  as_local_bounded,
  get_column_names,
  get_num_partitions,
+ get_schema,
  is_df,
  )

@@ -26,15 +29,7 @@ class DuckDataFrame(LocalBoundedDataFrame):

  def __init__(self, rel: DuckDBPyRelation):
  self._rel = rel
- super().__init__(schema=self._get_schema)
-
- def _get_schema(self) -> Schema:
- return Schema(
- [
- pa.field(x, to_pa_type(y))
- for x, y in zip(self._rel.columns, self._rel.types)
- ]
- )
+ super().__init__(schema=lambda: _duck_get_schema(self._rel))

  @property
  def alias(self) -> str:
@@ -98,7 +93,7 @@ class DuckDataFrame(LocalBoundedDataFrame):
  return DuckDataFrame(self._rel.project(", ".join(fields)))

  def as_arrow(self, type_safe: bool = False) -> pa.Table:
- return self._rel.arrow()
+ return _duck_as_arrow(self._rel)

  def as_pandas(self) -> pd.DataFrame:
  if any(pa.types.is_nested(f.type) for f in self.schema.fields):
@@ -169,6 +164,18 @@ def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation:
  return df


+ @as_arrow.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+ def _duck_as_arrow(df: DuckDBPyRelation) -> pa.Table:
+ _df = df.arrow()
+ _df = replace_types_in_table(_df, LARGE_TYPES_REPLACEMENT, recursive=True)
+ return _df
+
+
+ @get_schema.candidate(lambda df: isinstance(df, DuckDBPyRelation))
+ def _duck_get_schema(df: DuckDBPyRelation) -> Schema:
+ return Schema([pa.field(x, to_pa_type(y)) for x, y in zip(df.columns, df.types)])
+
+
  @get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation))
  def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
  return list(df.columns)
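The schema and Arrow conversions are now exposed as `as_arrow`/`get_schema` plugin candidates that dispatch on a raw DuckDBPyRelation. A hedged usage sketch, assuming fugue_duckdb is installed so these candidates are registered (exact schema output may vary by duckdb version):

    import duckdb
    import fugue.api as fa

    con = duckdb.connect()
    rel = con.query("SELECT 1 AS a, 'x' AS b")
    print(fa.get_schema(rel))  # e.g. a:int,b:str
    print(fa.as_arrow(rel))    # pa.Table with plain (non-large) string types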
fugue_duckdb/execution_engine.py CHANGED
@@ -2,12 +2,11 @@ import logging
  from typing import Any, Dict, Iterable, List, Optional, Union

  import duckdb
- import pyarrow as pa
  from duckdb import DuckDBPyConnection, DuckDBPyRelation
  from triad import SerializableRLock
  from triad.collections.fs import FileSystem
- from triad.utils.schema import quote_name
  from triad.utils.assertion import assert_or_throw
+ from triad.utils.schema import quote_name

  from fugue import (
  ArrowDataFrame,
@@ -19,12 +18,7 @@ from fugue import (
  )
  from fugue.collections.partition import PartitionSpec, parse_presort_exp
  from fugue.collections.sql import StructuredRawSQL, TempTableName
- from fugue.dataframe import (
- DataFrame,
- DataFrames,
- LocalBoundedDataFrame,
- PandasDataFrame,
- )
+ from fugue.dataframe import DataFrame, DataFrames, LocalBoundedDataFrame
  from fugue.dataframe.utils import get_join_schemas

  from ._io import DuckDBIO
@@ -34,7 +28,7 @@ from ._utils import (
  encode_schema_names,
  encode_value_to_expr,
  )
- from .dataframe import DuckDataFrame
+ from .dataframe import DuckDataFrame, _duck_as_arrow

  _FUGUE_DUCKDB_PRAGMA_CONFIG_PREFIX = "fugue.duckdb.pragma."
  _FUGUE_DUCKDB_EXTENSIONS = "fugue.duckdb.extensions"
@@ -114,8 +108,8 @@ class DuckDBEngine(SQLEngine):
  conn = duckdb.connect()
  try:
  for k, v in dfs.items():
- duckdb.arrow(v.as_arrow(), connection=conn).create_view(k)
- return ArrowDataFrame(conn.execute(statement).arrow())
+ duckdb.from_arrow(v.as_arrow(), connection=conn).create_view(k)
+ return ArrowDataFrame(_duck_as_arrow(conn.execute(statement)))
  finally:
  conn.close()

@@ -235,7 +229,7 @@ class DuckExecutionEngine(ExecutionEngine):
  # TODO: we should create DuckDB table, but it has bugs, so can't use by 0.3.1
  if isinstance(df, DuckDataFrame):
  # materialize
- res: DataFrame = ArrowDataFrame(df.native.arrow())
+ res: DataFrame = ArrowDataFrame(df.as_arrow())
  else:
  res = self.to_df(df)
  res.reset_metadata(df.metadata)
@@ -545,19 +539,15 @@ def _to_duck_df(
  )
  if isinstance(df, DuckDataFrame):
  return df
-
- if isinstance(df, PandasDataFrame) and all(
- not pa.types.is_nested(f.type) for f in df.schema.fields
- ):
- rdf = DuckDataFrame(engine.connection.from_df(df.as_pandas()))
- else:
- rdf = DuckDataFrame(
- duckdb.arrow(df.as_arrow(), connection=engine.connection)
- )
+ rdf = DuckDataFrame(
+ duckdb.from_arrow(df.as_arrow(), connection=engine.connection)
+ )
  rdf.reset_metadata(df.metadata if df.has_metadata else None)
  return rdf
  tdf = ArrowDataFrame(df, schema)
- return DuckDataFrame(duckdb.arrow(tdf.native, connection=engine.connection))
+ return DuckDataFrame(
+ duckdb.from_arrow(tdf.native, connection=engine.connection)
+ )

  res = _gen_duck()
  if create_view:
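Throughout the DuckDB engine, `duckdb.arrow(...)` is replaced with `duckdb.from_arrow(...)` when registering Arrow data. A hedged sketch of the pattern, assuming a duckdb version that ships the module-level `from_arrow` wrapper:

    import duckdb
    import pyarrow as pa

    con = duckdb.connect()
    tbl = pa.table({"a": [1, 2, 3]})
    rel = duckdb.from_arrow(tbl, connection=con)  # register an Arrow table as a relation
    rel.create_view("t")
    print(con.execute("SELECT COUNT(*) AS n FROM t").fetchall())  # [(3,)]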
fugue_ibis/dataframe.py CHANGED
@@ -5,12 +5,7 @@ import pandas as pd
  import pyarrow as pa
  from triad import Schema, assert_or_throw

- from fugue import (
- DataFrame,
- IterableDataFrame,
- LocalBoundedDataFrame,
- to_local_bounded_df,
- )
+ from fugue import DataFrame, IterableDataFrame, LocalBoundedDataFrame
  from fugue.dataframe.dataframe import _input_schema
  from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
  from fugue.plugins import drop_columns, get_column_names, is_df, rename
@@ -153,7 +148,7 @@ class IbisDataFrame(DataFrame):
  ) -> LocalBoundedDataFrame:
  if columns is not None:
  return self[columns].head(n)
- return to_local_bounded_df(self._to_local_df(self._table.head(n)))
+ return self._to_local_df(self._table.head(n)).as_local_bounded()

  def _alter_table_columns(self, table: IbisTable, new_schema: Schema) -> IbisTable:
  fields: Dict[str, Any] = {}
fugue_notebook/env.py CHANGED
@@ -3,21 +3,16 @@ import html
  import json
  from typing import Any, Dict, List, Optional

- from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
  from IPython import get_ipython
+ from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
  from IPython.display import HTML, display
  from triad import ParamDict
  from triad.utils.convert import to_instance
  from triad.utils.pyarrow import _field_to_expression

- import fugue_sql
- from fugue import (
- DataFrame,
- DataFrameDisplay,
- ExecutionEngine,
- get_dataset_display,
- make_execution_engine,
- )
+ from fugue import DataFrame, DataFrameDisplay, ExecutionEngine
+ from fugue import fsql as fugue_sql
+ from fugue import get_dataset_display, make_execution_engine
  from fugue.dataframe import YieldedDataFrame
  from fugue.exceptions import FugueSQLSyntaxError

@@ -58,7 +53,7 @@ class _FugueSQLMagics(Magics):
  @cell_magic("fsql")
  def fsql(self, line: str, cell: str, local_ns: Any = None) -> None:
  try:
- dag = fugue_sql.fsql(
+ dag = fugue_sql(
  "\n" + cell, local_ns, fsql_ignore_case=self._fsql_ignore_case
  )
  except FugueSQLSyntaxError as ex:
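The notebook magic now calls fugue's built-in `fsql` entry point instead of importing the separate `fugue_sql` package. A minimal sketch of using that entry point outside the magic:

    from fugue import fsql

    # Build and run a tiny FugueSQL workflow on the default (native) engine.
    fsql("""
    CREATE [[0],[1]] SCHEMA a:int
    PRINT
    """).run()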
fugue_polars/_utils.py CHANGED
@@ -1,48 +1,8 @@
  import polars as pl
- import pyarrow as pa
  from triad import Schema
- from triad.utils.pyarrow import get_alter_func

  from fugue.dataframe.arrow_dataframe import _build_empty_arrow


- def pl_as_arrow(df: pl.DataFrame) -> pa.Table:
- adf = df.to_arrow()
- schema = convert_schema(adf.schema)
- func = get_alter_func(adf.schema, schema, safe=False)
- return func(adf)
-
-
- def to_schema(df: pl.DataFrame) -> Schema:
- return Schema(convert_schema(pl.DataFrame(schema=df.schema).to_arrow().schema))
-
-
  def build_empty_pl(schema: Schema) -> pl.DataFrame:
  return pl.from_arrow(_build_empty_arrow(schema))
-
-
- def convert_schema(schema: pa.Schema) -> pa.Schema:
- fields = [convert_field(f) for f in schema]
- return pa.schema(fields)
-
-
- def convert_field(field: pa.Field) -> pa.Field:
- tp = convert_type(field.type)
- if tp == field.type:
- return field
- print(field.type, tp)
- return pa.field(field.name, tp)
-
-
- def convert_type(tp: pa.DataType) -> pa.DataType:
- if pa.types.is_struct(tp):
- return pa.struct([convert_field(f) for f in tp])
- if pa.types.is_list(tp) or pa.types.is_large_list(tp):
- return pa.list_(convert_type(tp.value_type))
- if pa.types.is_map(tp): # pragma: no cover
- return pa.map_(convert_type(tp.key_type), convert_type(tp.value_type))
- if pa.types.is_large_string(tp):
- return pa.string()
- if pa.types.is_large_binary(tp):
- return pa.binary()
- return tp
fugue_polars/polars_dataframe.py CHANGED
@@ -6,9 +6,15 @@ import pyarrow as pa
  from triad.collections.schema import Schema
  from triad.exceptions import InvalidOperationError
  from triad.utils.assertion import assert_or_throw
+ from triad.utils.pyarrow import (
+ LARGE_TYPES_REPLACEMENT,
+ replace_types_in_schema,
+ replace_types_in_table,
+ )

  from fugue import ArrowDataFrame
  from fugue.api import (
+ as_arrow,
  drop_columns,
  get_column_names,
  get_schema,
@@ -28,7 +34,7 @@ from fugue.dataset.api import (
  )
  from fugue.exceptions import FugueDataFrameOperationError

- from ._utils import build_empty_pl, pl_as_arrow, to_schema
+ from ._utils import build_empty_pl


  class PolarsDataFrame(LocalBoundedDataFrame):
@@ -55,7 +61,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
  InvalidOperationError("can't reset schema for pl.DataFrame"),
  )
  self._native = df
- super().__init__(to_schema(df))
+ super().__init__(_get_pl_schema(df))

  @property
  def native(self) -> pl.DataFrame:
@@ -75,7 +81,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):

  def peek_dict(self) -> Dict[str, Any]:
  self.assert_not_empty()
- return dict(zip(self._native.columns, self._native.row(0)))
+ return self._native.row(0, named=True)

  def count(self) -> int:
  return self.native.shape[0]
@@ -107,7 +113,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
  return PolarsDataFrame(pl.from_arrow(adf.native))

  def as_arrow(self, type_safe: bool = False) -> pa.Table:
- return pl_as_arrow(self.native)
+ return _pl_as_arrow(self.native)

  def as_array(
  self, columns: Optional[List[str]] = None, type_safe: bool = False
@@ -121,7 +127,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
  self, columns: Optional[List[str]] = None, type_safe: bool = False
  ) -> Iterable[Any]:
  if not self.empty:
- yield from ArrowDataFrame(pl_as_arrow(self.native)).as_array_iterable(
+ yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
  columns=columns
  )

@@ -129,7 +135,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
  self, columns: Optional[List[str]] = None
  ) -> Iterable[Dict[str, Any]]:
  if not self.empty:
- yield from ArrowDataFrame(pl_as_arrow(self.native)).as_dict_iterable(
+ yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
  columns=columns
  )

@@ -144,6 +150,13 @@ def _pl_as_local_bounded(df: pl.DataFrame) -> pl.DataFrame:
  return df


+ @as_arrow.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_as_arrow(df: pl.DataFrame) -> pa.Table:
+ adf = df.to_arrow()
+ adf = replace_types_in_table(adf, LARGE_TYPES_REPLACEMENT)
+ return adf
+
+
  @is_df.candidate(lambda df: isinstance(df, pl.DataFrame))
  def _pl_is_df(df: pl.DataFrame) -> bool:
  return True
@@ -181,7 +194,9 @@ def _get_pl_columns(df: pl.DataFrame) -> List[Any]:

  @get_schema.candidate(lambda df: isinstance(df, pl.DataFrame))
  def _get_pl_schema(df: pl.DataFrame) -> Schema:
- return to_schema(df)
+ adf = df.to_arrow()
+ schema = replace_types_in_schema(adf.schema, LARGE_TYPES_REPLACEMENT)
+ return Schema(schema)


  @rename.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
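Both the Polars and DuckDB backends now delegate the large-type downgrade to triad helpers instead of hand-written converters. A sketch of what those helpers do, assuming a triad version that provides them (as this release requires):

    import pyarrow as pa
    from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table

    # large_string/large_binary columns are swapped for the plain Arrow variants,
    # so downstream fugue code sees a consistent schema.
    tbl = pa.table({"a": pa.array(["x", "y"], type=pa.large_string())})
    out = replace_types_in_table(tbl, LARGE_TYPES_REPLACEMENT)
    assert out.schema.field("a").type == pa.string()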
fugue_ray/_constants.py CHANGED
@@ -1,4 +1,6 @@
- from typing import Dict, Any
+ from typing import Any, Dict
+
+ import ray

  FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
  FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
@@ -10,3 +12,8 @@ FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
  FUGUE_RAY_DEFAULT_PARTITIONS: 0,
  FUGUE_RAY_ZERO_COPY: True,
  }
+
+ if ray.__version__ >= "2.3":
+ _ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
+ else: # pragma: no cover
+ _ZERO_COPY = {}
fugue_ray/_utils/dataframe.py CHANGED
@@ -1,11 +1,15 @@
  import pickle
- from typing import List, Optional, Tuple, Dict, Any
+ from typing import Any, Dict, List, Optional, Tuple

+ import pandas as pd
  import pyarrow as pa
  import ray.data as rd
- from fugue.dataframe.arrow_dataframe import _build_empty_arrow
  from triad import Schema

+ from fugue.dataframe.arrow_dataframe import _build_empty_arrow
+
+ from .._constants import _ZERO_COPY
+
  _RAY_NULL_REPR = "__RAY_NULL__"


@@ -15,6 +19,8 @@ def get_dataset_format(df: rd.Dataset) -> Optional[str]:
  return None
  if hasattr(df, "_dataset_format"): # pragma: no cover
  return df._dataset_format() # ray<2.2
+ ctx = rd.context.DatasetContext.get_current()
+ ctx.use_streaming_executor = False
  return df.dataset_format() # ray>=2.2


@@ -50,7 +56,7 @@ def add_partition_key(
  )

  return df.map_batches(
- add_simple_key, batch_format="pyarrow", **ray_remote_args
+ add_simple_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
  ), input_schema + (
  output_key,
  str,
@@ -67,8 +73,29 @@ def add_partition_key(
  return fdf.append_column(output_key, sarr)

  return df.map_batches(
- add_key, batch_format="pyarrow", **ray_remote_args
+ add_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
  ), input_schema + (
  output_key,
  pa.binary(),
  )
+
+
+ def add_coarse_partition_key(
+ df: rd.Dataset,
+ keys: List[str],
+ output_key: str,
+ bucket: int,
+ ) -> rd.Dataset:
+ ray_remote_args: Dict[str, Any] = {"num_cpus": 1}
+
+ def add_coarse_key(arrow_df: pa.Table) -> pa.Table: # pragma: no cover
+ hdf = arrow_df.select(keys).to_pandas()
+ _hash = pd.util.hash_pandas_object(hdf, index=False).mod(bucket)
+ return arrow_df.append_column(output_key, pa.Array.from_pandas(_hash))
+
+ return df.map_batches(
+ add_coarse_key,
+ batch_format="pyarrow",
+ **_ZERO_COPY,
+ **ray_remote_args,
+ )
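The new `add_coarse_partition_key` mirrors the Dask change: a hashed bucket column is appended to each Arrow batch and later used as the shuffle key. An illustrative standalone version of the per-batch step (names here are not fugue's):

    import pandas as pd
    import pyarrow as pa

    # Hash the key columns of one Arrow batch, bucket them, and append the result.
    tbl = pa.table({"k": ["a", "b", "a"], "v": [1, 2, 3]})
    h = pd.util.hash_pandas_object(tbl.select(["k"]).to_pandas(), index=False).mod(4)
    print(tbl.append_column("__key__", pa.Array.from_pandas(h)).to_pandas())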
fugue_ray/_utils/io.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
  import pyarrow as pa
  import ray.data as rd
  from fugue import ExecutionEngine
- from fugue._utils.io import FileParser, load_df, save_df
+ from fugue._utils.io import FileParser, save_df
  from fugue.collections.partition import PartitionSpec
  from fugue.dataframe import DataFrame
  from fugue_ray.dataframe import RayDataFrame
@@ -49,8 +49,6 @@ class RayIO(object):
  len(fmts) == 1, NotImplementedError("can't support multiple formats")
  )
  fmt = fmts[0]
- if fmt == "avro": # TODO: remove avro support
- return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
  files = [f.uri for f in fp]
  return self._loads[fmt](files, columns, **kwargs)
@@ -75,7 +73,7 @@ class RayIO(object):
  except Exception: # pragma: no cover
  pass
  p = FileParser(uri, format_hint)
- if not force_single and p.file_format != "avro":
+ if not force_single:
  df = self._prepartition(df, partition_spec=partition_spec)

  self._saves[p.file_format](df=df, uri=p.uri, **kwargs)