fugue 0.9.0.dev3__py3-none-any.whl → 0.9.0.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fugue/collections/sql.py CHANGED
@@ -15,7 +15,7 @@ class TempTableName:
     """Generating a temporary, random and globaly unique table name"""

    def __init__(self):
-        self.key = "_" + str(uuid4())[:5]
+        self.key = "_" + str(uuid4())[:5].upper()

    def __repr__(self) -> str:
        return _TEMP_TABLE_EXPR_PREFIX + self.key + _TEMP_TABLE_EXPR_SUFFIX
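For context, a minimal standalone sketch of the new key generation, presumably so the suffix matches backends that fold unquoted identifiers to upper case (the example value in the comment is illustrative):

from uuid import uuid4

# five hex characters of a UUID, upper-cased, e.g. "_3F0A9" instead of "_3f0a9"
key = "_" + str(uuid4())[:5].upper()
print(key)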
fugue/dataframe/utils.py CHANGED
@@ -21,22 +21,6 @@ normalize_dataframe_column_names = normalize_column_names
 rename_dataframe_column_names = rename


-def _pa_type_eq(t1: pa.DataType, t2: pa.DataType) -> bool:
-    # should ignore the name difference of list
-    # e.g. list<item: string> == list<l: string>
-    if pa.types.is_list(t1) and pa.types.is_list(t2):  # pragma: no cover
-        return _pa_type_eq(t1.value_type, t2.value_type)
-    return t1 == t2
-
-
-def _schema_eq(s1: Schema, s2: Schema) -> bool:
-    if s1 == s2:
-        return True
-    return s1.names == s2.names and all(
-        _pa_type_eq(f1.type, f2.type) for f1, f2 in zip(s1.fields, s2.fields)
-    )
-
-
 def _df_eq(
     df: DataFrame,
     data: Any,
@@ -46,6 +30,7 @@ def _df_eq(
     check_schema: bool = True,
     check_content: bool = True,
     no_pandas: bool = False,
+    equal_type_groups: Optional[List[List[Any]]] = None,
     throw=False,
 ) -> bool:
     """Compare if two dataframes are equal. Is for internal, unit test
@@ -66,6 +51,7 @@ def _df_eq(
     :param no_pandas: if true, it will compare the string representations of the
         dataframes, otherwise, it will convert both to pandas dataframe to compare,
         defaults to False
+    :param equal_type_groups: the groups to treat as equal types, defaults to None.
     :param throw: if to throw error if not equal, defaults to False
     :return: if they equal
     """
@@ -78,8 +64,8 @@ def _df_eq(
     assert (
         df1.count() == df2.count()
     ), f"count mismatch {df1.count()}, {df2.count()}"
-    assert not check_schema or _schema_eq(
-        df.schema, df2.schema
+    assert not check_schema or df.schema.is_like(
+        df2.schema, equal_groups=equal_type_groups
     ), f"schema mismatch {df.schema.pa_schema}, {df2.schema.pa_schema}"
     if not check_content:
         return True
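A hedged usage sketch of the new parameter. The call shape comes from the diff above; the predicate-style group value is an assumption about the format triad's Schema.is_like accepts:

import pyarrow as pa

from fugue.dataframe import ArrayDataFrame
from fugue.dataframe.utils import _df_eq

df1 = ArrayDataFrame([[1.0]], "a:float")   # float32 column
df2 = ArrayDataFrame([[1.0]], "a:double")  # float64 column

# grouping the floating types together (predicate callables here, an assumed
# format) lets the float32/float64 schema difference pass the check_schema step
_df_eq(df1, df2, equal_type_groups=[[pa.types.is_float32, pa.types.is_float64]], throw=True)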
fugue/test/plugins.py CHANGED
@@ -2,7 +2,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Type
-
+from fugue.dataframe.utils import _df_eq
 from triad import assert_or_throw, run_once
 from triad.utils.entry_points import load_entry_point

@@ -160,6 +160,7 @@ class FugueTestSuite:

     backend: Any
     tmp_path: Path
+    equal_type_groups: Any = None

     __test__ = False
     _test_context: Any = None
@@ -180,6 +181,15 @@ class FugueTestSuite:
         """The engine object inside the ``FugueTestContext``"""
         return self.context.engine

+    def get_equal_type_groups(self) -> Optional[List[List[Any]]]:
+        return None  # pragma: no cover
+
+    def df_eq(self, *args: Any, **kwargs: Any) -> bool:
+        """A wrapper of :func:`~fugue.dataframe.utils.df_eq`"""
+        if "equal_type_groups" not in kwargs:
+            kwargs["equal_type_groups"] = self.equal_type_groups
+        return _df_eq(*args, **kwargs)
+

 def fugue_test_suite(backend: Any, mark_test: Optional[bool] = None) -> Any:
     def deco(cls: Type["FugueTestSuite"]) -> Type["FugueTestSuite"]:
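A hypothetical suite sketch showing how the class-level default reaches comparisons (the subclass name and group value are illustrative, not part of this release):

import pyarrow as pa
from fugue.test.plugins import FugueTestSuite

class MySuite(FugueTestSuite):
    # picked up by df_eq whenever the caller does not pass equal_type_groups;
    # the predicate-style value mirrors the assumed Schema.is_like format
    equal_type_groups = [[pa.types.is_float32, pa.types.is_float64]]

# inside a test, self.df_eq(actual, expected) then behaves like
# _df_eq(actual, expected, equal_type_groups=MySuite.equal_type_groups)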
fugue-0.9.0.dev3.dist-info/METADATA → fugue-0.9.0.dev4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.9.0.dev3
+Version: 0.9.0.dev4
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: triad >=0.9.4
+Requires-Dist: triad >=0.9.6
 Requires-Dist: adagio >=0.2.4
 Provides-Extra: all
 Requires-Dist: qpd >=0.4.4 ; extra == 'all'
@@ -30,7 +30,7 @@ Requires-Dist: jinja2 ; extra == 'all'
 Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
 Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
 Requires-Dist: dask-sql ; extra == 'all'
-Requires-Dist: ray[data] >=2.4.0 ; extra == 'all'
+Requires-Dist: ray[data] >=2.5.0 ; extra == 'all'
 Requires-Dist: notebook ; extra == 'all'
 Requires-Dist: jupyterlab ; extra == 'all'
 Requires-Dist: ipython >=7.10.0 ; extra == 'all'
@@ -45,6 +45,7 @@ Provides-Extra: dask
 Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'dask'
 Requires-Dist: pyarrow >=7.0.0 ; extra == 'dask'
 Requires-Dist: pandas >=2.0.2 ; extra == 'dask'
+Requires-Dist: dask[dataframe,distributed] >=2024.4.0 ; (python_version >= "3.11.9") and extra == 'dask'
 Provides-Extra: duckdb
 Requires-Dist: qpd >=0.4.4 ; extra == 'duckdb'
 Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'duckdb'
@@ -58,6 +59,7 @@ Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'ibis'
 Requires-Dist: sqlglot ; extra == 'ibis'
 Requires-Dist: jinja2 ; extra == 'ibis'
 Requires-Dist: ibis-framework ; extra == 'ibis'
+Requires-Dist: pandas <2.2 ; extra == 'ibis'
 Provides-Extra: notebook
 Requires-Dist: notebook ; extra == 'notebook'
 Requires-Dist: jupyterlab ; extra == 'notebook'
@@ -65,9 +67,9 @@ Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
 Provides-Extra: polars
 Requires-Dist: polars ; extra == 'polars'
 Provides-Extra: ray
-Requires-Dist: ray[data] >=2.4.0 ; extra == 'ray'
+Requires-Dist: ray[data] >=2.5.0 ; extra == 'ray'
 Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
-Requires-Dist: pyarrow >=6.0.1 ; extra == 'ray'
+Requires-Dist: pyarrow >=7.0.0 ; extra == 'ray'
 Requires-Dist: pandas <2.2 ; extra == 'ray'
 Provides-Extra: spark
 Requires-Dist: pyspark >=3.1.1 ; extra == 'spark'
fugue-0.9.0.dev3.dist-info/RECORD → fugue-0.9.0.dev4.dist-info/RECORD CHANGED
@@ -18,7 +18,7 @@ fugue/bag/array_bag.py,sha256=b0UdDPmZpEAI3R0SBbZVOLVLAwMQnBCFeYDEpFWen14,1111
 fugue/bag/bag.py,sha256=sNBAzPmEh5fEm8ME8NEEOOre6l58ri6oouVBWwafqTc,3018
 fugue/collections/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/collections/partition.py,sha256=rPkU-3y6E598Q7wvE-jTSbSwWh3fzIVxdwPpbQvWS-M,17257
-fugue/collections/sql.py,sha256=mXxgOr_BAwPWb5DAd0PZuNtCWpMWxIFp8uulVikvlZ8,4947
+fugue/collections/sql.py,sha256=3MjnuQMPuUMq55n-EypikkRqcpOCZtOjp7S2fs7ujAA,4955
 fugue/collections/yielded.py,sha256=KAvCXAZpeuErGww7Y217_F7M2zv9G5hfdl2AWiO7wEM,2040
 fugue/column/__init__.py,sha256=aoZwwzyJtNL-duLxzU2sNGoaKikWd-yesbigE_Wj29s,208
 fugue/column/expressions.py,sha256=fdGX9oPCqJBuROFZqrOYVcwkjghdXT9ngaSTG5tW_i8,26544
@@ -34,7 +34,7 @@ fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs
 fugue/dataframe/function_wrapper.py,sha256=V1eQMOn27UroEYT7_YiwoEF0RjZYIM0zkD3vfaMAQFs,14813
 fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
 fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
-fugue/dataframe/utils.py,sha256=shN1eHYTnPhb38BHEpLlCdLSzX_qpoQ3-fsDgu1hCzQ,10840
+fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488
 fugue/dataset/__init__.py,sha256=5f2CAJ4xst6Z2o9Q2e2twfDOGUw8ZJoE2ild4JEU2pg,112
 fugue/dataset/api.py,sha256=DacI4L2w5NJ-eZ6nFxNMqmReEnb0WUXswbjVp7BeErk,2794
 fugue/dataset/dataset.py,sha256=jWXZqy3msMPFFkhas2PYJEX55ZAI3gk3Txq5f4-Qya4,4759
@@ -73,7 +73,7 @@ fugue/sql/api.py,sha256=l2I9CAy_W2oFFTct9fDPLyXF0LiDxQhMx5O8jBHTAxU,10050
 fugue/sql/workflow.py,sha256=S1pOhp0b0t6johFAJWmj6xUB7Ti5LQgNABpAzmLGjrQ,3010
 fugue/test/__init__.py,sha256=hvVrNbJYkWI_6otpILneyTjUafxURaA4obK6AoDyCUw,250
 fugue/test/pandas_tester.py,sha256=_w6rFqlzZKjBtmFf-08a4C97W5xtqGw5XorLhj6Zyoo,622
-fugue/test/plugins.py,sha256=VlVTAOiz2vnOI0G5CB408Bm77HMIcJS0gCQIh8Wb3b0,11866
+fugue/test/plugins.py,sha256=GLZia5GCmy0eBVGNbIqTbX7Ou3euf2SY4litKgdigwY,12318
 fugue/workflow/__init__.py,sha256=tXM_KYO8Q358W6qAVlwhIQIaYNRDgZtTubrIEX4QMgM,229
 fugue/workflow/_checkpoint.py,sha256=tt5Iv7c5ZStC0MD1inItksQ0GuK0ViniA3nvrgym-5c,5681
 fugue/workflow/_tasks.py,sha256=Zq_jXJO_VaF8DrWUuBiwO2Y3OVuhsiOQdzP4VBsp7Fo,11826
@@ -89,14 +89,14 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
 fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
 fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
 fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
-fugue_dask/_io.py,sha256=HmL3Q2lRSptX1-GwiB3MN2VpjTRfmVKD8TDZkhS4x5c,5818
-fugue_dask/_utils.py,sha256=n70N3wPPMz13Jh0GWJM3Je-TCYpU36yGP_YCwIHqUrc,8908
+fugue_dask/_io.py,sha256=pl4F7mbVgP7Rwh1FFG7xfOz2TBZRUj1l3lLvDY4jOf4,6020
+fugue_dask/_utils.py,sha256=1uplEqvpCDZDp2YdwJxa6cuGScpgG9VvN3057J02bys,8956
 fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
 fugue_dask/execution_engine.py,sha256=60IiwYRBVhN-pX3v6i9BZ8Pa4bcSh5UoklvCScM_XAM,21361
 fugue_dask/registry.py,sha256=jepWKH55VWNIWV3pOF5vpCl2OpO0rI1IULx5GM2Gk6w,2274
 fugue_dask/tester.py,sha256=E7BZjgFpJgrHsLMKzvSO5im5OwocYcratjzulJSQZl0,718
 fugue_duckdb/__init__.py,sha256=ZzhmAWbROR1YL9Kmlt7OlwkgPZzFhsSdwLV2pFmAqGI,268
-fugue_duckdb/_io.py,sha256=E35_GoD1uGuuAMOY4H8E2j-UazdAgTmLp4lLWqJrNsE,8437
+fugue_duckdb/_io.py,sha256=vnd8m8C6XeMCBJBbAdA5h695NMfsduQrvONyS0HcEFA,8475
 fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
 fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
 fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
@@ -107,7 +107,7 @@ fugue_ibis/__init__.py,sha256=z7TkK7M2_0p9XO6jQATNDgT0aHXn5k69Ttz2ga-eQG8,190
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
 fugue_ibis/dataframe.py,sha256=k4Q6qBLBIADF5YhbvaDplXO7OkMZSHuf_Wg5o-AusEI,7796
-fugue_ibis/execution_engine.py,sha256=cBU71BK6JuG_CAvPOZLhVetccwz1JAwX_x1iZ7SEgA4,18366
+fugue_ibis/execution_engine.py,sha256=5I-ou5xPdomVu-srdvidvP8f7wDYbGrCV_lGffZa_ac,18679
 fugue_notebook/__init__.py,sha256=9r_-2uxu1lBeZ8GgpYCKom_OZy2soIOYZajg7JDO-HY,4326
 fugue_notebook/env.py,sha256=TYiTxYPFi-BVJJY49jDsvw9mddhK8WrifeRxBke30I8,4773
 fugue_notebook/nbextension/README.md,sha256=QLnr957YeGfwzy2r4c4qbZPaXyCbyGrKPvcqSBQYSnU,123
@@ -119,15 +119,15 @@ fugue_polars/_utils.py,sha256=7rGGWgB1-VqFwh4PcBLYk_5VNjd8FNOS4TDFyDVz2sg,159
 fugue_polars/polars_dataframe.py,sha256=8LQ0IB-JFFdjW2ltDzq8DfIbUC_jjjDr1YM29usJag0,8831
 fugue_polars/registry.py,sha256=gd6qQ-OxYtTAQFyvYbLDPXmSvCR-LW6n5K5ylgMY_7A,2950
 fugue_ray/__init__.py,sha256=HzEHfG2mpc0ugf3nf1Pdy15Bhg35K6maZpYejn1aoyI,119
-fugue_ray/_constants.py,sha256=vu5l1w-Wi-2V_nm0HLXKOYhh5HdWRCc5yQktO2XzhOg,569
+fugue_ray/_constants.py,sha256=RHlaVKyjQnwdbo5mFO_GBtQZcz5GvWcCbkOkLfVTQ1A,565
 fugue_ray/dataframe.py,sha256=7asw2qf9vm6vLBSzqghm9pUcNAppJOz5CkT7XyR0S5g,12514
-fugue_ray/execution_engine.py,sha256=NT_mnacijp1zskFbtganUwA3JNRPU-FNNvJswA6U_Yg,12607
+fugue_ray/execution_engine.py,sha256=PZlWbmdCwTPfZJhN2I-44JW7so8NVCFFumaKIhJLfoI,12566
 fugue_ray/registry.py,sha256=TS-HWy2IUozp6_A0vqc8_ZdVUT_Z9yVjG6e1gbbgy2A,1757
 fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
-fugue_ray/_utils/dataframe.py,sha256=_EadzS4rPom1A_cF0pqoPlwrNYZTfTwcyyu86_fFsqU,4400
-fugue_ray/_utils/io.py,sha256=fj6aVt2qnFRGCm7SkHOd25KKXU3kd8yO18Z6y9Sib20,8862
+fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
+fugue_ray/_utils/io.py,sha256=3hFNDeBuh4bfCud40ZsGrGZLSvCSuxL_1VlqCTnn6RA,9794
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
@@ -143,14 +143,14 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=uE5cP8PBT-VLG0OXbdCj-gVu5VyDrq7if8tJb2fX2Pg,77940
-fugue_test/dataframe_suite.py,sha256=YhircCw7Le27rESYTWiRoS5rOpcnmNc83UFK7F2_HI4,19029
-fugue_test/execution_suite.py,sha256=RyDJeXjqXqPMaRtdetcKnIAAFl3bA8eOjx3mZm36AZI,47969
+fugue_test/builtin_suite.py,sha256=cOkZG6w1RHhWWxtjQhZClZQaGT6haNd576BoUmNC_cA,77960
+fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
+fugue_test/execution_suite.py,sha256=jcSSoKqTGbeWzTxkyYU-8i2zJAjzuXn7BqE8ul-JjIc,48646
 fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
 fugue_version/__init__.py,sha256=H9NWRZb7NbeRRPLP_V1fARmLNXranorVM-OOY-8_2ug,22
-fugue-0.9.0.dev3.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-fugue-0.9.0.dev3.dist-info/METADATA,sha256=giFBHER9khAwE2zSWd0YwgWzJpjXBezFrmQE4Jor8b0,18235
-fugue-0.9.0.dev3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-fugue-0.9.0.dev3.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
-fugue-0.9.0.dev3.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
-fugue-0.9.0.dev3.dist-info/RECORD,,
+fugue-0.9.0.dev4.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+fugue-0.9.0.dev4.dist-info/METADATA,sha256=smbI6QuuMajmoMhJ14Y4MUs2mGpb4onc6kImR83D9DQ,18385
+fugue-0.9.0.dev4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+fugue-0.9.0.dev4.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
+fugue-0.9.0.dev4.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.9.0.dev4.dist-info/RECORD,,
fugue-0.9.0.dev3.dist-info/WHEEL → fugue-0.9.0.dev4.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

fugue_dask/_io.py CHANGED
@@ -6,7 +6,7 @@ from fsspec import AbstractFileSystem
 from triad.collections.dict import ParamDict
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw
-from triad.utils.io import join, makedirs, url_to_fs
+from triad.utils.io import isfile, join, makedirs, url_to_fs

 from fugue._utils.io import FileParser, _get_single_files
 from fugue_dask.dataframe import DaskDataFrame
@@ -100,9 +100,11 @@ def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:


 def _safe_load_csv(path: str, **kwargs: Any) -> dd.DataFrame:
+    if not isfile(path):
+        return dd.read_csv(join(path, "*.csv"), **kwargs)
     try:
         return dd.read_csv(path, **kwargs)
-    except (IsADirectoryError, PermissionError):
+    except (IsADirectoryError, PermissionError):  # pragma: no cover
         return dd.read_csv(join(path, "*.csv"), **kwargs)


@@ -148,11 +150,12 @@ def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:


 def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
+    if not isfile(path):
+        return dd.read_json(join(path, "*.json"), **kwargs)
     try:
         return dd.read_json(path, **kwargs)
-    except (IsADirectoryError, PermissionError):
-        x = dd.read_json(join(path, "*.json"), **kwargs)
-        return x
+    except (IsADirectoryError, PermissionError):  # pragma: no cover
+        return dd.read_json(join(path, "*.json"), **kwargs)


 def _load_json(
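The new control flow, distilled into a standalone sketch (load_csv_any is a hypothetical name; isfile and join are the triad helpers imported above). Directories, such as the output of a multi-partition save, are now globbed up front instead of only after catching IsADirectoryError, which is presumably why the except branch became a no-cover pragma:

import dask.dataframe as dd
from triad.utils.io import isfile, join

def load_csv_any(path: str, **kwargs) -> dd.DataFrame:
    if not isfile(path):  # a directory of part files: glob it directly
        return dd.read_csv(join(path, "*.csv"), **kwargs)
    return dd.read_csv(path, **kwargs)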
fugue_dask/_utils.py CHANGED
@@ -53,7 +53,7 @@ def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFram
     if num < 1:
         return df
     if num == 1:
-        return df.repartition(1)
+        return df.repartition(npartitions=1)
     df = df.reset_index(drop=True).clear_divisions()
     idf, ct = _add_hash_index(df, num, cols)
     return _postprocess(idf, ct, num)
@@ -76,7 +76,7 @@ def even_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFram
     the number of partitions will be the number of groups.
     """
     if num == 1:
-        return df.repartition(1)
+        return df.repartition(npartitions=1)
     if len(cols) == 0 and num <= 0:
         return df
     df = df.reset_index(drop=True).clear_divisions()
@@ -111,7 +111,7 @@ def rand_repartition(
     if num < 1:
         return df
     if num == 1:
-        return df.repartition(1)
+        return df.repartition(npartitions=1)
     df = df.reset_index(drop=True).clear_divisions()
     if len(cols) == 0:
         idf, ct = _add_random_index(df, num=num, seed=seed)
@@ -124,7 +124,7 @@ def rand_repartition(
 def _postprocess(idf: dd.DataFrame, ct: int, num: int) -> dd.DataFrame:
     parts = min(ct, num)
     if parts <= 1:
-        return idf.repartition(1)
+        return idf.repartition(npartitions=1)
     divisions = list(np.arange(ct, step=math.ceil(ct / parts)))
     divisions.append(ct - 1)
     return idf.repartition(divisions=divisions, force=True)
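A short runnable illustration of why the keyword form matters: repartition takes several optional parameters (divisions historically comes first), so a bare positional 1 is ambiguous and is not treated as a partition count by every dask version:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=4)
single = ddf.repartition(npartitions=1)  # explicit keyword, version-safe
assert single.npartitions == 1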
fugue_duckdb/_io.py CHANGED
@@ -140,6 +140,7 @@ class DuckDBIO:
         else:
             if header:
                 kw["ALL_VARCHAR"] = 1
+            kw["auto_detect"] = 1
         if columns is None:
             cols = "*"
         elif isinstance(columns, list):
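For context, a hedged sketch of what the added option does at the DuckDB level (the file path is a placeholder; both option names mirror the kw dict built above). ALL_VARCHAR=1 alone forces string types, and passing auto_detect=1 explicitly presumably keeps CSV dialect and header sniffing enabled alongside it:

import duckdb

con = duckdb.connect()
# 'data.csv' is a placeholder path, not part of this release
rel = con.sql("SELECT * FROM read_csv('data.csv', ALL_VARCHAR=1, auto_detect=1)")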
fugue_ibis/execution_engine.py CHANGED
@@ -23,8 +23,8 @@ from ._compat import IbisTable
 from ._utils import to_ibis_schema
 from .dataframe import IbisDataFrame

-_JOIN_RIGHT_SUFFIX = "_ibis_y__"
-_GEN_TABLE_NAMES = (f"_fugue_temp_table_{i:d}" for i in itertools.count())
+_JOIN_RIGHT_SUFFIX = "_ibis_y__".upper()
+_GEN_TABLE_NAMES = (f"_fugue_temp_table_{i:d}".upper() for i in itertools.count())


 class IbisSQLEngine(SQLEngine):
@@ -224,7 +224,7 @@ class IbisSQLEngine(SQLEngine):
             _presort = parse_presort_exp(presort)
         else:
             _presort = partition_spec.presort
-        tbn = "_temp"
+        tbn = "_TEMP"
         idf = self.to_df(df)

         if len(_presort) == 0:
@@ -233,9 +233,10 @@ class IbisSQLEngine(SQLEngine):
             pcols = ", ".join(
                 self.encode_column_name(x) for x in partition_spec.partition_by
             )
+            dummy_order_by = self._dummy_window_order_by()
             sql = (
                 f"SELECT * FROM ("
-                f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols}) "
+                f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols} {dummy_order_by}) "
                 f"AS __fugue_take_param FROM {tbn}"
                 f") WHERE __fugue_take_param<={n}"
             )
@@ -290,6 +291,12 @@ class IbisSQLEngine(SQLEngine):
     def load_table(self, table: str, **kwargs: Any) -> DataFrame:
         return self.to_df(self.backend.table(table))

+    def _dummy_window_order_by(self) -> str:
+        """Return a dummy window order by clause, this is required for
+        some SQL backends when there is no real order by clause in window
+        """
+        return ""
+

 class IbisMapEngine(MapEngine):
     """IbisExecutionEngine's MapEngine, it is a wrapper of the map engine
fugue_ray/_constants.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Any, Dict

 import ray
+from packaging import version

 FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
 FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
@@ -12,8 +13,6 @@ FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
     FUGUE_RAY_DEFAULT_PARTITIONS: 0,
     FUGUE_RAY_ZERO_COPY: True,
 }
+RAY_VERSION = version.parse(ray.__version__)

-if ray.__version__ >= "2.3":
-    _ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
-else:  # pragma: no cover
-    _ZERO_COPY = {}
+_ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
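This switch matters because plain string comparison orders versions lexicographically and breaks once minor versions reach double digits; a quick runnable check:

from packaging import version

assert "2.10.0" < "2.3.0"  # string compare: "1" < "3", wrong answer
assert version.parse("2.10.0") > version.parse("2.3.0")  # semantic compare, correct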
fugue_ray/_utils/dataframe.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple

 import pandas as pd
 import pyarrow as pa
-import ray
 import ray.data as rd
 from triad import Schema

@@ -31,31 +30,21 @@ def get_dataset_format(df: rd.Dataset) -> Tuple[Optional[str], rd.Dataset]:
     df = materialize(df)
     if df.count() == 0:
         return None, df
-    if ray.__version__ < "2.5.0":  # pragma: no cover
-        if hasattr(df, "_dataset_format"):  # pragma: no cover
-            return df._dataset_format(), df  # ray<2.2
-        ctx = rd.context.DatasetContext.get_current()
-        ctx.use_streaming_executor = False
-        return df.dataset_format(), df  # ray>=2.2
-    else:
-        schema = df.schema(fetch_if_missing=True)
-        if schema is None:  # pragma: no cover
-            return None, df
-        if isinstance(schema.base_schema, pa.Schema):
-            return "arrow", df
-        return "pandas", df
+    schema = df.schema(fetch_if_missing=True)
+    if schema is None:  # pragma: no cover
+        return None, df
+    if isinstance(schema.base_schema, pa.Schema):
+        return "arrow", df
+    return "pandas", df


 def to_schema(schema: Any) -> Schema:  # pragma: no cover
     if isinstance(schema, pa.Schema):
         return Schema(schema)
-    if ray.__version__ >= "2.5.0":
-        if isinstance(schema, rd.Schema):
-            if hasattr(schema, "base_schema") and isinstance(
-                schema.base_schema, pa.Schema
-            ):
-                return Schema(schema.base_schema)
-            return Schema(list(zip(schema.names, schema.types)))
+    if isinstance(schema, rd.Schema):
+        if hasattr(schema, "base_schema") and isinstance(schema.base_schema, pa.Schema):
+            return Schema(schema.base_schema)
+        return Schema(list(zip(schema.names, schema.types)))
     raise ValueError(f"{schema} is not supported")

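The simplified detection as a standalone sketch, assuming a local Ray runtime; base_schema is the attribute the diff relies on for Ray >= 2.5:

import pyarrow as pa
import ray.data as rd

ds = rd.from_items([{"a": 1}])
schema = ds.schema(fetch_if_missing=True)
fmt = "arrow" if isinstance(schema.base_schema, pa.Schema) else "pandas"
print(fmt)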
fugue_ray/_utils/io.py CHANGED
@@ -3,15 +3,15 @@ import pathlib
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union

 import pyarrow as pa
-import ray
 import ray.data as rd
+from packaging import version
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
 from ray.data.datasource import FileExtensionFilter
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
-from triad.utils.io import exists, makedirs, rm
+from triad.utils.io import exists, makedirs, rm, isfile

 from fugue import ExecutionEngine
 from fugue._utils.io import FileParser, save_df
@@ -19,6 +19,8 @@ from fugue.collections.partition import PartitionSpec
 from fugue.dataframe import DataFrame
 from fugue_ray.dataframe import RayDataFrame

+from .._constants import RAY_VERSION
+

 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
@@ -149,6 +151,18 @@ class RayIO(object):
         if infer_schema and columns is not None and not isinstance(columns, list):
             raise ValueError("can't set columns as a schema when infer schema is true")

+        if RAY_VERSION >= version.parse("2.10"):
+            if len(p) == 1 and isfile(p[0]):  # TODO: very hacky
+                params: Dict[str, Any] = {}
+            else:
+                params = {"file_extensions": ["csv"]}
+        else:  # pragma: no cover
+            params = {
+                "partition_filter": _FileFiler(
+                    file_extensions=["csv"], exclude=["_SUCCESS"]
+                ),
+            }
+
         def _read_csv(to_str: bool) -> RayDataFrame:
             res = rd.read_csv(
                 p,
@@ -156,9 +170,7 @@ class RayIO(object):
                 read_options=pacsv.ReadOptions(**read_options),
                 parse_options=pacsv.ParseOptions(**parse_options),
                 convert_options=pacsv.ConvertOptions(**convert_options),
-                partition_filter=_FileFiler(
-                    file_extensions=["csv"], exclude=["_SUCCESS"]
-                ),
+                **params,
             )
             if to_str:
                 _schema = res.schema(fetch_if_missing=True)
@@ -196,20 +208,31 @@ class RayIO(object):
         read_options: Dict[str, Any] = {"use_threads": False}
         parse_options: Dict[str, Any] = {}

-        def _read_json() -> RayDataFrame:
-            if ray.__version__ >= "2.9":
-                params: Dict[str, Any] = {"file_extensions": None}
+        def _read_json() -> RayDataFrame:  # pragma: no cover
+            if RAY_VERSION >= version.parse("2.10"):
+                if len(p) == 1 and isfile(p[0]):  # TODO: very hacky
+                    params: Dict[str, Any] = {"file_extensions": None}
+                else:
+                    params = {"file_extensions": ["json"]}
+            elif RAY_VERSION >= version.parse("2.9"):  # pragma: no cover
+                params = {
+                    "file_extensions": None,
+                    "partition_filter": _FileFiler(
+                        file_extensions=["json"], exclude=["_SUCCESS"]
+                    ),
+                }
             else:  # pragma: no cover
-                params = {}
+                params = {
+                    "partition_filter": _FileFiler(
+                        file_extensions=["json"], exclude=["_SUCCESS"]
+                    ),
+                }
             return RayDataFrame(
                 rd.read_json(
                     p,
                     ray_remote_args=self._remote_args(),
                     read_options=pajson.ReadOptions(**read_options),
                     parse_options=pajson.ParseOptions(**parse_options),
-                    partition_filter=_FileFiler(
-                        file_extensions=["json"], exclude=["_SUCCESS"]
-                    ),
                     **params,
                 )
             )
@@ -227,7 +250,7 @@ class RayIO(object):
         return {"num_cpus": 1}


-class _FileFiler(FileExtensionFilter):
+class _FileFiler(FileExtensionFilter):  # pragma: no cover
     def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
         super().__init__(file_extensions, allow_if_no_extension=True)
         self._exclude = set(exclude)
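The CSV version gate distilled into a standalone helper (the function name and the return shape are illustrative; pre-2.10 the real code builds a _FileFiler instance instead of the placeholder string):

from typing import Any, Dict, List
from packaging import version

def csv_read_params(ray_version: str, paths: List[str], single_file: bool) -> Dict[str, Any]:
    if version.parse(ray_version) >= version.parse("2.10"):
        # a direct file path needs no extension filter at all
        return {} if len(paths) == 1 and single_file else {"file_extensions": ["csv"]}
    return {"partition_filter": "<_FileFiler instance>"}  # placeholder

print(csv_read_params("2.10.0", ["s3://bucket/out"], single_file=False))
# -> {'file_extensions': ['csv']}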
fugue_ray/execution_engine.py CHANGED
@@ -191,8 +191,7 @@ class RayMapEngine(MapEngine):
             mb_args["batch_size"] = self.conf.get_or_throw(
                 FUGUE_RAY_DEFAULT_BATCH_SIZE, int
             )
-        if ray.__version__ >= "2.3":
-            mb_args["zero_copy_batch"] = self.conf.get(FUGUE_RAY_ZERO_COPY, True)
+        mb_args["zero_copy_batch"] = self.conf.get(FUGUE_RAY_ZERO_COPY, True)
         sdf = rdf.native.map_batches(
             _udf,
             batch_format="pyarrow",
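With ray >= 2.5.0 now the floor (see the METADATA change above), the zero_copy_batch flag can be passed unconditionally. A minimal runnable illustration of the call shape, assuming a local Ray runtime (the identity mapper is illustrative):

import ray.data as rd

ds = rd.range(10)
# zero_copy_batch avoids defensive copies when batches are arrow tables
out = ds.map_batches(lambda b: b, batch_format="pyarrow", zero_copy_batch=True)
print(out.count())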