fugue 0.8.7.dev4__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl
This diff shows the differences between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- fugue/api.py +1 -0
- fugue/dataframe/api.py +51 -15
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +72 -4
- fugue/execution/execution_engine.py +1 -1
- fugue/execution/native_execution_engine.py +1 -1
- fugue/plugins.py +1 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/METADATA +5 -4
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/RECORD +30 -30
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/entry_points.txt +1 -1
- fugue_dask/_io.py +5 -0
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_duckdb/dataframe.py +87 -29
- fugue_ibis/dataframe.py +13 -0
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/dataframe.py +71 -19
- fugue_spark/_utils/convert.py +32 -7
- fugue_spark/_utils/io.py +3 -1
- fugue_spark/dataframe.py +94 -22
- fugue_spark/execution_engine.py +7 -3
- fugue_test/builtin_suite.py +1 -1
- fugue_test/dataframe_suite.py +14 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/top_level.txt +0 -0
{fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
 fugue/__init__.py,sha256=xT5zuNZfRkjbA8a-uTT5oLK6hLGuezGZLWYBl6eS5J4,2749
-fugue/api.py,sha256=
+fugue/api.py,sha256=dLUrigFhDMB5x7cvlWSK8EyaY2o0AmhgPr0VRtfzSz0,1254
 fugue/constants.py,sha256=crd0VqX8WtBcjSUNwZDi2LDIEkhUMWOlSn73H8JI9ds,3385
 fugue/dev.py,sha256=GQCkezBBl4V0lVDWhGtUQKqomiCxgR9dMhfqj9C8cS8,1369
 fugue/exceptions.py,sha256=ylP8gkZL8ao_ZLinNYKv16FPyO_n7c29dN-4QChUxi0,1544
-fugue/plugins.py,sha256=
+fugue/plugins.py,sha256=kao-H5z-cRbujBKW1QC9IHUOBKxXMhpVQ6saIE7cXm8,1012
 fugue/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/registry.py,sha256=SNULGv08f37fRO-cIxFDmnVcod7ref2fNLSK6G7nVnI,868
 fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,24 +25,24 @@ fugue/column/expressions.py,sha256=fdGX9oPCqJBuROFZqrOYVcwkjghdXT9ngaSTG5tW_i8,2
 fugue/column/functions.py,sha256=ygLyn2gp5lTdGbYqJXeGeMmRNhbm4-vfJvAY_Zt0pb0,9774
 fugue/column/sql.py,sha256=s_qTtHgnvRFqjhCWr7s595PTrHM-Pr9zHUQfU5xcTVA,17391
 fugue/dataframe/__init__.py,sha256=zm7TbsaJLIvfm7zymWm2LGcuJd3nxfGsFnQiyrSnenM,678
-fugue/dataframe/api.py,sha256=
+fugue/dataframe/api.py,sha256=aWBvMaiSUxOvdQMfe79zHShWuPfLcgiWggC9HvVxvSE,11017
 fugue/dataframe/array_dataframe.py,sha256=4scWnmQ6sjy1A6o7IYdRc0VVutBEfcJrA1f9wkph4Kg,4440
-fugue/dataframe/arrow_dataframe.py,sha256=
-fugue/dataframe/dataframe.py,sha256=
+fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvrwRp_c,11485
+fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
 fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=
-fugue/dataframe/iterable_dataframe.py,sha256=
-fugue/dataframe/pandas_dataframe.py,sha256=
-fugue/dataframe/utils.py,sha256=
+fugue/dataframe/function_wrapper.py,sha256=V1eQMOn27UroEYT7_YiwoEF0RjZYIM0zkD3vfaMAQFs,14813
+fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
+fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
+fugue/dataframe/utils.py,sha256=VS1qLCr-9NEcEjaK-219rADJadDf6EfzYZCGRUpn1fY,11405
 fugue/dataset/__init__.py,sha256=5f2CAJ4xst6Z2o9Q2e2twfDOGUw8ZJoE2ild4JEU2pg,112
 fugue/dataset/api.py,sha256=DacI4L2w5NJ-eZ6nFxNMqmReEnb0WUXswbjVp7BeErk,2794
 fugue/dataset/dataset.py,sha256=jWXZqy3msMPFFkhas2PYJEX55ZAI3gk3Txq5f4-Qya4,4759
 fugue/execution/__init__.py,sha256=iZGxAznZz9piM3k4gp0tln97MDIBxdliLyNbD-0Zc48,427
 fugue/execution/api.py,sha256=KsFOLGdWQMlXmlQ5JRgRsbUeB64qzTVHxSEaunjiojo,39818
-fugue/execution/execution_engine.py,sha256=
+fugue/execution/execution_engine.py,sha256=G_SsTmcuDcy6_azi_88lGzsOodiizu0JdWxebxgbqRg,47721
 fugue/execution/factory.py,sha256=5ICzfNh2QqqABuVyYLijY5-7LZgfRqczlaZN32p78bE,21003
-fugue/execution/native_execution_engine.py,sha256=
+fugue/execution/native_execution_engine.py,sha256=Mm9BVC3dEMS3IWRZe4YvGKp6_mmW7dLmoLMK5HgAPcs,14408
 fugue/extensions/__init__.py,sha256=y-uLKd6mZ8sZ_8-OdW6ELoBO_9IfC0gDmEbE_rMCvOA,599
 fugue/extensions/_utils.py,sha256=Bi3pYKy2Z6fG6_5BpwIWldxetassXpB4Zp8QamWB-wg,5173
 fugue/extensions/context.py,sha256=c_y2UttzzIFoQTOCV42VCdj2nqah33xYuBjbKNIOpx8,4262
@@ -86,9 +86,9 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
 fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
 fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
 fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
-fugue_dask/_io.py,sha256=
-fugue_dask/_utils.py,sha256=
-fugue_dask/dataframe.py,sha256=
+fugue_dask/_io.py,sha256=9G516yM6zQvSC5_JA6qHb3LwBDmhWcxK5sjFHrQ81zo,6012
+fugue_dask/_utils.py,sha256=n70N3wPPMz13Jh0GWJM3Je-TCYpU36yGP_YCwIHqUrc,8908
+fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
 fugue_dask/execution_engine.py,sha256=XJp6wrdkaNh5pOpwt-Hjoa2sxgCOgusFRWrcqoCcaNM,21153
 fugue_dask/ibis_engine.py,sha256=kQdaG_KlZZ2AjtYETNCdTJOgtwI_eH0aGzLaAiIBbRI,2120
 fugue_dask/registry.py,sha256=7UTg_eie7zKlHYKMCyOo0TNn5y2TiIjE8kiS2PruHFc,2200
@@ -96,14 +96,14 @@ fugue_duckdb/__init__.py,sha256=nSNv-fxBAKD6W23EbMeV4dVRIaSTqr9DzQUWuVOES8s,379
 fugue_duckdb/_io.py,sha256=Sq228unVnroYTq4GX-Wnv22SLHC9Ji-aWgiqrfdu81w,8880
 fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
 fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
-fugue_duckdb/dataframe.py,sha256=
+fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
 fugue_duckdb/execution_engine.py,sha256=fkkQb4Eh0m7SwKrTplVk2oQalLkNoj3CW0R12g01ofk,20536
 fugue_duckdb/ibis_engine.py,sha256=MrypeABozqwetKOpqtrmWvCJX2QPfBXhbSEhvK9vqmI,1990
 fugue_duckdb/registry.py,sha256=Dj0Tng1cXVT6Q7t-KxOky2k1dD9xSBjYGQmI26UgZPo,3095
 fugue_ibis/__init__.py,sha256=PcUt66KlLyGGicad7asq5j2U567_fhR0HzvWQBhV1VM,362
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
-fugue_ibis/dataframe.py,sha256=
+fugue_ibis/dataframe.py,sha256=0Fb1vJjwEeffgoUCDfDGIMuSFaPgUJqcB-JqJOAALfs,7789
 fugue_ibis/execution_engine.py,sha256=p5zy0IBXiJgLi67RBHCRcHgZsaJMANdNSpUxz0k_6C0,18453
 fugue_ibis/extensions.py,sha256=H8l-SPfoqLuUoILtOuL2nccOpoL83zHeSoIhoqjtWQM,6905
 fugue_ibis/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,11 +117,11 @@ fugue_notebook/nbextension/description.yaml,sha256=CsXgx9CSLbAlO4Z1kvX9ejYA_TImP
 fugue_notebook/nbextension/main.js,sha256=Px2tQuBCNGEZOEBKsnfVruFEg-AxK7Tj0dY84ktub_U,3709
 fugue_polars/__init__.py,sha256=NDkjlbLhHPTjUaCAw6mAwIqeK3HSeh-z88s9dqmwheQ,61
 fugue_polars/_utils.py,sha256=7rGGWgB1-VqFwh4PcBLYk_5VNjd8FNOS4TDFyDVz2sg,159
-fugue_polars/polars_dataframe.py,sha256=
+fugue_polars/polars_dataframe.py,sha256=8LQ0IB-JFFdjW2ltDzq8DfIbUC_jjjDr1YM29usJag0,8831
 fugue_polars/registry.py,sha256=gd6qQ-OxYtTAQFyvYbLDPXmSvCR-LW6n5K5ylgMY_7A,2950
 fugue_ray/__init__.py,sha256=HzEHfG2mpc0ugf3nf1Pdy15Bhg35K6maZpYejn1aoyI,119
 fugue_ray/_constants.py,sha256=vu5l1w-Wi-2V_nm0HLXKOYhh5HdWRCc5yQktO2XzhOg,569
-fugue_ray/dataframe.py,sha256=
+fugue_ray/dataframe.py,sha256=7asw2qf9vm6vLBSzqghm9pUcNAppJOz5CkT7XyR0S5g,12514
 fugue_ray/execution_engine.py,sha256=NT_mnacijp1zskFbtganUwA3JNRPU-FNNvJswA6U_Yg,12607
 fugue_ray/registry.py,sha256=xJRAhbwNrg695EwghQDnVtTKi4YkqZ0_61BD4OAblSA,1685
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -130,21 +130,21 @@ fugue_ray/_utils/dataframe.py,sha256=_EadzS4rPom1A_cF0pqoPlwrNYZTfTwcyyu86_fFsqU
 fugue_ray/_utils/io.py,sha256=SFTU4qXubGEmO5IGZA5yHy8Hu4b9aFZ9-eTU4Qs-NsQ,8757
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
-fugue_spark/dataframe.py,sha256
-fugue_spark/execution_engine.py,sha256=
+fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
+fugue_spark/execution_engine.py,sha256=rqgY9U1bpjh0GFNyNkuPcI7iV0xeipadURhNIir4w08,33147
 fugue_spark/ibis_engine.py,sha256=Yl5xxwROo1idcD2hFaylaI1IpmBUgbvOZRWtcrE0Zjo,1697
 fugue_spark/registry.py,sha256=kyIMk6dAiKRSKCHawQKyXu9DhZ24T6j3gL57TiOAZ8c,4162
 fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fugue_spark/_utils/convert.py,sha256=
-fugue_spark/_utils/io.py,sha256=
+fugue_spark/_utils/convert.py,sha256=eRWkDYA4UO-FQu-2y4O80WEdawx7X_rIrWg55AlOiRc,10007
+fugue_spark/_utils/io.py,sha256=0ndQ70YlirPwGKjh5IDN6IdJxD26BnPpMonRob4dxII,5668
 fugue_spark/_utils/misc.py,sha256=o8dZmXOHnA7D_ps37vgGXTPTiSEG9LQzPKq7l-MG-qM,860
 fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618
 fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=
-fugue_test/dataframe_suite.py,sha256=
+fugue_test/builtin_suite.py,sha256=o8aMZTKa74nKBmcUTTBbliTJMtNbsXE9SPKZopS504o,78400
+fugue_test/dataframe_suite.py,sha256=LgB931CkASbGOrRQ9j92DGk9wPb__FoNusOk-HeqU9E,19165
 fugue_test/execution_suite.py,sha256=FI6UmwBvdoT1jkJRBqJT_Q0IDehFryvv00UL6jjxyAk,47689
 fugue_test/ibis_suite.py,sha256=Dk4AHVD00RcFsNm9VvJ4_4LOyFdGX30OnAtpO2SPruE,3529
 fugue_test/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -155,9 +155,9 @@ fugue_test/plugins/duckdb/fixtures.py,sha256=UxQbIMRbSrTZ3pgCmKZgd5wd1YvnVrqLSUP
 fugue_test/plugins/ray/__init__.py,sha256=nyKGW6xgTXtMhSs7yjgFNKO7mVboCNg63Bvdf39fO_I,55
 fugue_test/plugins/ray/fixtures.py,sha256=hZkvuo0AcD63XJl5JUroc9tm2LWHUPszg2zzY6FCSao,141
 fugue_version/__init__.py,sha256=vTwvdJOZi8jZb9U-Em7-d50qNDNPS2z51IXqRoojeNM,22
-fugue-0.8.7.
-fugue-0.8.7.
-fugue-0.8.7.
-fugue-0.8.7.
-fugue-0.8.7.
-fugue-0.8.7.
+fugue-0.8.7.dev6.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+fugue-0.8.7.dev6.dist-info/METADATA,sha256=0i4ibczIy_wEMtZ6vFvaCw40x5KmuQa6OsuBVWUTQyk,17860
+fugue-0.8.7.dev6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+fugue-0.8.7.dev6.dist-info/entry_points.txt,sha256=N_BIIy3lSvF6Z32QE0yXTucgdHrPbUrOwH1zj7bZ0ow,536
+fugue-0.8.7.dev6.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.8.7.dev6.dist-info/RECORD,,
{fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/entry_points.txt
CHANGED

@@ -2,7 +2,7 @@
 dask = fugue_dask.registry [dask]
 dask_ibis = fugue_dask.ibis_engine [dask,ibis]
 duckdb = fugue_duckdb.registry [duckdb]
-duckdb_ibis = fugue_duckdb.ibis_engine [ibis]
+duckdb_ibis = fugue_duckdb.ibis_engine [duckdb,ibis]
 ibis = fugue_ibis [ibis]
 polars = fugue_polars.registry [polars]
 ray = fugue_ray.registry [ray]
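These entry points are how fugue discovers its backend plugins in the installed environment. A minimal sketch for listing them is below; it assumes Python 3.10+ `importlib.metadata` and that the group is named `fugue.plugins` (the group header line sits above the hunk shown here, so the name is an assumption).

# Sketch: list the plugin entry points registered by installed fugue backends.
# Assumes Python 3.10+ and that the entry-point group is "fugue.plugins".
from importlib.metadata import entry_points

for ep in entry_points(group="fugue.plugins"):
    # e.g. "duckdb_ibis -> fugue_duckdb.ibis_engine [duckdb,ibis]"
    print(f"{ep.name} -> {ep.value}")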
fugue_dask/_io.py
CHANGED
@@ -1,5 +1,6 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+import fsspec
 import fs as pfs
 import pandas as pd
 from dask import dataframe as dd
@@ -96,6 +97,8 @@ def _load_parquet(
 
 
 def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
+    fs, path = fsspec.core.url_to_fs(p.uri)
+    fs.makedirs(path, exist_ok=True)
     df.native.to_csv(
         pfs.path.combine(p.uri, "*.csv"), **{"index": False, "header": False, **kwargs}
     )
@@ -145,6 +148,8 @@ def _load_csv(  # noqa: C901
 
 
 def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
+    fs, path = fsspec.core.url_to_fs(p.uri)
+    fs.makedirs(path, exist_ok=True)
     df.native.to_json(pfs.path.combine(p.uri, "*.json"), **kwargs)
 
 
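The two `_save_*` changes above only pre-create the output directory before Dask writes its sharded files. A standalone sketch of the same pattern, using only public fsspec and dask APIs (the local output path is illustrative):

# Sketch of the directory-creation pattern used above; assumes fsspec and dask.
import fsspec
import pandas as pd
import dask.dataframe as dd

uri = "/tmp/fugue_csv_out"  # could also be s3://..., gs://..., etc.
fs, path = fsspec.core.url_to_fs(uri)  # resolve the filesystem and the bare path
fs.makedirs(path, exist_ok=True)       # make sure the directory exists first

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
# Dask writes one shard per partition into the pre-created directory
ddf.to_csv(f"{uri}/*.csv", index=False, header=False)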
fugue_dask/_utils.py
CHANGED
@@ -1,13 +1,14 @@
 import math
-from typing import Any, List, Optional, Tuple
+from typing import Any, Callable, List, Optional, Tuple, TypeVar
 
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 from dask.dataframe.core import DataFrame
+from dask.delayed import delayed
 from dask.distributed import Client, get_client
-from triad.utils.pandas_like import
+from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
 from triad.utils.pyarrow import to_pandas_dtype
 
 import fugue.api as fa
@@ -16,6 +17,7 @@ from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS
 from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS
 
 _FUGUE_DASK_TEMP_IDX_COLUMN = "_fugue_dask_temp_index"
+T = TypeVar("T")
 
 
 def get_default_partitions() -> int:
@@ -28,6 +30,17 @@ def get_default_partitions() -> int:
     return n if n > 0 else fa.get_current_parallelism() * 2
 
 
+def collect(df: dd.DataFrame, func: Callable[[pd.DataFrame], T]) -> Tuple[T]:
+    """Compute each partition in parallel and collect the results
+
+    :param df: dask dataframe
+    :return: the collected result
+    """
+    dfs = df.to_delayed()
+    objs = [delayed(func)(df) for df in dfs]
+    return dd.compute(*objs)
+
+
 def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFrame:
     """Repartition the dataframe by hashing the given columns
 
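The new `collect` helper is the standard delayed-per-partition pattern in Dask: turn the dataframe into delayed partitions, wrap a pandas-level function around each, and compute them together. A standalone sketch of that pattern, independent of fugue and using only public Dask APIs:

# Standalone sketch of the per-partition collect pattern, using only dask.
import dask
import dask.dataframe as dd
import pandas as pd
from dask.delayed import delayed

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=3)

def rows_per_partition(pdf: pd.DataFrame) -> int:
    # runs on one pandas partition at a time
    return len(pdf)

parts = ddf.to_delayed()  # one Delayed object per partition
results = dask.compute(*[delayed(rows_per_partition)(p) for p in parts])
print(results)  # e.g. (4, 4, 2), depending on how dask split the partitions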
fugue_dask/dataframe.py
CHANGED
@@ -3,20 +3,21 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple
 import dask.dataframe as dd
 import pandas as pd
 import pyarrow as pa
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_arg_not_none
 from triad.utils.pandas_like import PD_UTILS
 from triad.utils.pyarrow import cast_pa_table
 
-from fugue.dataframe import (
-    ArrowDataFrame,
-    DataFrame,
-    LocalBoundedDataFrame,
-    PandasDataFrame,
-)
+from fugue.dataframe import DataFrame, LocalBoundedDataFrame, PandasDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.pandas_dataframe import _pd_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
     count,
     drop_columns,
@@ -32,7 +33,7 @@ from fugue.plugins import (
 )
 
 from ._constants import FUGUE_DASK_USE_ARROW
-from ._utils import DASK_UTILS, get_default_partitions
+from ._utils import DASK_UTILS, collect, get_default_partitions
 
 
 class DaskDataFrame(DataFrame):
@@ -150,8 +151,16 @@ class DaskDataFrame(DataFrame):
         )
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-
-        return
+        schema = self.schema.pa_schema
+        return pa.concat_tables(
+            collect(
+                self.native,
+                lambda df: cast_pa_table(
+                    pa.Table.from_pandas(df.reset_index(drop=True), schema=schema),
+                    schema=schema,
+                ),
+            )
+        )
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
@@ -170,17 +179,28 @@ class DaskDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-
-
-
-
-        )
+        chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+        res: List[List[Any]] = []
+        for x in chunks:
+            res += x
+        return res
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
+        chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+        for x in chunks:
+            yield from x
+
+    def as_dicts(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> List[Dict[str, Any]]:
+        return _dd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _dd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -197,8 +217,11 @@ class DaskDataFrame(DataFrame):
             assert_arg_not_none(schema, "schema")
             return pdf, schema
         DASK_UTILS.ensure_compatible(pdf)
-
-
+        # when pdf contains bytes, or any object types, and schema contains str
+        # there is no way to get the real schema of the pdf, (pschema will contain
+        # strs instead of the real types) so we have to force cast it to the schema
+        if schema is None:
+            pschema = Schema(DASK_UTILS.to_schema(pdf))
             return pdf, pschema.assert_not_empty()
         pdf = pdf[schema.assert_not_empty().names]
         return (
@@ -295,6 +318,48 @@ def _dd_head(
     return PandasDataFrame(res) if as_fugue else res
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_array(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    chunks = _to_array_chunks(df, columns, type_safe)
+    res: List[List[Any]] = []
+    for x in chunks:
+        res += x
+    return res
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_array_iterable(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    chunks = _to_array_chunks(df, columns, type_safe)
+    for x in chunks:
+        yield from x
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_dicts(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    res: List[Dict[str, Any]] = []
+    for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+        res += x
+    return res
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_dict_iterable(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+        yield from x
+
+
 def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
@@ -303,3 +368,25 @@ def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
 
 def _adjust_df(res: dd.DataFrame, as_fugue: bool):
     return res if not as_fugue else DaskDataFrame(res)
+
+
+def _to_array_chunks(
+    df: dd.DataFrame,
+    columns: Optional[List[str]] = None,
+    type_safe: bool = False,
+    schema: Optional[Schema] = None,
+) -> Tuple[List[Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+
+    def _to_list(pdf: pd.DataFrame) -> List[Any]:
+        return list(
+            PD_UTILS.as_array_iterable(
+                pdf,
+                schema=None if schema is None else schema.pa_schema,
+                columns=columns,
+                type_safe=type_safe,
+            )
+        )
+
+    return collect(_df, _to_list)
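With these candidates registered, the dispatchers in `fugue.plugins` work directly on a plain `dask.dataframe.DataFrame`, without wrapping it in `DaskDataFrame` first. A hedged sketch follows; it assumes fugue 0.8.7.dev6 with the dask extra is installed and that `as_array` and `as_dicts` are exported from `fugue.plugins` as the imports above indicate.

# Sketch: call the new dispatchers on a plain dask DataFrame.
# Assumes fugue[dask] 0.8.7.dev6, where fugue.plugins exports these names.
import dask.dataframe as dd
import pandas as pd
import fugue_dask.dataframe  # noqa: F401  (ensures the candidates above are registered)
from fugue.plugins import as_array, as_dicts

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}), npartitions=2)

print(as_array(ddf))  # expected shape: [[1, 'x'], [2, 'y']]
print(as_dicts(ddf))  # expected shape: [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]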
fugue_duckdb/dataframe.py
CHANGED
@@ -3,21 +3,33 @@ from typing import Any, Dict, Iterable, List, Optional
 import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
-from triad import Schema
+from triad import Schema, assert_or_throw
 from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
-from fugue import
+from fugue import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.arrow_dataframe import _pa_table_as_pandas
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_fugue_dataset,
     as_local_bounded,
     as_pandas,
+    drop_columns,
     get_column_names,
     get_num_partitions,
     get_schema,
     is_df,
+    select_columns,
 )
 
 from ._utils import encode_column_name, to_duck_type, to_pa_type
@@ -59,13 +71,10 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return len(self._rel)
 
     def _drop_cols(self, cols: List[str]) -> DataFrame:
-
-        rel = self._rel.project(",".join(encode_column_name(n) for n in cols))
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_drop_duckdb_columns(self._rel, cols))
 
     def _select_cols(self, keys: List[Any]) -> DataFrame:
-
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_select_duckdb_columns(self._rel, keys))
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         _assert_no_missing(self._rel, columns.keys())
@@ -109,38 +118,29 @@ class DuckDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-            return self[columns].as_array(type_safe=type_safe)
-        return self._fetchall(self._rel)
+        return _duck_as_array(self._rel, columns=columns, type_safe=type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
-
-
-
+        yield from _duck_as_array_iterable(
+            self._rel, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _duck_as_dicts(self._rel, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _duck_as_dict_iterable(self._rel, columns=columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return
-
-    def _fetchall(self, rel: DuckDBPyRelation) -> List[List[Any]]:
-        map_pos = [i for i, t in enumerate(self.schema.types) if pa.types.is_map(t)]
-        if len(map_pos) == 0:
-            return [list(x) for x in rel.fetchall()]
-        else:
-
-            def to_list(row: Any) -> List[Any]:
-                res = list(row)
-                for p in map_pos:
-                    res[p] = list(zip(row[p]["key"], row[p]["value"]))
-                return res
-
-            return [to_list(x) for x in rel.fetchall()]
+        return ArrowDataFrame(_duck_as_arrow(self._rel.limit(n)))
 
 
 @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation))
@@ -186,6 +186,64 @@ def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
 
 
+@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _select_duckdb_columns(
+    df: DuckDBPyRelation, columns: List[Any]
+) -> DuckDBPyRelation:
+    if len(columns) == 0:
+        raise FugueDataFrameOperationError("must select at least one column")
+    _assert_no_missing(df, columns)
+    return df.project(",".join(encode_column_name(n) for n in columns))
+
+
+@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRelation:
+    # if len(columns) == 0:
+    #     return df
+    _columns = {c: 1 for c in columns}
+    cols = [col for col in df.columns if _columns.pop(col, None) is None]
+    assert_or_throw(
+        len(cols) > 0, FugueDataFrameOperationError("must keep at least one column")
+    )
+    assert_or_throw(
+        len(_columns) == 0,
+        FugueDataFrameOperationError("found nonexistent columns {_columns}"),
+    )
+    return df.project(",".join(encode_column_name(n) for n in cols))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_array(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    return pa_table_as_array(df.arrow(), columns=columns)
+
+
+@as_array_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_array_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from pa_table_as_array_iterable(df.arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_dicts(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    return pa_table_as_dicts(df.arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_dict_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    yield from pa_table_as_dict_iterable(df.arrow(), columns=columns)
+
+
 def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
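The DuckDB backend now routes these conversions through Arrow (`rel.arrow()`) and the shared `pa_table_as_*` helpers, and the same functions double as plugin candidates for raw `DuckDBPyRelation` objects. A hedged sketch; it assumes duckdb>=0.8 (where `duckdb.sql` returns a `DuckDBPyRelation`) and fugue 0.8.7.dev6 with the duckdb extra installed.

# Sketch: the new DuckDB candidates dispatch on a raw DuckDBPyRelation.
# Assumes duckdb>=0.8 and fugue 0.8.7.dev6 with the duckdb extra.
import duckdb
import fugue_duckdb.dataframe  # noqa: F401  (registers the candidates above)
from fugue.plugins import as_dicts, drop_columns

rel = duckdb.sql("SELECT 1 AS a, 'x' AS b UNION ALL SELECT 2, 'y'")

print(as_dicts(rel))  # e.g. [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}] (row order not guaranteed)
print(drop_columns(rel, ["b"]).columns)  # ['a']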
fugue_ibis/dataframe.py
CHANGED
@@ -143,6 +143,19 @@ class IbisDataFrame(DataFrame):
             type_safe=type_safe
         )
 
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        if columns is not None:
+            return self[columns].as_dicts()
+        return self.as_local().as_dicts()
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        if columns is not None:
+            yield from self[columns].as_dict_iterable()
+        else:
+            yield from self._to_iterable_df(self._table).as_dict_iterable()
+
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame: