fugue 0.8.7.dev4__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
1
  fugue/__init__.py,sha256=xT5zuNZfRkjbA8a-uTT5oLK6hLGuezGZLWYBl6eS5J4,2749
2
- fugue/api.py,sha256=6_d3vYwJGAX7tW7NMhHB_NAX4aPsfzK2L06Zr2V78Ks,1240
2
+ fugue/api.py,sha256=dLUrigFhDMB5x7cvlWSK8EyaY2o0AmhgPr0VRtfzSz0,1254
3
3
  fugue/constants.py,sha256=crd0VqX8WtBcjSUNwZDi2LDIEkhUMWOlSn73H8JI9ds,3385
4
4
  fugue/dev.py,sha256=GQCkezBBl4V0lVDWhGtUQKqomiCxgR9dMhfqj9C8cS8,1369
5
5
  fugue/exceptions.py,sha256=ylP8gkZL8ao_ZLinNYKv16FPyO_n7c29dN-4QChUxi0,1544
6
- fugue/plugins.py,sha256=SJ-jqs04StHIHJ65lgdGP0IDopVIGBDpmzHHllNK8p0,998
6
+ fugue/plugins.py,sha256=kao-H5z-cRbujBKW1QC9IHUOBKxXMhpVQ6saIE7cXm8,1012
7
7
  fugue/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  fugue/registry.py,sha256=SNULGv08f37fRO-cIxFDmnVcod7ref2fNLSK6G7nVnI,868
9
9
  fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,24 +25,24 @@ fugue/column/expressions.py,sha256=fdGX9oPCqJBuROFZqrOYVcwkjghdXT9ngaSTG5tW_i8,2
25
25
  fugue/column/functions.py,sha256=ygLyn2gp5lTdGbYqJXeGeMmRNhbm4-vfJvAY_Zt0pb0,9774
26
26
  fugue/column/sql.py,sha256=s_qTtHgnvRFqjhCWr7s595PTrHM-Pr9zHUQfU5xcTVA,17391
27
27
  fugue/dataframe/__init__.py,sha256=zm7TbsaJLIvfm7zymWm2LGcuJd3nxfGsFnQiyrSnenM,678
28
- fugue/dataframe/api.py,sha256=c5Err3c-ayl-k28IUi6kV_ClDWX30NpVNkv97hQKDac,9862
28
+ fugue/dataframe/api.py,sha256=aWBvMaiSUxOvdQMfe79zHShWuPfLcgiWggC9HvVxvSE,11017
29
29
  fugue/dataframe/array_dataframe.py,sha256=4scWnmQ6sjy1A6o7IYdRc0VVutBEfcJrA1f9wkph4Kg,4440
30
- fugue/dataframe/arrow_dataframe.py,sha256=mJzrYBGs9mEMsHgxmnhDdiLUiOkcOs3YBAzHs75KFsI,10202
31
- fugue/dataframe/dataframe.py,sha256=a7jhYUaovN7w8vcJ-OU2AMfkfqxpvFF06cYWFqIJWqM,16418
30
+ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvrwRp_c,11485
31
+ fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
32
32
  fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
33
33
  fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
34
- fugue/dataframe/function_wrapper.py,sha256=r6H1SQWaag2eSbJ50327t_bt7MZunbOMOl9OcOcQW2E,14827
35
- fugue/dataframe/iterable_dataframe.py,sha256=9g2BAF9A6QPbo63Si-trFq_9nPVqAD9vSePRCV71AfY,4620
36
- fugue/dataframe/pandas_dataframe.py,sha256=JNkr24h5gir1Msttx3lNfzFjwMqjHbjDswNynpCiizo,9158
37
- fugue/dataframe/utils.py,sha256=4l2Ag3iA9dh8zIbtyihe82X5WNB-6hbbRVvSlmJbSuY,9086
34
+ fugue/dataframe/function_wrapper.py,sha256=V1eQMOn27UroEYT7_YiwoEF0RjZYIM0zkD3vfaMAQFs,14813
35
+ fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
36
+ fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
37
+ fugue/dataframe/utils.py,sha256=VS1qLCr-9NEcEjaK-219rADJadDf6EfzYZCGRUpn1fY,11405
38
38
  fugue/dataset/__init__.py,sha256=5f2CAJ4xst6Z2o9Q2e2twfDOGUw8ZJoE2ild4JEU2pg,112
39
39
  fugue/dataset/api.py,sha256=DacI4L2w5NJ-eZ6nFxNMqmReEnb0WUXswbjVp7BeErk,2794
40
40
  fugue/dataset/dataset.py,sha256=jWXZqy3msMPFFkhas2PYJEX55ZAI3gk3Txq5f4-Qya4,4759
41
41
  fugue/execution/__init__.py,sha256=iZGxAznZz9piM3k4gp0tln97MDIBxdliLyNbD-0Zc48,427
42
42
  fugue/execution/api.py,sha256=KsFOLGdWQMlXmlQ5JRgRsbUeB64qzTVHxSEaunjiojo,39818
43
- fugue/execution/execution_engine.py,sha256=n-mw0k0QtK8FQgP4w4_NrWJbg0XvrR4sFn70tSaOi0I,47735
43
+ fugue/execution/execution_engine.py,sha256=G_SsTmcuDcy6_azi_88lGzsOodiizu0JdWxebxgbqRg,47721
44
44
  fugue/execution/factory.py,sha256=5ICzfNh2QqqABuVyYLijY5-7LZgfRqczlaZN32p78bE,21003
45
- fugue/execution/native_execution_engine.py,sha256=_cXg7PTmDL4QvkcOnGhLFlVEZVZvGu9-wHPTBM0e-vI,14388
45
+ fugue/execution/native_execution_engine.py,sha256=Mm9BVC3dEMS3IWRZe4YvGKp6_mmW7dLmoLMK5HgAPcs,14408
46
46
  fugue/extensions/__init__.py,sha256=y-uLKd6mZ8sZ_8-OdW6ELoBO_9IfC0gDmEbE_rMCvOA,599
47
47
  fugue/extensions/_utils.py,sha256=Bi3pYKy2Z6fG6_5BpwIWldxetassXpB4Zp8QamWB-wg,5173
48
48
  fugue/extensions/context.py,sha256=c_y2UttzzIFoQTOCV42VCdj2nqah33xYuBjbKNIOpx8,4262
@@ -86,9 +86,9 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
86
86
  fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
87
87
  fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
88
88
  fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
89
- fugue_dask/_io.py,sha256=V-S6mA7VhDVeKfE46c6icZggwA6kobBNVsCUJsJTmTk,5836
90
- fugue_dask/_utils.py,sha256=uFoJAL95rmnBgieU2hPyqxFZGvR6ZJgPRMq5TAJqIBI,8520
91
- fugue_dask/dataframe.py,sha256=TdKjxhoQpsU5CvBTgO2c5Zo_4LfyelR0IK8bPgjAxcg,10218
89
+ fugue_dask/_io.py,sha256=9G516yM6zQvSC5_JA6qHb3LwBDmhWcxK5sjFHrQ81zo,6012
90
+ fugue_dask/_utils.py,sha256=n70N3wPPMz13Jh0GWJM3Je-TCYpU36yGP_YCwIHqUrc,8908
91
+ fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
92
92
  fugue_dask/execution_engine.py,sha256=XJp6wrdkaNh5pOpwt-Hjoa2sxgCOgusFRWrcqoCcaNM,21153
93
93
  fugue_dask/ibis_engine.py,sha256=kQdaG_KlZZ2AjtYETNCdTJOgtwI_eH0aGzLaAiIBbRI,2120
94
94
  fugue_dask/registry.py,sha256=7UTg_eie7zKlHYKMCyOo0TNn5y2TiIjE8kiS2PruHFc,2200
@@ -96,14 +96,14 @@ fugue_duckdb/__init__.py,sha256=nSNv-fxBAKD6W23EbMeV4dVRIaSTqr9DzQUWuVOES8s,379
96
96
  fugue_duckdb/_io.py,sha256=Sq228unVnroYTq4GX-Wnv22SLHC9Ji-aWgiqrfdu81w,8880
97
97
  fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
98
98
  fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
99
- fugue_duckdb/dataframe.py,sha256=vNZF2BC1sJpW3P5TVFTpU6C1Ddam81jPC_4i8kBuEpo,6512
99
+ fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
100
100
  fugue_duckdb/execution_engine.py,sha256=fkkQb4Eh0m7SwKrTplVk2oQalLkNoj3CW0R12g01ofk,20536
101
101
  fugue_duckdb/ibis_engine.py,sha256=MrypeABozqwetKOpqtrmWvCJX2QPfBXhbSEhvK9vqmI,1990
102
102
  fugue_duckdb/registry.py,sha256=Dj0Tng1cXVT6Q7t-KxOky2k1dD9xSBjYGQmI26UgZPo,3095
103
103
  fugue_ibis/__init__.py,sha256=PcUt66KlLyGGicad7asq5j2U567_fhR0HzvWQBhV1VM,362
104
104
  fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
105
105
  fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
106
- fugue_ibis/dataframe.py,sha256=olGfVYY9n5wwPOZojS30Fs3XEOMlenCzX8fuR2WPaq4,7295
106
+ fugue_ibis/dataframe.py,sha256=0Fb1vJjwEeffgoUCDfDGIMuSFaPgUJqcB-JqJOAALfs,7789
107
107
  fugue_ibis/execution_engine.py,sha256=p5zy0IBXiJgLi67RBHCRcHgZsaJMANdNSpUxz0k_6C0,18453
108
108
  fugue_ibis/extensions.py,sha256=H8l-SPfoqLuUoILtOuL2nccOpoL83zHeSoIhoqjtWQM,6905
109
109
  fugue_ibis/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,11 +117,11 @@ fugue_notebook/nbextension/description.yaml,sha256=CsXgx9CSLbAlO4Z1kvX9ejYA_TImP
117
117
  fugue_notebook/nbextension/main.js,sha256=Px2tQuBCNGEZOEBKsnfVruFEg-AxK7Tj0dY84ktub_U,3709
118
118
  fugue_polars/__init__.py,sha256=NDkjlbLhHPTjUaCAw6mAwIqeK3HSeh-z88s9dqmwheQ,61
119
119
  fugue_polars/_utils.py,sha256=7rGGWgB1-VqFwh4PcBLYk_5VNjd8FNOS4TDFyDVz2sg,159
120
- fugue_polars/polars_dataframe.py,sha256=Ll4ZUuRhAETWtmSf87KsdUCqZPiexFqy4FiPkvWQkN0,7348
120
+ fugue_polars/polars_dataframe.py,sha256=8LQ0IB-JFFdjW2ltDzq8DfIbUC_jjjDr1YM29usJag0,8831
121
121
  fugue_polars/registry.py,sha256=gd6qQ-OxYtTAQFyvYbLDPXmSvCR-LW6n5K5ylgMY_7A,2950
122
122
  fugue_ray/__init__.py,sha256=HzEHfG2mpc0ugf3nf1Pdy15Bhg35K6maZpYejn1aoyI,119
123
123
  fugue_ray/_constants.py,sha256=vu5l1w-Wi-2V_nm0HLXKOYhh5HdWRCc5yQktO2XzhOg,569
124
- fugue_ray/dataframe.py,sha256=vyVShPnNtMef_KBsVP3iTHcssA_fm33-Y077c7S3J-A,10612
124
+ fugue_ray/dataframe.py,sha256=7asw2qf9vm6vLBSzqghm9pUcNAppJOz5CkT7XyR0S5g,12514
125
125
  fugue_ray/execution_engine.py,sha256=NT_mnacijp1zskFbtganUwA3JNRPU-FNNvJswA6U_Yg,12607
126
126
  fugue_ray/registry.py,sha256=xJRAhbwNrg695EwghQDnVtTKi4YkqZ0_61BD4OAblSA,1685
127
127
  fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -130,21 +130,21 @@ fugue_ray/_utils/dataframe.py,sha256=_EadzS4rPom1A_cF0pqoPlwrNYZTfTwcyyu86_fFsqU
130
130
  fugue_ray/_utils/io.py,sha256=SFTU4qXubGEmO5IGZA5yHy8Hu4b9aFZ9-eTU4Qs-NsQ,8757
131
131
  fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
132
132
  fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
133
- fugue_spark/dataframe.py,sha256=-3kGdkuYSoM_l2xyXiEUtTjSfBfkLcg10hhWTQiORdI,9503
134
- fugue_spark/execution_engine.py,sha256=_0ldgIi4408LTCtMheelbnmx3p1_VHyc_zAMTLMj4eU,32928
133
+ fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
134
+ fugue_spark/execution_engine.py,sha256=rqgY9U1bpjh0GFNyNkuPcI7iV0xeipadURhNIir4w08,33147
135
135
  fugue_spark/ibis_engine.py,sha256=Yl5xxwROo1idcD2hFaylaI1IpmBUgbvOZRWtcrE0Zjo,1697
136
136
  fugue_spark/registry.py,sha256=kyIMk6dAiKRSKCHawQKyXu9DhZ24T6j3gL57TiOAZ8c,4162
137
137
  fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
- fugue_spark/_utils/convert.py,sha256=UlLGqwveT6H-O3EgC0r-2sLP5t5fUfBxdgbDqXUxdvE,8980
139
- fugue_spark/_utils/io.py,sha256=q-hQ2tlNa2AkWlGjJmoCLqran9jZLoyf2KZ5GP_3BIM,5511
138
+ fugue_spark/_utils/convert.py,sha256=eRWkDYA4UO-FQu-2y4O80WEdawx7X_rIrWg55AlOiRc,10007
139
+ fugue_spark/_utils/io.py,sha256=0ndQ70YlirPwGKjh5IDN6IdJxD26BnPpMonRob4dxII,5668
140
140
  fugue_spark/_utils/misc.py,sha256=o8dZmXOHnA7D_ps37vgGXTPTiSEG9LQzPKq7l-MG-qM,860
141
141
  fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618
142
142
  fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
143
143
  fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
144
144
  fugue_test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
145
145
  fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
146
- fugue_test/builtin_suite.py,sha256=eJ8jiBRbg61IkjUJa1r5PPHff_k2qHNg_ZQw1D4MIKY,78384
147
- fugue_test/dataframe_suite.py,sha256=mOr_x94H-Ylp0lJ-KBwHXJu-Q-qesqY3PzJxR9LI_Ko,18323
146
+ fugue_test/builtin_suite.py,sha256=o8aMZTKa74nKBmcUTTBbliTJMtNbsXE9SPKZopS504o,78400
147
+ fugue_test/dataframe_suite.py,sha256=LgB931CkASbGOrRQ9j92DGk9wPb__FoNusOk-HeqU9E,19165
148
148
  fugue_test/execution_suite.py,sha256=FI6UmwBvdoT1jkJRBqJT_Q0IDehFryvv00UL6jjxyAk,47689
149
149
  fugue_test/ibis_suite.py,sha256=Dk4AHVD00RcFsNm9VvJ4_4LOyFdGX30OnAtpO2SPruE,3529
150
150
  fugue_test/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -155,9 +155,9 @@ fugue_test/plugins/duckdb/fixtures.py,sha256=UxQbIMRbSrTZ3pgCmKZgd5wd1YvnVrqLSUP
155
155
  fugue_test/plugins/ray/__init__.py,sha256=nyKGW6xgTXtMhSs7yjgFNKO7mVboCNg63Bvdf39fO_I,55
156
156
  fugue_test/plugins/ray/fixtures.py,sha256=hZkvuo0AcD63XJl5JUroc9tm2LWHUPszg2zzY6FCSao,141
157
157
  fugue_version/__init__.py,sha256=vTwvdJOZi8jZb9U-Em7-d50qNDNPS2z51IXqRoojeNM,22
158
- fugue-0.8.7.dev4.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
159
- fugue-0.8.7.dev4.dist-info/METADATA,sha256=q5KpH6iM_cZS-qZO357pbHw1saF1Dl03vustptvy9YY,17673
160
- fugue-0.8.7.dev4.dist-info/WHEEL,sha256=5sUXSg9e4bi7lTLOHcm6QEYwO5TIF1TNbTSVFVjcJcc,92
161
- fugue-0.8.7.dev4.dist-info/entry_points.txt,sha256=US6kfp0GXKzBD0cACD9Senb90evV9xSeKEfkriLaU6M,536
162
- fugue-0.8.7.dev4.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
163
- fugue-0.8.7.dev4.dist-info/RECORD,,
158
+ fugue-0.8.7.dev6.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
159
+ fugue-0.8.7.dev6.dist-info/METADATA,sha256=0i4ibczIy_wEMtZ6vFvaCw40x5KmuQa6OsuBVWUTQyk,17860
160
+ fugue-0.8.7.dev6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
161
+ fugue-0.8.7.dev6.dist-info/entry_points.txt,sha256=N_BIIy3lSvF6Z32QE0yXTucgdHrPbUrOwH1zj7bZ0ow,536
162
+ fugue-0.8.7.dev6.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
163
+ fugue-0.8.7.dev6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.1)
2
+ Generator: bdist_wheel (0.41.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -2,7 +2,7 @@
2
2
  dask = fugue_dask.registry [dask]
3
3
  dask_ibis = fugue_dask.ibis_engine [dask,ibis]
4
4
  duckdb = fugue_duckdb.registry [duckdb]
5
- duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
5
+ duckdb_ibis = fugue_duckdb.ibis_engine [duckdb,ibis]
6
6
  ibis = fugue_ibis [ibis]
7
7
  polars = fugue_polars.registry [polars]
8
8
  ray = fugue_ray.registry [ray]
fugue_dask/_io.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
2
2
 
3
+ import fsspec
3
4
  import fs as pfs
4
5
  import pandas as pd
5
6
  from dask import dataframe as dd
@@ -96,6 +97,8 @@ def _load_parquet(
96
97
 
97
98
 
98
99
  def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
100
+ fs, path = fsspec.core.url_to_fs(p.uri)
101
+ fs.makedirs(path, exist_ok=True)
99
102
  df.native.to_csv(
100
103
  pfs.path.combine(p.uri, "*.csv"), **{"index": False, "header": False, **kwargs}
101
104
  )
@@ -145,6 +148,8 @@ def _load_csv( # noqa: C901
145
148
 
146
149
 
147
150
  def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
151
+ fs, path = fsspec.core.url_to_fs(p.uri)
152
+ fs.makedirs(path, exist_ok=True)
148
153
  df.native.to_json(pfs.path.combine(p.uri, "*.json"), **kwargs)
149
154
 
150
155
 
fugue_dask/_utils.py CHANGED
@@ -1,13 +1,14 @@
1
1
  import math
2
- from typing import Any, List, Optional, Tuple
2
+ from typing import Any, Callable, List, Optional, Tuple, TypeVar
3
3
 
4
4
  import dask.dataframe as dd
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  import pyarrow as pa
8
8
  from dask.dataframe.core import DataFrame
9
+ from dask.delayed import delayed
9
10
  from dask.distributed import Client, get_client
10
- from triad.utils.pandas_like import PandasLikeUtils, PD_UTILS
11
+ from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
11
12
  from triad.utils.pyarrow import to_pandas_dtype
12
13
 
13
14
  import fugue.api as fa
@@ -16,6 +17,7 @@ from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS
16
17
  from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS
17
18
 
18
19
  _FUGUE_DASK_TEMP_IDX_COLUMN = "_fugue_dask_temp_index"
20
+ T = TypeVar("T")
19
21
 
20
22
 
21
23
  def get_default_partitions() -> int:
@@ -28,6 +30,17 @@ def get_default_partitions() -> int:
28
30
  return n if n > 0 else fa.get_current_parallelism() * 2
29
31
 
30
32
 
33
+ def collect(df: dd.DataFrame, func: Callable[[pd.DataFrame], T]) -> Tuple[T]:
34
+ """Compute each partition in parallel and collect the results
35
+
36
+ :param df: dask dataframe
37
+ :return: the collected result
38
+ """
39
+ dfs = df.to_delayed()
40
+ objs = [delayed(func)(df) for df in dfs]
41
+ return dd.compute(*objs)
42
+
43
+
31
44
  def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFrame:
32
45
  """Repartition the dataframe by hashing the given columns
33
46
 
fugue_dask/dataframe.py CHANGED
@@ -3,20 +3,21 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple
3
3
  import dask.dataframe as dd
4
4
  import pandas as pd
5
5
  import pyarrow as pa
6
+ from triad import assert_or_throw
6
7
  from triad.collections.schema import Schema
7
8
  from triad.utils.assertion import assert_arg_not_none
8
9
  from triad.utils.pandas_like import PD_UTILS
9
10
  from triad.utils.pyarrow import cast_pa_table
10
11
 
11
- from fugue.dataframe import (
12
- ArrowDataFrame,
13
- DataFrame,
14
- LocalBoundedDataFrame,
15
- PandasDataFrame,
16
- )
12
+ from fugue.dataframe import DataFrame, LocalBoundedDataFrame, PandasDataFrame
17
13
  from fugue.dataframe.dataframe import _input_schema
14
+ from fugue.dataframe.pandas_dataframe import _pd_as_dicts
18
15
  from fugue.exceptions import FugueDataFrameOperationError
19
16
  from fugue.plugins import (
17
+ as_array,
18
+ as_array_iterable,
19
+ as_dict_iterable,
20
+ as_dicts,
20
21
  as_local_bounded,
21
22
  count,
22
23
  drop_columns,
@@ -32,7 +33,7 @@ from fugue.plugins import (
32
33
  )
33
34
 
34
35
  from ._constants import FUGUE_DASK_USE_ARROW
35
- from ._utils import DASK_UTILS, get_default_partitions
36
+ from ._utils import DASK_UTILS, collect, get_default_partitions
36
37
 
37
38
 
38
39
  class DaskDataFrame(DataFrame):
@@ -150,8 +151,16 @@ class DaskDataFrame(DataFrame):
150
151
  )
151
152
 
152
153
  def as_arrow(self, type_safe: bool = False) -> pa.Table:
153
- adf = pa.Table.from_pandas(self.native.compute().reset_index(drop=True))
154
- return cast_pa_table(adf, self.schema.pa_schema)
154
+ schema = self.schema.pa_schema
155
+ return pa.concat_tables(
156
+ collect(
157
+ self.native,
158
+ lambda df: cast_pa_table(
159
+ pa.Table.from_pandas(df.reset_index(drop=True), schema=schema),
160
+ schema=schema,
161
+ ),
162
+ )
163
+ )
155
164
 
156
165
  def rename(self, columns: Dict[str, str]) -> DataFrame:
157
166
  try:
@@ -170,17 +179,28 @@ class DaskDataFrame(DataFrame):
170
179
  def as_array(
171
180
  self, columns: Optional[List[str]] = None, type_safe: bool = False
172
181
  ) -> List[Any]:
173
- df: DataFrame = self
174
- if columns is not None:
175
- df = df[columns]
176
- return ArrowDataFrame(df.as_pandas(), schema=df.schema).as_array(
177
- type_safe=type_safe
178
- )
182
+ chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
183
+ res: List[List[Any]] = []
184
+ for x in chunks:
185
+ res += x
186
+ return res
179
187
 
180
188
  def as_array_iterable(
181
189
  self, columns: Optional[List[str]] = None, type_safe: bool = False
182
190
  ) -> Iterable[Any]:
183
- yield from self.as_array(columns=columns, type_safe=type_safe)
191
+ chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
192
+ for x in chunks:
193
+ yield from x
194
+
195
+ def as_dicts(
196
+ self, columns: Optional[List[str]] = None, type_safe: bool = False
197
+ ) -> List[Dict[str, Any]]:
198
+ return _dd_as_dicts(self.native, columns)
199
+
200
+ def as_dict_iterable(
201
+ self, columns: Optional[List[str]] = None, type_safe: bool = False
202
+ ) -> Iterable[Dict[str, Any]]:
203
+ yield from _dd_as_dict_iterable(self.native, columns)
184
204
 
185
205
  def head(
186
206
  self, n: int, columns: Optional[List[str]] = None
@@ -197,8 +217,11 @@ class DaskDataFrame(DataFrame):
197
217
  assert_arg_not_none(schema, "schema")
198
218
  return pdf, schema
199
219
  DASK_UTILS.ensure_compatible(pdf)
200
- pschema = Schema(DASK_UTILS.to_schema(pdf))
201
- if schema is None or pschema == schema:
220
+ # when pdf contains bytes, or any object types, and schema contains str
221
+ # there is no way to get the real schema of the pdf, (pschema will contain
222
+ # strs instead of the real types) so we have to force cast it to the schema
223
+ if schema is None:
224
+ pschema = Schema(DASK_UTILS.to_schema(pdf))
202
225
  return pdf, pschema.assert_not_empty()
203
226
  pdf = pdf[schema.assert_not_empty().names]
204
227
  return (
@@ -295,6 +318,48 @@ def _dd_head(
295
318
  return PandasDataFrame(res) if as_fugue else res
296
319
 
297
320
 
321
+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
322
+ def _dd_as_array(
323
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
324
+ ) -> List[Any]:
325
+ chunks = _to_array_chunks(df, columns, type_safe)
326
+ res: List[List[Any]] = []
327
+ for x in chunks:
328
+ res += x
329
+ return res
330
+
331
+
332
+ @as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
333
+ def _dd_as_array_iterable(
334
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
335
+ ) -> Iterable[Any]:
336
+ chunks = _to_array_chunks(df, columns, type_safe)
337
+ for x in chunks:
338
+ yield from x
339
+
340
+
341
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
342
+ def _dd_as_dicts(
343
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
344
+ ) -> List[Dict[str, Any]]:
345
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
346
+ _df = df if columns is None or len(columns) == 0 else df[columns]
347
+ res: List[Dict[str, Any]] = []
348
+ for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
349
+ res += x
350
+ return res
351
+
352
+
353
+ @as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
354
+ def _dd_as_dict_iterable(
355
+ df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
356
+ ) -> Iterable[Dict[str, Any]]:
357
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
358
+ _df = df if columns is None or len(columns) == 0 else df[columns]
359
+ for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
360
+ yield from x
361
+
362
+
298
363
  def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
299
364
  missing = set(columns) - set(df.columns)
300
365
  if len(missing) > 0:
@@ -303,3 +368,25 @@ def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
303
368
 
304
369
  def _adjust_df(res: dd.DataFrame, as_fugue: bool):
305
370
  return res if not as_fugue else DaskDataFrame(res)
371
+
372
+
373
+ def _to_array_chunks(
374
+ df: dd.DataFrame,
375
+ columns: Optional[List[str]] = None,
376
+ type_safe: bool = False,
377
+ schema: Optional[Schema] = None,
378
+ ) -> Tuple[List[Any]]:
379
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
380
+ _df = df if columns is None or len(columns) == 0 else df[columns]
381
+
382
+ def _to_list(pdf: pd.DataFrame) -> List[Any]:
383
+ return list(
384
+ PD_UTILS.as_array_iterable(
385
+ pdf,
386
+ schema=None if schema is None else schema.pa_schema,
387
+ columns=columns,
388
+ type_safe=type_safe,
389
+ )
390
+ )
391
+
392
+ return collect(_df, _to_list)
fugue_duckdb/dataframe.py CHANGED
@@ -3,21 +3,33 @@ from typing import Any, Dict, Iterable, List, Optional
3
3
  import pandas as pd
4
4
  import pyarrow as pa
5
5
  from duckdb import DuckDBPyRelation
6
- from triad import Schema
6
+ from triad import Schema, assert_or_throw
7
7
  from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
8
8
 
9
- from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
9
+ from fugue import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
10
10
  from fugue.dataframe.arrow_dataframe import _pa_table_as_pandas
11
+ from fugue.dataframe.utils import (
12
+ pa_table_as_array,
13
+ pa_table_as_array_iterable,
14
+ pa_table_as_dict_iterable,
15
+ pa_table_as_dicts,
16
+ )
11
17
  from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
12
18
  from fugue.plugins import (
19
+ as_array,
20
+ as_array_iterable,
13
21
  as_arrow,
22
+ as_dict_iterable,
23
+ as_dicts,
14
24
  as_fugue_dataset,
15
25
  as_local_bounded,
16
26
  as_pandas,
27
+ drop_columns,
17
28
  get_column_names,
18
29
  get_num_partitions,
19
30
  get_schema,
20
31
  is_df,
32
+ select_columns,
21
33
  )
22
34
 
23
35
  from ._utils import encode_column_name, to_duck_type, to_pa_type
@@ -59,13 +71,10 @@ class DuckDataFrame(LocalBoundedDataFrame):
59
71
  return len(self._rel)
60
72
 
61
73
  def _drop_cols(self, cols: List[str]) -> DataFrame:
62
- cols = [col for col in self._rel.columns if col not in cols]
63
- rel = self._rel.project(",".join(encode_column_name(n) for n in cols))
64
- return DuckDataFrame(rel)
74
+ return DuckDataFrame(_drop_duckdb_columns(self._rel, cols))
65
75
 
66
76
  def _select_cols(self, keys: List[Any]) -> DataFrame:
67
- rel = self._rel.project(",".join(encode_column_name(n) for n in keys))
68
- return DuckDataFrame(rel)
77
+ return DuckDataFrame(_select_duckdb_columns(self._rel, keys))
69
78
 
70
79
  def rename(self, columns: Dict[str, str]) -> DataFrame:
71
80
  _assert_no_missing(self._rel, columns.keys())
@@ -109,38 +118,29 @@ class DuckDataFrame(LocalBoundedDataFrame):
109
118
  def as_array(
110
119
  self, columns: Optional[List[str]] = None, type_safe: bool = False
111
120
  ) -> List[Any]:
112
- if columns is not None:
113
- return self[columns].as_array(type_safe=type_safe)
114
- return self._fetchall(self._rel)
121
+ return _duck_as_array(self._rel, columns=columns, type_safe=type_safe)
115
122
 
116
123
  def as_array_iterable(
117
124
  self, columns: Optional[List[str]] = None, type_safe: bool = False
118
125
  ) -> Iterable[Any]:
119
- if columns is not None:
120
- yield from self[columns].as_array_iterable(type_safe=type_safe)
121
- else:
122
- yield from self._fetchall(self._rel)
126
+ yield from _duck_as_array_iterable(
127
+ self._rel, columns=columns, type_safe=type_safe
128
+ )
129
+
130
+ def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
131
+ return _duck_as_dicts(self._rel, columns=columns)
132
+
133
+ def as_dict_iterable(
134
+ self, columns: Optional[List[str]] = None
135
+ ) -> Iterable[Dict[str, Any]]:
136
+ yield from _duck_as_dict_iterable(self._rel, columns=columns)
123
137
 
124
138
  def head(
125
139
  self, n: int, columns: Optional[List[str]] = None
126
140
  ) -> LocalBoundedDataFrame:
127
141
  if columns is not None:
128
142
  return self[columns].head(n)
129
- return ArrayDataFrame(self._fetchall(self._rel.limit(n)), schema=self.schema)
130
-
131
- def _fetchall(self, rel: DuckDBPyRelation) -> List[List[Any]]:
132
- map_pos = [i for i, t in enumerate(self.schema.types) if pa.types.is_map(t)]
133
- if len(map_pos) == 0:
134
- return [list(x) for x in rel.fetchall()]
135
- else:
136
-
137
- def to_list(row: Any) -> List[Any]:
138
- res = list(row)
139
- for p in map_pos:
140
- res[p] = list(zip(row[p]["key"], row[p]["value"]))
141
- return res
142
-
143
- return [to_list(x) for x in rel.fetchall()]
143
+ return ArrowDataFrame(_duck_as_arrow(self._rel.limit(n)))
144
144
 
145
145
 
146
146
  @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation))
@@ -186,6 +186,64 @@ def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
186
186
  return list(df.columns)
187
187
 
188
188
 
189
+ @select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
190
+ def _select_duckdb_columns(
191
+ df: DuckDBPyRelation, columns: List[Any]
192
+ ) -> DuckDBPyRelation:
193
+ if len(columns) == 0:
194
+ raise FugueDataFrameOperationError("must select at least one column")
195
+ _assert_no_missing(df, columns)
196
+ return df.project(",".join(encode_column_name(n) for n in columns))
197
+
198
+
199
+ @drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
200
+ def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRelation:
201
+ # if len(columns) == 0:
202
+ # return df
203
+ _columns = {c: 1 for c in columns}
204
+ cols = [col for col in df.columns if _columns.pop(col, None) is None]
205
+ assert_or_throw(
206
+ len(cols) > 0, FugueDataFrameOperationError("must keep at least one column")
207
+ )
208
+ assert_or_throw(
209
+ len(_columns) == 0,
210
+ FugueDataFrameOperationError("found nonexistent columns {_columns}"),
211
+ )
212
+ return df.project(",".join(encode_column_name(n) for n in cols))
213
+
214
+
215
+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
216
+ def _duck_as_array(
217
+ df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
218
+ ) -> List[Any]:
219
+ return pa_table_as_array(df.arrow(), columns=columns)
220
+
221
+
222
+ @as_array_iterable.candidate(
223
+ lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
224
+ )
225
+ def _duck_as_array_iterable(
226
+ df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
227
+ ) -> Iterable[Any]:
228
+ yield from pa_table_as_array_iterable(df.arrow(), columns=columns)
229
+
230
+
231
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
232
+ def _duck_as_dicts(
233
+ df: DuckDBPyRelation, columns: Optional[List[str]] = None
234
+ ) -> List[Dict[str, Any]]:
235
+ return pa_table_as_dicts(df.arrow(), columns=columns)
236
+
237
+
238
+ @as_dict_iterable.candidate(
239
+ lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
240
+ )
241
+ def _duck_as_dict_iterable(
242
+ df: DuckDBPyRelation, columns: Optional[List[str]] = None
243
+ ) -> Iterable[Dict[str, Any]]:
244
+ yield from pa_table_as_dict_iterable(df.arrow(), columns=columns)
245
+
246
+
189
247
  def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
190
248
  missing = set(columns) - set(df.columns)
191
249
  if len(missing) > 0:
fugue_ibis/dataframe.py CHANGED
@@ -143,6 +143,19 @@ class IbisDataFrame(DataFrame):
143
143
  type_safe=type_safe
144
144
  )
145
145
 
146
+ def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
147
+ if columns is not None:
148
+ return self[columns].as_dicts()
149
+ return self.as_local().as_dicts()
150
+
151
+ def as_dict_iterable(
152
+ self, columns: Optional[List[str]] = None
153
+ ) -> Iterable[Dict[str, Any]]:
154
+ if columns is not None:
155
+ yield from self[columns].as_dict_iterable()
156
+ else:
157
+ yield from self._to_iterable_df(self._table).as_dict_iterable()
158
+
146
159
  def head(
147
160
  self, n: int, columns: Optional[List[str]] = None
148
161
  ) -> LocalBoundedDataFrame: