fugue 0.8.7.dev6__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +84 -89
- fugue/dataframe/utils.py +12 -25
- fugue/execution/execution_engine.py +0 -7
- fugue/execution/native_execution_engine.py +5 -11
- fugue/workflow/_checkpoint.py +9 -9
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +2 -2
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +24 -22
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +2 -1
- fugue_dask/_io.py +22 -29
- fugue_dask/execution_engine.py +5 -12
- fugue_duckdb/_io.py +21 -37
- fugue_duckdb/execution_engine.py +2 -7
- fugue_ibis/execution_engine.py +1 -5
- fugue_ray/_utils/io.py +15 -17
- fugue_spark/_utils/io.py +3 -5
- fugue_spark/execution_engine.py +2 -7
- fugue_test/builtin_suite.py +12 -12
- fugue_test/execution_suite.py +13 -18
- fugue_test/plugins/misc/__init__.py +2 -0
- fugue_test/plugins/misc/fixtures.py +18 -0
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
fugue/__init__.py,sha256=
|
|
1
|
+
fugue/__init__.py,sha256=LKkBEPEAMLG-Yuzqt0IgoIDEfNf9a1zUffNKb83D_l8,2705
|
|
2
2
|
fugue/api.py,sha256=dLUrigFhDMB5x7cvlWSK8EyaY2o0AmhgPr0VRtfzSz0,1254
|
|
3
3
|
fugue/constants.py,sha256=crd0VqX8WtBcjSUNwZDi2LDIEkhUMWOlSn73H8JI9ds,3385
|
|
4
4
|
fugue/dev.py,sha256=GQCkezBBl4V0lVDWhGtUQKqomiCxgR9dMhfqj9C8cS8,1369
|
|
@@ -10,7 +10,7 @@ fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
10
10
|
fugue/_utils/display.py,sha256=JV8oDA7efHm1wceZulCBOY5dMvjbWHvIm6ASisKfoWY,3164
|
|
11
11
|
fugue/_utils/exception.py,sha256=SFIjwjV4CIEovp3P9k7ePNOFB12A5D8hDdhtfFUeM5Y,2247
|
|
12
12
|
fugue/_utils/interfaceless.py,sha256=wI0H6L4W_1uQjh9tpjgT9HzN-fbrrtXXHC1x6Q_rrPg,2203
|
|
13
|
-
fugue/_utils/io.py,sha256=
|
|
13
|
+
fugue/_utils/io.py,sha256=adrtj6Dq0ti426DNlkliApbTkp8b3bfBysAiE5MVQVc,9265
|
|
14
14
|
fugue/_utils/misc.py,sha256=_huy0eylmRTEFoReGR2M4rbAI8m79hFcfY5bDceVEXU,887
|
|
15
15
|
fugue/_utils/registry.py,sha256=lrbzTdUEVnW6paBGDj-Yb-aTIbP5mjCqrXuRU9_N6os,316
|
|
16
16
|
fugue/bag/__init__.py,sha256=0Q0_rnrEThrTx2U-1xGNyAg95idp_xcnywymIcW4Xck,46
|
|
@@ -34,15 +34,15 @@ fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs
|
|
|
34
34
|
fugue/dataframe/function_wrapper.py,sha256=V1eQMOn27UroEYT7_YiwoEF0RjZYIM0zkD3vfaMAQFs,14813
|
|
35
35
|
fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
|
|
36
36
|
fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
|
|
37
|
-
fugue/dataframe/utils.py,sha256=
|
|
37
|
+
fugue/dataframe/utils.py,sha256=shN1eHYTnPhb38BHEpLlCdLSzX_qpoQ3-fsDgu1hCzQ,10840
|
|
38
38
|
fugue/dataset/__init__.py,sha256=5f2CAJ4xst6Z2o9Q2e2twfDOGUw8ZJoE2ild4JEU2pg,112
|
|
39
39
|
fugue/dataset/api.py,sha256=DacI4L2w5NJ-eZ6nFxNMqmReEnb0WUXswbjVp7BeErk,2794
|
|
40
40
|
fugue/dataset/dataset.py,sha256=jWXZqy3msMPFFkhas2PYJEX55ZAI3gk3Txq5f4-Qya4,4759
|
|
41
41
|
fugue/execution/__init__.py,sha256=iZGxAznZz9piM3k4gp0tln97MDIBxdliLyNbD-0Zc48,427
|
|
42
42
|
fugue/execution/api.py,sha256=KsFOLGdWQMlXmlQ5JRgRsbUeB64qzTVHxSEaunjiojo,39818
|
|
43
|
-
fugue/execution/execution_engine.py,sha256=
|
|
43
|
+
fugue/execution/execution_engine.py,sha256=5lIlebgPK7q-Gf4bWt1t_Anq3MjPaJBpGWN9bbry1B4,47506
|
|
44
44
|
fugue/execution/factory.py,sha256=5ICzfNh2QqqABuVyYLijY5-7LZgfRqczlaZN32p78bE,21003
|
|
45
|
-
fugue/execution/native_execution_engine.py,sha256=
|
|
45
|
+
fugue/execution/native_execution_engine.py,sha256=lbKd3uGh00cSTkIM8l-u8jmsMxFzV2PSUeJgudayxKs,14236
|
|
46
46
|
fugue/extensions/__init__.py,sha256=y-uLKd6mZ8sZ_8-OdW6ELoBO_9IfC0gDmEbE_rMCvOA,599
|
|
47
47
|
fugue/extensions/_utils.py,sha256=Bi3pYKy2Z6fG6_5BpwIWldxetassXpB4Zp8QamWB-wg,5173
|
|
48
48
|
fugue/extensions/context.py,sha256=c_y2UttzzIFoQTOCV42VCdj2nqah33xYuBjbKNIOpx8,4262
|
|
@@ -72,7 +72,7 @@ fugue/sql/_visitors.py,sha256=2pc0J-AHJAiIexsKgNjcgrCGOyhC3_7rzonSgtjy--k,33844
|
|
|
72
72
|
fugue/sql/api.py,sha256=l2I9CAy_W2oFFTct9fDPLyXF0LiDxQhMx5O8jBHTAxU,10050
|
|
73
73
|
fugue/sql/workflow.py,sha256=S1pOhp0b0t6johFAJWmj6xUB7Ti5LQgNABpAzmLGjrQ,3010
|
|
74
74
|
fugue/workflow/__init__.py,sha256=tXM_KYO8Q358W6qAVlwhIQIaYNRDgZtTubrIEX4QMgM,229
|
|
75
|
-
fugue/workflow/_checkpoint.py,sha256=
|
|
75
|
+
fugue/workflow/_checkpoint.py,sha256=tt5Iv7c5ZStC0MD1inItksQ0GuK0ViniA3nvrgym-5c,5681
|
|
76
76
|
fugue/workflow/_tasks.py,sha256=Zq_jXJO_VaF8DrWUuBiwO2Y3OVuhsiOQdzP4VBsp7Fo,11826
|
|
77
77
|
fugue/workflow/_workflow_context.py,sha256=Wmp6n0lSrh2Gpslb5EaSX6BQNniKsvKn6SlhVkQ6ui0,2504
|
|
78
78
|
fugue/workflow/api.py,sha256=uQoxPSCZ91-ST4vwuPWG7qioRGW4eo-Sgi3DdwtSL4k,12495
|
|
@@ -86,25 +86,25 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
|
|
|
86
86
|
fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
|
|
87
87
|
fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
|
|
88
88
|
fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
|
|
89
|
-
fugue_dask/_io.py,sha256=
|
|
89
|
+
fugue_dask/_io.py,sha256=HmL3Q2lRSptX1-GwiB3MN2VpjTRfmVKD8TDZkhS4x5c,5818
|
|
90
90
|
fugue_dask/_utils.py,sha256=n70N3wPPMz13Jh0GWJM3Je-TCYpU36yGP_YCwIHqUrc,8908
|
|
91
91
|
fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
|
|
92
|
-
fugue_dask/execution_engine.py,sha256=
|
|
92
|
+
fugue_dask/execution_engine.py,sha256=PAClUP9lCdn2Aajt2AsoFOsgO-95WcdRDKkjNSbVbzA,20980
|
|
93
93
|
fugue_dask/ibis_engine.py,sha256=kQdaG_KlZZ2AjtYETNCdTJOgtwI_eH0aGzLaAiIBbRI,2120
|
|
94
94
|
fugue_dask/registry.py,sha256=7UTg_eie7zKlHYKMCyOo0TNn5y2TiIjE8kiS2PruHFc,2200
|
|
95
95
|
fugue_duckdb/__init__.py,sha256=nSNv-fxBAKD6W23EbMeV4dVRIaSTqr9DzQUWuVOES8s,379
|
|
96
|
-
fugue_duckdb/_io.py,sha256=
|
|
96
|
+
fugue_duckdb/_io.py,sha256=E35_GoD1uGuuAMOY4H8E2j-UazdAgTmLp4lLWqJrNsE,8437
|
|
97
97
|
fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
|
|
98
98
|
fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
|
|
99
99
|
fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
|
|
100
|
-
fugue_duckdb/execution_engine.py,sha256=
|
|
100
|
+
fugue_duckdb/execution_engine.py,sha256=IZDmSAtOMJGvulTStxjTmsqJyI5QRNyxBgSMlFMSrBI,20389
|
|
101
101
|
fugue_duckdb/ibis_engine.py,sha256=MrypeABozqwetKOpqtrmWvCJX2QPfBXhbSEhvK9vqmI,1990
|
|
102
102
|
fugue_duckdb/registry.py,sha256=Dj0Tng1cXVT6Q7t-KxOky2k1dD9xSBjYGQmI26UgZPo,3095
|
|
103
103
|
fugue_ibis/__init__.py,sha256=PcUt66KlLyGGicad7asq5j2U567_fhR0HzvWQBhV1VM,362
|
|
104
104
|
fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
|
|
105
105
|
fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
|
|
106
106
|
fugue_ibis/dataframe.py,sha256=0Fb1vJjwEeffgoUCDfDGIMuSFaPgUJqcB-JqJOAALfs,7789
|
|
107
|
-
fugue_ibis/execution_engine.py,sha256=
|
|
107
|
+
fugue_ibis/execution_engine.py,sha256=0GIjjMmitCKhjasAKFiFUCCUBNdxAiU0b61RsmFyhIk,18355
|
|
108
108
|
fugue_ibis/extensions.py,sha256=H8l-SPfoqLuUoILtOuL2nccOpoL83zHeSoIhoqjtWQM,6905
|
|
109
109
|
fugue_ibis/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
110
110
|
fugue_ibis/execution/ibis_engine.py,sha256=-HdPnIFWD83n5WITdzJiu4attH7GOcO041wkT5Y5ChA,1499
|
|
@@ -127,37 +127,39 @@ fugue_ray/registry.py,sha256=xJRAhbwNrg695EwghQDnVtTKi4YkqZ0_61BD4OAblSA,1685
|
|
|
127
127
|
fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
128
|
fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
|
|
129
129
|
fugue_ray/_utils/dataframe.py,sha256=_EadzS4rPom1A_cF0pqoPlwrNYZTfTwcyyu86_fFsqU,4400
|
|
130
|
-
fugue_ray/_utils/io.py,sha256=
|
|
130
|
+
fugue_ray/_utils/io.py,sha256=4FfPS2DMeIHvbzGoJ_iPvwwVr7lZHXRoJZxceNZ4EHQ,8647
|
|
131
131
|
fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
|
|
132
132
|
fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
|
|
133
133
|
fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
|
|
134
|
-
fugue_spark/execution_engine.py,sha256=
|
|
134
|
+
fugue_spark/execution_engine.py,sha256=KPmBtH4zioXdWsvnPow4fOPQh8Yj0cn6yCJyKdbT544,33023
|
|
135
135
|
fugue_spark/ibis_engine.py,sha256=Yl5xxwROo1idcD2hFaylaI1IpmBUgbvOZRWtcrE0Zjo,1697
|
|
136
136
|
fugue_spark/registry.py,sha256=kyIMk6dAiKRSKCHawQKyXu9DhZ24T6j3gL57TiOAZ8c,4162
|
|
137
137
|
fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
138
138
|
fugue_spark/_utils/convert.py,sha256=eRWkDYA4UO-FQu-2y4O80WEdawx7X_rIrWg55AlOiRc,10007
|
|
139
|
-
fugue_spark/_utils/io.py,sha256=
|
|
139
|
+
fugue_spark/_utils/io.py,sha256=OdUezKpB29Lx9aUS2k9x0xUAGZrmgMZyQYGPEeHk7rQ,5574
|
|
140
140
|
fugue_spark/_utils/misc.py,sha256=o8dZmXOHnA7D_ps37vgGXTPTiSEG9LQzPKq7l-MG-qM,860
|
|
141
141
|
fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618
|
|
142
142
|
fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
|
|
143
143
|
fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
|
|
144
144
|
fugue_test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
145
145
|
fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
|
|
146
|
-
fugue_test/builtin_suite.py,sha256=
|
|
146
|
+
fugue_test/builtin_suite.py,sha256=3uSY484Jl2985UoJravD4C-SlKBH0WwTWFobp4Pqgzg,78399
|
|
147
147
|
fugue_test/dataframe_suite.py,sha256=LgB931CkASbGOrRQ9j92DGk9wPb__FoNusOk-HeqU9E,19165
|
|
148
|
-
fugue_test/execution_suite.py,sha256=
|
|
148
|
+
fugue_test/execution_suite.py,sha256=ClZUYt2R560LN4DZM_OP9cA5jaHmz3-u_BC3A0C24fQ,47472
|
|
149
149
|
fugue_test/ibis_suite.py,sha256=Dk4AHVD00RcFsNm9VvJ4_4LOyFdGX30OnAtpO2SPruE,3529
|
|
150
150
|
fugue_test/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
151
151
|
fugue_test/plugins/dask/__init__.py,sha256=LQVyNgGRvZrKUrNNV1z1X1GyIL3qJoxvNjFfpFzNVCc,55
|
|
152
152
|
fugue_test/plugins/dask/fixtures.py,sha256=uf5gkO30L5-LvxpEpBjG4_bNUrpkemHvyVPxDHgMSGM,354
|
|
153
153
|
fugue_test/plugins/duckdb/__init__.py,sha256=WXtNYQpbO0JScPpIA3QREv8cwOZP2GDOgGOtJKgpTVM,61
|
|
154
154
|
fugue_test/plugins/duckdb/fixtures.py,sha256=UxQbIMRbSrTZ3pgCmKZgd5wd1YvnVrqLSUPaO8UYrSE,165
|
|
155
|
+
fugue_test/plugins/misc/__init__.py,sha256=0SZvyo0xlw5NDbJly4yjaNDqL9M4D2Jsg33HCWE40q8,49
|
|
156
|
+
fugue_test/plugins/misc/fixtures.py,sha256=GrD9WTTtcIDCWLHn-ToVv8pUiUCGCSczgs9bodWKo7c,353
|
|
155
157
|
fugue_test/plugins/ray/__init__.py,sha256=nyKGW6xgTXtMhSs7yjgFNKO7mVboCNg63Bvdf39fO_I,55
|
|
156
158
|
fugue_test/plugins/ray/fixtures.py,sha256=hZkvuo0AcD63XJl5JUroc9tm2LWHUPszg2zzY6FCSao,141
|
|
157
159
|
fugue_version/__init__.py,sha256=vTwvdJOZi8jZb9U-Em7-d50qNDNPS2z51IXqRoojeNM,22
|
|
158
|
-
fugue-0.8.7.
|
|
159
|
-
fugue-0.8.7.
|
|
160
|
-
fugue-0.8.7.
|
|
161
|
-
fugue-0.8.7.
|
|
162
|
-
fugue-0.8.7.
|
|
163
|
-
fugue-0.8.7.
|
|
160
|
+
fugue-0.8.7.dev7.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
161
|
+
fugue-0.8.7.dev7.dist-info/METADATA,sha256=nSp1i8apniEEe6U09_5RA8K89P40c7M5Gn9l6ofLTHQ,17860
|
|
162
|
+
fugue-0.8.7.dev7.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
|
|
163
|
+
fugue-0.8.7.dev7.dist-info/entry_points.txt,sha256=Xrl3ISyVKAFIPn1klqeGsL9DinzoYqfqBkOT4qAVBNA,578
|
|
164
|
+
fugue-0.8.7.dev7.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
|
|
165
|
+
fugue-0.8.7.dev7.dist-info/RECORD,,
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
dask = fugue_dask.registry [dask]
|
|
3
3
|
dask_ibis = fugue_dask.ibis_engine [dask,ibis]
|
|
4
4
|
duckdb = fugue_duckdb.registry [duckdb]
|
|
5
|
-
duckdb_ibis = fugue_duckdb.ibis_engine [duckdb
|
|
5
|
+
duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
|
|
6
6
|
ibis = fugue_ibis [ibis]
|
|
7
7
|
polars = fugue_polars.registry [polars]
|
|
8
8
|
ray = fugue_ray.registry [ray]
|
|
@@ -12,5 +12,6 @@ spark_ibis = fugue_spark.ibis_engine [spark,ibis]
|
|
|
12
12
|
[pytest11]
|
|
13
13
|
fugue_test_dask = fugue_test.plugins.dask [dask]
|
|
14
14
|
fugue_test_duckdb = fugue_test.plugins.duckdb [duckdb]
|
|
15
|
+
fugue_test_misc = fugue_test.plugins.misc
|
|
15
16
|
fugue_test_ray = fugue_test.plugins.ray [ray]
|
|
16
17
|
|
fugue_dask/_io.py
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
2
2
|
|
|
3
|
-
import fsspec
|
|
4
|
-
import fs as pfs
|
|
5
3
|
import pandas as pd
|
|
6
4
|
from dask import dataframe as dd
|
|
5
|
+
from fsspec import AbstractFileSystem
|
|
7
6
|
from triad.collections.dict import ParamDict
|
|
8
|
-
from triad.collections.fs import FileSystem
|
|
9
7
|
from triad.collections.schema import Schema
|
|
10
8
|
from triad.utils.assertion import assert_or_throw
|
|
9
|
+
from triad.utils.io import join, makedirs, url_to_fs
|
|
11
10
|
|
|
12
11
|
from fugue._utils.io import FileParser, _get_single_files
|
|
13
12
|
from fugue_dask.dataframe import DaskDataFrame
|
|
@@ -19,7 +18,7 @@ def load_df(
|
|
|
19
18
|
uri: Union[str, List[str]],
|
|
20
19
|
format_hint: Optional[str] = None,
|
|
21
20
|
columns: Any = None,
|
|
22
|
-
fs: Optional[
|
|
21
|
+
fs: Optional[AbstractFileSystem] = None,
|
|
23
22
|
**kwargs: Any,
|
|
24
23
|
) -> DaskDataFrame:
|
|
25
24
|
if isinstance(uri, str):
|
|
@@ -39,7 +38,7 @@ def save_df(
|
|
|
39
38
|
uri: str,
|
|
40
39
|
format_hint: Optional[str] = None,
|
|
41
40
|
mode: str = "overwrite",
|
|
42
|
-
fs: Optional[
|
|
41
|
+
fs: Optional[AbstractFileSystem] = None,
|
|
43
42
|
**kwargs: Any,
|
|
44
43
|
) -> None:
|
|
45
44
|
assert_or_throw(
|
|
@@ -48,16 +47,13 @@ def save_df(
|
|
|
48
47
|
)
|
|
49
48
|
p = FileParser(uri, format_hint).assert_no_glob()
|
|
50
49
|
if fs is None:
|
|
51
|
-
fs =
|
|
50
|
+
fs, _ = url_to_fs(uri)
|
|
52
51
|
if fs.exists(uri):
|
|
53
52
|
assert_or_throw(mode == "overwrite", FileExistsError(uri))
|
|
54
53
|
try:
|
|
55
|
-
fs.
|
|
56
|
-
except Exception:
|
|
57
|
-
|
|
58
|
-
fs.removetree(uri)
|
|
59
|
-
except Exception: # pragma: no cover
|
|
60
|
-
pass
|
|
54
|
+
fs.rm(uri, recursive=True)
|
|
55
|
+
except Exception: # pragma: no cover
|
|
56
|
+
pass
|
|
61
57
|
_FORMAT_SAVE[p.file_format](df, p, **kwargs)
|
|
62
58
|
|
|
63
59
|
|
|
@@ -67,7 +63,7 @@ def _save_parquet(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
|
|
|
67
63
|
"write_index": False,
|
|
68
64
|
**kwargs,
|
|
69
65
|
}
|
|
70
|
-
DASK_UTILS.to_parquet_friendly(df.native).to_parquet(p.
|
|
66
|
+
DASK_UTILS.to_parquet_friendly(df.native).to_parquet(p.path, **params)
|
|
71
67
|
|
|
72
68
|
|
|
73
69
|
def _load_parquet(
|
|
@@ -80,27 +76,26 @@ def _load_parquet(
|
|
|
80
76
|
if pd.__version__ >= "1.5":
|
|
81
77
|
dtype_backend = "pyarrow"
|
|
82
78
|
if columns is None:
|
|
83
|
-
pdf = dd.read_parquet(p.
|
|
79
|
+
pdf = dd.read_parquet(p.path, dtype_backend=dtype_backend, **params)
|
|
84
80
|
schema = Schema(pdf.head(1))
|
|
85
81
|
return pdf, schema
|
|
86
82
|
if isinstance(columns, list): # column names
|
|
87
83
|
pdf = dd.read_parquet(
|
|
88
|
-
p.
|
|
84
|
+
p.path, columns=columns, dtype_backend=dtype_backend, **params
|
|
89
85
|
)
|
|
90
86
|
schema = Schema(pdf.head(1))
|
|
91
87
|
return pdf, schema
|
|
92
88
|
schema = Schema(columns)
|
|
93
89
|
pdf = dd.read_parquet(
|
|
94
|
-
p.
|
|
90
|
+
p.path, columns=schema.names, dtype_backend=dtype_backend, **params
|
|
95
91
|
)
|
|
96
92
|
return pdf, schema
|
|
97
93
|
|
|
98
94
|
|
|
99
95
|
def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
|
|
100
|
-
|
|
101
|
-
fs.makedirs(path, exist_ok=True)
|
|
96
|
+
makedirs(p.path, exist_ok=True)
|
|
102
97
|
df.native.to_csv(
|
|
103
|
-
|
|
98
|
+
p.join("*.csv").path, **{"index": False, "header": False, **kwargs}
|
|
104
99
|
)
|
|
105
100
|
|
|
106
101
|
|
|
@@ -108,7 +103,7 @@ def _safe_load_csv(path: str, **kwargs: Any) -> dd.DataFrame:
|
|
|
108
103
|
try:
|
|
109
104
|
return dd.read_csv(path, **kwargs)
|
|
110
105
|
except (IsADirectoryError, PermissionError):
|
|
111
|
-
return dd.read_csv(
|
|
106
|
+
return dd.read_csv(join(path, "*.csv"), **kwargs)
|
|
112
107
|
|
|
113
108
|
|
|
114
109
|
def _load_csv( # noqa: C901
|
|
@@ -127,7 +122,7 @@ def _load_csv( # noqa: C901
|
|
|
127
122
|
header = kw["header"]
|
|
128
123
|
del kw["header"]
|
|
129
124
|
if str(header) in ["True", "0"]:
|
|
130
|
-
pdf = _safe_load_csv(p.
|
|
125
|
+
pdf = _safe_load_csv(p.path, **{"header": 0, **kw})
|
|
131
126
|
if columns is None:
|
|
132
127
|
return pdf, None
|
|
133
128
|
if isinstance(columns, list): # column names
|
|
@@ -138,34 +133,32 @@ def _load_csv( # noqa: C901
|
|
|
138
133
|
if columns is None:
|
|
139
134
|
raise ValueError("columns must be set if without header")
|
|
140
135
|
if isinstance(columns, list): # column names
|
|
141
|
-
pdf = _safe_load_csv(p.
|
|
136
|
+
pdf = _safe_load_csv(p.path, **{"header": None, "names": columns, **kw})
|
|
142
137
|
return pdf, None
|
|
143
138
|
schema = Schema(columns)
|
|
144
|
-
pdf = _safe_load_csv(p.
|
|
139
|
+
pdf = _safe_load_csv(p.path, **{"header": None, "names": schema.names, **kw})
|
|
145
140
|
return pdf, schema
|
|
146
141
|
else:
|
|
147
142
|
raise NotImplementedError(f"{header} is not supported")
|
|
148
143
|
|
|
149
144
|
|
|
150
145
|
def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
df.native.to_json(pfs.path.combine(p.uri, "*.json"), **kwargs)
|
|
146
|
+
makedirs(p.path, exist_ok=True)
|
|
147
|
+
df.native.to_json(p.join("*.json").path, **kwargs)
|
|
154
148
|
|
|
155
149
|
|
|
156
150
|
def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
|
|
157
151
|
try:
|
|
158
152
|
return dd.read_json(path, **kwargs)
|
|
159
153
|
except (IsADirectoryError, PermissionError):
|
|
160
|
-
x = dd.read_json(
|
|
161
|
-
print(x.compute())
|
|
154
|
+
x = dd.read_json(join(path, "*.json"), **kwargs)
|
|
162
155
|
return x
|
|
163
156
|
|
|
164
157
|
|
|
165
158
|
def _load_json(
|
|
166
159
|
p: FileParser, columns: Any = None, **kwargs: Any
|
|
167
160
|
) -> Tuple[dd.DataFrame, Any]:
|
|
168
|
-
pdf = _safe_load_json(p.
|
|
161
|
+
pdf = _safe_load_json(p.path, **kwargs).reset_index(drop=True)
|
|
169
162
|
if columns is None:
|
|
170
163
|
return pdf, None
|
|
171
164
|
if isinstance(columns, list): # column names
|
fugue_dask/execution_engine.py
CHANGED
|
@@ -7,18 +7,17 @@ import pandas as pd
|
|
|
7
7
|
from distributed import Client
|
|
8
8
|
from triad.collections import Schema
|
|
9
9
|
from triad.collections.dict import IndexedOrderedDict, ParamDict
|
|
10
|
-
from triad.collections.fs import FileSystem
|
|
11
10
|
from triad.utils.assertion import assert_or_throw
|
|
12
11
|
from triad.utils.hash import to_uuid
|
|
13
12
|
from triad.utils.pandas_like import PandasUtils
|
|
14
13
|
from triad.utils.threading import RunOnce
|
|
14
|
+
from triad.utils.io import makedirs
|
|
15
15
|
from fugue import StructuredRawSQL
|
|
16
16
|
from fugue.collections.partition import (
|
|
17
17
|
PartitionCursor,
|
|
18
18
|
PartitionSpec,
|
|
19
19
|
parse_presort_exp,
|
|
20
20
|
)
|
|
21
|
-
from fugue.exceptions import FugueBug
|
|
22
21
|
from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
|
|
23
22
|
from fugue.dataframe import (
|
|
24
23
|
AnyDataFrame,
|
|
@@ -28,6 +27,7 @@ from fugue.dataframe import (
|
|
|
28
27
|
PandasDataFrame,
|
|
29
28
|
)
|
|
30
29
|
from fugue.dataframe.utils import get_join_schemas
|
|
30
|
+
from fugue.exceptions import FugueBug
|
|
31
31
|
from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine
|
|
32
32
|
from fugue.execution.native_execution_engine import NativeExecutionEngine
|
|
33
33
|
from fugue_dask._constants import FUGUE_DASK_DEFAULT_CONF
|
|
@@ -206,7 +206,6 @@ class DaskExecutionEngine(ExecutionEngine):
|
|
|
206
206
|
p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
|
|
207
207
|
p.update(ParamDict(conf))
|
|
208
208
|
super().__init__(p)
|
|
209
|
-
self._fs = FileSystem()
|
|
210
209
|
self._log = logging.getLogger()
|
|
211
210
|
self._client = DASK_UTILS.get_or_create_client(dask_client)
|
|
212
211
|
self._native = NativeExecutionEngine(conf=conf)
|
|
@@ -227,10 +226,6 @@ class DaskExecutionEngine(ExecutionEngine):
|
|
|
227
226
|
def log(self) -> logging.Logger:
|
|
228
227
|
return self._log
|
|
229
228
|
|
|
230
|
-
@property
|
|
231
|
-
def fs(self) -> FileSystem:
|
|
232
|
-
return self._fs
|
|
233
|
-
|
|
234
229
|
def create_default_sql_engine(self) -> SQLEngine:
|
|
235
230
|
return DaskSQLEngine(self)
|
|
236
231
|
|
|
@@ -527,9 +522,7 @@ class DaskExecutionEngine(ExecutionEngine):
|
|
|
527
522
|
**kwargs: Any,
|
|
528
523
|
) -> DaskDataFrame:
|
|
529
524
|
return self.to_df(
|
|
530
|
-
load_df(
|
|
531
|
-
path, format_hint=format_hint, columns=columns, fs=self.fs, **kwargs
|
|
532
|
-
)
|
|
525
|
+
load_df(path, format_hint=format_hint, columns=columns, **kwargs)
|
|
533
526
|
)
|
|
534
527
|
|
|
535
528
|
def save_df(
|
|
@@ -556,9 +549,9 @@ class DaskExecutionEngine(ExecutionEngine):
|
|
|
556
549
|
else:
|
|
557
550
|
if not partition_spec.empty:
|
|
558
551
|
kwargs["partition_on"] = partition_spec.partition_by
|
|
559
|
-
|
|
552
|
+
makedirs(os.path.dirname(path), exist_ok=True)
|
|
560
553
|
df = self.to_df(df)
|
|
561
|
-
save_df(df, path, format_hint=format_hint, mode=mode,
|
|
554
|
+
save_df(df, path, format_hint=format_hint, mode=mode, **kwargs)
|
|
562
555
|
|
|
563
556
|
|
|
564
557
|
def to_dask_engine_df(df: Any, schema: Any = None) -> DaskDataFrame:
|
fugue_duckdb/_io.py
CHANGED
|
@@ -3,9 +3,9 @@ from typing import Any, Iterable, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
from duckdb import DuckDBPyConnection
|
|
5
5
|
from triad import ParamDict, Schema
|
|
6
|
-
from triad.collections.fs import FileSystem
|
|
7
|
-
from triad.utils.assertion import assert_or_throw
|
|
8
6
|
|
|
7
|
+
from triad.utils.assertion import assert_or_throw
|
|
8
|
+
from triad.utils.io import isdir, makedirs, rm, exists
|
|
9
9
|
from fugue._utils.io import FileParser, load_df, save_df
|
|
10
10
|
from fugue.collections.sql import TempTableName
|
|
11
11
|
from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame
|
|
@@ -18,26 +18,17 @@ from fugue_duckdb._utils import (
|
|
|
18
18
|
from fugue_duckdb.dataframe import DuckDataFrame
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def
|
|
22
|
-
fp: Iterable[FileParser], fs: FileSystem, fmt: str
|
|
23
|
-
) -> Iterable[FileParser]:
|
|
24
|
-
def _isdir(d: str) -> bool:
|
|
25
|
-
try:
|
|
26
|
-
return fs.isdir(d)
|
|
27
|
-
except Exception: # pragma: no cover
|
|
28
|
-
return False
|
|
29
|
-
|
|
21
|
+
def _get_files(fp: Iterable[FileParser], fmt: str) -> Iterable[FileParser]:
|
|
30
22
|
for f in fp:
|
|
31
|
-
if f.
|
|
32
|
-
yield f.
|
|
23
|
+
if not f.has_glob and isdir(f.path):
|
|
24
|
+
yield from f.join("*." + fmt, fmt).find_all()
|
|
33
25
|
else:
|
|
34
26
|
yield f
|
|
35
27
|
|
|
36
28
|
|
|
37
29
|
class DuckDBIO:
|
|
38
|
-
def __init__(self,
|
|
30
|
+
def __init__(self, con: DuckDBPyConnection) -> None:
|
|
39
31
|
self._con = con
|
|
40
|
-
self._fs = fs
|
|
41
32
|
self._format_load = {"csv": self._load_csv, "parquet": self._load_parquet}
|
|
42
33
|
self._format_save = {"csv": self._save_csv, "parquet": self._save_parquet}
|
|
43
34
|
|
|
@@ -55,11 +46,9 @@ class DuckDBIO:
|
|
|
55
46
|
else:
|
|
56
47
|
fp = [FileParser(u, format_hint) for u in uri]
|
|
57
48
|
if fp[0].file_format not in self._format_load:
|
|
58
|
-
return load_df(
|
|
59
|
-
uri, format_hint=format_hint, columns=columns, fs=self._fs, **kwargs
|
|
60
|
-
)
|
|
49
|
+
return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
|
|
61
50
|
dfs: List[DuckDataFrame] = []
|
|
62
|
-
for f in
|
|
51
|
+
for f in _get_files(fp, fp[0].file_format):
|
|
63
52
|
df = self._format_load[f.file_format](f, columns, **kwargs)
|
|
64
53
|
dfs.append(df)
|
|
65
54
|
rel = dfs[0].native
|
|
@@ -83,26 +72,20 @@ class DuckDBIO:
|
|
|
83
72
|
)
|
|
84
73
|
p = FileParser(uri, format_hint).assert_no_glob()
|
|
85
74
|
if (p.file_format not in self._format_save) or ("partition_cols" in kwargs):
|
|
86
|
-
|
|
75
|
+
makedirs(os.path.dirname(uri), exist_ok=True)
|
|
87
76
|
ldf = ArrowDataFrame(df.as_arrow())
|
|
88
|
-
return save_df(
|
|
89
|
-
|
|
90
|
-
)
|
|
91
|
-
fs = self._fs
|
|
92
|
-
if fs.exists(uri):
|
|
77
|
+
return save_df(ldf, uri=uri, format_hint=format_hint, mode=mode, **kwargs)
|
|
78
|
+
if exists(uri):
|
|
93
79
|
assert_or_throw(mode == "overwrite", FileExistsError(uri))
|
|
94
80
|
try:
|
|
95
|
-
|
|
96
|
-
except Exception:
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
except Exception: # pragma: no cover
|
|
100
|
-
pass
|
|
101
|
-
if not fs.exists(p.parent):
|
|
102
|
-
fs.makedirs(p.parent, recreate=True)
|
|
81
|
+
rm(uri, recursive=True)
|
|
82
|
+
except Exception: # pragma: no cover
|
|
83
|
+
pass
|
|
84
|
+
p.make_parent_dirs()
|
|
103
85
|
self._format_save[p.file_format](df, p, **kwargs)
|
|
104
86
|
|
|
105
87
|
def _save_csv(self, df: DuckDataFrame, p: FileParser, **kwargs: Any):
|
|
88
|
+
p.assert_no_glob()
|
|
106
89
|
dn = TempTableName()
|
|
107
90
|
df.native.create_view(dn.key)
|
|
108
91
|
kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
|
|
@@ -111,7 +94,7 @@ class DuckDBIO:
|
|
|
111
94
|
for k, v in kw.items():
|
|
112
95
|
params.append(f"{k.upper()} " + encode_value_to_expr(v))
|
|
113
96
|
pm = ", ".join(params)
|
|
114
|
-
query = f"COPY {dn.key} TO {encode_value_to_expr(p.
|
|
97
|
+
query = f"COPY {dn.key} TO {encode_value_to_expr(p.path)} WITH ({pm})"
|
|
115
98
|
self._con.execute(query)
|
|
116
99
|
|
|
117
100
|
def _load_csv( # noqa: C901
|
|
@@ -125,7 +108,7 @@ class DuckDBIO:
|
|
|
125
108
|
ValueError("when csv has no header, columns must be specified"),
|
|
126
109
|
)
|
|
127
110
|
kw.pop("auto_detect", None)
|
|
128
|
-
params: List[str] = [encode_value_to_expr(p.
|
|
111
|
+
params: List[str] = [encode_value_to_expr(p.path)]
|
|
129
112
|
kw["header"] = 1 if header else 0
|
|
130
113
|
kw["auto_detect"] = 1 if infer_schema else 0
|
|
131
114
|
if infer_schema:
|
|
@@ -188,6 +171,7 @@ class DuckDBIO:
|
|
|
188
171
|
return DuckDataFrame(self._con.from_query(query))
|
|
189
172
|
|
|
190
173
|
def _save_parquet(self, df: DuckDataFrame, p: FileParser, **kwargs: Any):
|
|
174
|
+
p.assert_no_glob()
|
|
191
175
|
dn = TempTableName()
|
|
192
176
|
df.native.create_view(dn.key)
|
|
193
177
|
kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
|
|
@@ -196,7 +180,7 @@ class DuckDBIO:
|
|
|
196
180
|
for k, v in kw.items():
|
|
197
181
|
params.append(f"{k.upper()} " + encode_value_to_expr(v))
|
|
198
182
|
pm = ", ".join(params)
|
|
199
|
-
query = f"COPY {dn.key} TO {encode_value_to_expr(p.
|
|
183
|
+
query = f"COPY {dn.key} TO {encode_value_to_expr(p.path)}"
|
|
200
184
|
if len(params) > 0:
|
|
201
185
|
query += f" WITH ({pm})"
|
|
202
186
|
self._con.execute(query)
|
|
@@ -205,7 +189,7 @@ class DuckDBIO:
|
|
|
205
189
|
self, p: FileParser, columns: Any = None, **kwargs: Any
|
|
206
190
|
) -> DuckDataFrame:
|
|
207
191
|
kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
|
|
208
|
-
params: List[str] = [encode_value_to_expr(p.
|
|
192
|
+
params: List[str] = [encode_value_to_expr(p.path)]
|
|
209
193
|
if isinstance(columns, list):
|
|
210
194
|
cols = ", ".join(encode_column_names(columns))
|
|
211
195
|
else:
|
fugue_duckdb/execution_engine.py
CHANGED
|
@@ -4,7 +4,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
|
|
|
4
4
|
import duckdb
|
|
5
5
|
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
6
6
|
from triad import SerializableRLock
|
|
7
|
-
from triad.collections.fs import FileSystem
|
|
8
7
|
from triad.utils.assertion import assert_or_throw
|
|
9
8
|
from triad.utils.schema import quote_name
|
|
10
9
|
|
|
@@ -195,10 +194,6 @@ class DuckExecutionEngine(ExecutionEngine):
|
|
|
195
194
|
def log(self) -> logging.Logger:
|
|
196
195
|
return self._native_engine.log
|
|
197
196
|
|
|
198
|
-
@property
|
|
199
|
-
def fs(self) -> FileSystem:
|
|
200
|
-
return self._native_engine.fs
|
|
201
|
-
|
|
202
197
|
def create_default_sql_engine(self) -> SQLEngine:
|
|
203
198
|
return DuckDBEngine(self)
|
|
204
199
|
|
|
@@ -488,7 +483,7 @@ class DuckExecutionEngine(ExecutionEngine):
|
|
|
488
483
|
columns: Any = None,
|
|
489
484
|
**kwargs: Any,
|
|
490
485
|
) -> LocalBoundedDataFrame:
|
|
491
|
-
dio = DuckDBIO(self.
|
|
486
|
+
dio = DuckDBIO(self.connection)
|
|
492
487
|
return dio.load_df(path, format_hint, columns, **kwargs)
|
|
493
488
|
|
|
494
489
|
def save_df(
|
|
@@ -504,7 +499,7 @@ class DuckExecutionEngine(ExecutionEngine):
|
|
|
504
499
|
partition_spec = partition_spec or PartitionSpec()
|
|
505
500
|
if not partition_spec.empty and not force_single:
|
|
506
501
|
kwargs["partition_cols"] = partition_spec.partition_by
|
|
507
|
-
dio = DuckDBIO(self.
|
|
502
|
+
dio = DuckDBIO(self.connection)
|
|
508
503
|
dio.save_df(_to_duck_df(self, df), path, format_hint, mode, **kwargs)
|
|
509
504
|
|
|
510
505
|
def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame:
|
fugue_ibis/execution_engine.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Type
|
|
|
5
5
|
|
|
6
6
|
import ibis
|
|
7
7
|
from ibis import BaseBackend
|
|
8
|
-
from triad import
|
|
8
|
+
from triad import assert_or_throw
|
|
9
9
|
|
|
10
10
|
from fugue import StructuredRawSQL
|
|
11
11
|
from fugue.bag import Bag, LocalBag
|
|
@@ -375,10 +375,6 @@ class IbisExecutionEngine(ExecutionEngine):
|
|
|
375
375
|
def log(self) -> logging.Logger:
|
|
376
376
|
return self.non_ibis_engine.log
|
|
377
377
|
|
|
378
|
-
@property
|
|
379
|
-
def fs(self) -> FileSystem:
|
|
380
|
-
return self.non_ibis_engine.fs
|
|
381
|
-
|
|
382
378
|
def get_current_parallelism(self) -> int:
|
|
383
379
|
return self.non_ibis_engine.get_current_parallelism()
|
|
384
380
|
|
fugue_ray/_utils/io.py
CHANGED
|
@@ -4,23 +4,24 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
|
|
4
4
|
|
|
5
5
|
import pyarrow as pa
|
|
6
6
|
import ray.data as rd
|
|
7
|
-
from fugue import ExecutionEngine
|
|
8
|
-
from fugue._utils.io import FileParser, save_df
|
|
9
|
-
from fugue.collections.partition import PartitionSpec
|
|
10
|
-
from fugue.dataframe import DataFrame
|
|
11
|
-
from fugue_ray.dataframe import RayDataFrame
|
|
12
7
|
from pyarrow import csv as pacsv
|
|
13
8
|
from pyarrow import json as pajson
|
|
14
9
|
from ray.data.datasource import FileExtensionFilter
|
|
15
10
|
from triad.collections import Schema
|
|
16
11
|
from triad.collections.dict import ParamDict
|
|
17
12
|
from triad.utils.assertion import assert_or_throw
|
|
13
|
+
from triad.utils.io import exists, makedirs, rm
|
|
14
|
+
|
|
15
|
+
from fugue import ExecutionEngine
|
|
16
|
+
from fugue._utils.io import FileParser, save_df
|
|
17
|
+
from fugue.collections.partition import PartitionSpec
|
|
18
|
+
from fugue.dataframe import DataFrame
|
|
19
|
+
from fugue_ray.dataframe import RayDataFrame
|
|
18
20
|
|
|
19
21
|
|
|
20
22
|
class RayIO(object):
|
|
21
23
|
def __init__(self, engine: ExecutionEngine):
|
|
22
24
|
self._engine = engine
|
|
23
|
-
self._fs = engine.fs
|
|
24
25
|
self._logger = engine.log
|
|
25
26
|
self._loads: Dict[str, Callable[..., DataFrame]] = {
|
|
26
27
|
"csv": self._load_csv,
|
|
@@ -49,7 +50,7 @@ class RayIO(object):
|
|
|
49
50
|
len(fmts) == 1, NotImplementedError("can't support multiple formats")
|
|
50
51
|
)
|
|
51
52
|
fmt = fmts[0]
|
|
52
|
-
files = [f.
|
|
53
|
+
files = [f.path for f in fp]
|
|
53
54
|
return self._loads[fmt](files, columns, **kwargs)
|
|
54
55
|
|
|
55
56
|
def save_df(
|
|
@@ -63,24 +64,21 @@ class RayIO(object):
|
|
|
63
64
|
**kwargs: Any,
|
|
64
65
|
) -> None:
|
|
65
66
|
partition_spec = partition_spec or PartitionSpec()
|
|
66
|
-
if
|
|
67
|
+
if exists(uri):
|
|
67
68
|
assert_or_throw(mode == "overwrite", FileExistsError(uri))
|
|
68
69
|
try:
|
|
69
|
-
|
|
70
|
-
except Exception:
|
|
71
|
-
|
|
72
|
-
self._fs.removetree(uri)
|
|
73
|
-
except Exception: # pragma: no cover
|
|
74
|
-
pass
|
|
70
|
+
rm(uri, recursive=True)
|
|
71
|
+
except Exception: # pragma: no cover
|
|
72
|
+
pass
|
|
75
73
|
p = FileParser(uri, format_hint)
|
|
76
74
|
if not force_single:
|
|
77
75
|
df = self._prepartition(df, partition_spec=partition_spec)
|
|
78
76
|
|
|
79
|
-
self._saves[p.file_format](df=df, uri=p.
|
|
77
|
+
self._saves[p.file_format](df=df, uri=p.path, **kwargs)
|
|
80
78
|
else:
|
|
81
79
|
ldf = df.as_local()
|
|
82
|
-
|
|
83
|
-
save_df(ldf, uri, format_hint=format_hint, mode=mode,
|
|
80
|
+
makedirs(os.path.dirname(uri), exist_ok=True)
|
|
81
|
+
save_df(ldf, uri, format_hint=format_hint, mode=mode, **kwargs)
|
|
84
82
|
|
|
85
83
|
def _save_parquet(
|
|
86
84
|
self,
|