digitalhub 0.10.0b1__py3-none-any.whl → 0.10.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of digitalhub might be problematic.
- digitalhub/entities/dataitem/_base/entity.py +0 -41
- digitalhub/entities/dataitem/crud.py +14 -3
- digitalhub/entities/dataitem/table/entity.py +49 -35
- digitalhub/entities/dataitem/utils.py +40 -1
- digitalhub/readers/{_base → data/_base}/builder.py +1 -1
- digitalhub/readers/{_base → data/_base}/reader.py +16 -4
- digitalhub/readers/{api.py → data/api.py} +2 -2
- digitalhub/readers/{factory.py → data/factory.py} +3 -3
- digitalhub/readers/data/pandas/__init__.py +0 -0
- digitalhub/readers/{pandas → data/pandas}/builder.py +2 -2
- digitalhub/readers/{pandas → data/pandas}/reader.py +56 -24
- digitalhub/readers/query/__init__.py +0 -0
- digitalhub/stores/_base/store.py +60 -23
- digitalhub/stores/local/store.py +101 -71
- digitalhub/stores/remote/store.py +81 -0
- digitalhub/stores/s3/configurator.py +3 -2
- digitalhub/stores/s3/store.py +144 -41
- digitalhub/stores/sql/store.py +90 -30
- {digitalhub-0.10.0b1.dist-info → digitalhub-0.10.0b3.dist-info}/METADATA +27 -29
- {digitalhub-0.10.0b1.dist-info → digitalhub-0.10.0b3.dist-info}/RECORD +25 -34
- {digitalhub-0.10.0b1.dist-info → digitalhub-0.10.0b3.dist-info}/WHEEL +1 -2
- digitalhub-0.10.0b1.dist-info/top_level.txt +0 -2
- test/local/CRUD/test_artifacts.py +0 -96
- test/local/CRUD/test_dataitems.py +0 -96
- test/local/CRUD/test_models.py +0 -95
- test/local/imports/test_imports.py +0 -65
- test/local/instances/test_validate.py +0 -55
- test/test_crud_functions.py +0 -109
- test/test_crud_runs.py +0 -86
- test/test_crud_tasks.py +0 -81
- test/testkfp.py +0 -37
- test/testkfp_pipeline.py +0 -22
- /digitalhub/readers/{_base → data}/__init__.py +0 -0
- /digitalhub/readers/{pandas → data/_base}/__init__.py +0 -0
- /digitalhub/readers/{pandas → data/pandas}/enums.py +0 -0
- {digitalhub-0.10.0b1.dist-info → digitalhub-0.10.0b3.dist-info/licenses}/LICENSE.txt +0 -0
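Most of the churn in this listing comes from the readers package move: modules under digitalhub/readers/ now live under digitalhub/readers/data/, and the store classes gain read_df and query methods alongside the relocated write_df (see the per-file hunks below). A minimal import-migration sketch, assuming nothing changed beyond the paths shown here and that pandas objects remain supported by the reader factory:

import pandas as pd

# old path, removed in 0.10.0b3: from digitalhub.readers.api import get_reader_by_object
from digitalhub.readers.data.api import get_reader_by_object

df = pd.DataFrame({"a": [1, 2, 3]})
reader = get_reader_by_object(df)  # resolves the reader registered for this dataframe type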
digitalhub/stores/local/store.py
CHANGED
@@ -4,7 +4,7 @@ import shutil
 from pathlib import Path
 from typing import Any
 
-from digitalhub.readers.api import get_reader_by_object
+from digitalhub.readers.data.api import get_reader_by_object
 from digitalhub.stores._base.store import Store
 from digitalhub.utils.exceptions import StoreError
 from digitalhub.utils.file_utils import get_file_info_from_local
@@ -82,6 +82,106 @@ class LocalStore(Store):
         """
         return [get_file_info_from_local(p) for p in paths]
 
+    ##############################
+    # Datastore methods
+    ##############################
+
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Read DataFrame from path.
+
+        Parameters
+        ----------
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+
+        dfs = []
+        if isinstance(path, list):
+            for p in path:
+                file_format = self._get_extension(file_format, p)
+                dfs.append(reader.read_df(p, file_format, **kwargs))
+        elif Path(path).is_dir():
+            import glob
+
+            paths = glob.glob(f"{path}/*")
+            for p in paths:
+                file_format = self._get_extension(file_format, p)
+                dfs.append(reader.read_df(p, file_format, **kwargs))
+        else:
+            file_format = self._get_extension(file_format, path)
+            dfs.append(reader.read_df(path, file_format, **kwargs))
+
+        if len(dfs) == 1:
+            return dfs[0]
+
+        return reader.concat_dfs(dfs)
+
+    def query(
+        self,
+        query: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
+
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        raise StoreError("Local store does not support query.")
+
+    def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
+        """
+        Method to write a dataframe to a file. Kwargs are passed to df.to_parquet().
+        If destination is not provided, the dataframe is written to the default
+        store path with generated name.
+
+        Parameters
+        ----------
+        df : Any
+            The dataframe to write.
+        dst : str
+            The destination of the dataframe.
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        str
+            Path of written dataframe.
+        """
+        self._check_local_dst(dst)
+        reader = get_reader_by_object(df)
+        reader.write_df(df, dst, extension=extension, **kwargs)
+        return dst
+
     ##############################
     # Private I/O methods
     ##############################
@@ -183,73 +283,3 @@ class LocalStore(Store):
             dst = dst / src
         self._build_path(dst)
         return dst
-
-    ##############################
-    # Datastore methods
-    ##############################
-
-    def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
-        """
-        Method to write a dataframe to a file. Kwargs are passed to df.to_parquet().
-        If destination is not provided, the dataframe is written to the default
-        store path with generated name.
-
-        Parameters
-        ----------
-        df : Any
-            The dataframe to write.
-        dst : str
-            The destination of the dataframe.
-        **kwargs : dict
-            Keyword arguments.
-
-        Returns
-        -------
-        str
-            Path of written dataframe.
-        """
-        self._check_local_dst(dst)
-        reader = get_reader_by_object(df)
-        reader.write_df(df, dst, extension=extension, **kwargs)
-        return dst
-
-    ##############################
-    # Helper methods
-    ##############################
-
-    @staticmethod
-    def is_partition_or_dir(path: str) -> bool:
-        """
-        Check if path is a directory or a partition.
-
-        Parameters
-        ----------
-        path : str
-            The path to check.
-
-        Returns
-        -------
-        bool
-        """
-        return Path(path).is_dir()
-
-    @staticmethod
-    def build_object_path(root: str, paths: str | list[str]) -> list[str]:
-        """
-        Method to build object path.
-
-        Parameters
-        ----------
-        root : str
-            The root of the object path.
-        paths : str | list[str]
-            The path to build.
-
-        Returns
-        -------
-        list[str]
-            Returns the path of the object.
-        """
-        if isinstance(paths, str):
-            paths = [paths]
-        return [str(Path(root) / path) for path in paths]
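The LocalStore hunks above add read_df, which accepts a single path, a list of paths, or a directory and delegates parsing to the reader chosen by the engine argument, while query is explicitly unsupported. A hedged usage sketch based only on the signatures in this diff; how a LocalStore instance is obtained is not part of the hunk, so store is assumed to exist and the file paths are placeholders:

# store: an already-constructed LocalStore (construction not shown in this diff)
df = store.read_df("data/measurements.csv", file_format="csv", engine="pandas")

# A directory is globbed and the per-file frames are concatenated by the reader.
df_all = store.read_df("data/partitioned_output/")

# query() raises StoreError("Local store does not support query.")
# store.query("SELECT 1")

# write_df() resolves the reader from the dataframe object itself and returns the destination path.
out = store.write_df(df, "data/out.parquet", extension="parquet")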
digitalhub/stores/remote/store.py
CHANGED
@@ -97,6 +97,58 @@ class RemoteStore(Store):
     # Datastore methods
     ##############################
 
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Read DataFrame from path.
+
+        Parameters
+        ----------
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+        extension = self._head_extension(path, file_format)
+        return reader.read_df(path, extension, **kwargs)
+
+    def query(
+        self,
+        query: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
+
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        raise StoreError("Remote store does not support query.")
+
     def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
         """
         Method to write a dataframe to a file. Note that this method is not implemented
@@ -160,3 +212,32 @@ class RemoteStore(Store):
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
         return str(dst)
+
+    def _head_extension(self, url: str, file_format: str | None = None) -> str:
+        """
+        Method to get the extension of a file from a given url.
+
+        Parameters
+        ----------
+        url : str
+            The url of the file to get the extension.
+        file_format : str
+            The file format to check.
+
+        Returns
+        -------
+        str
+            File extension.
+        """
+        if file_format is not None:
+            return file_format
+        try:
+            r = requests.head(url, timeout=60)
+            r.raise_for_status()
+            content_type = r.headers["content-type"]
+            if "text" in content_type:
+                return "csv"
+            else:
+                raise ValueError("Content type not supported.")
+        except Exception as e:
+            raise e
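RemoteStore.read_df infers the file format with an HTTP HEAD request when file_format is not given; per the _head_extension hunk above, only text content types are accepted and mapped to CSV. A standalone sketch of that inference using the requests calls visible in the diff (the function name guess_remote_format is hypothetical):

import requests

def guess_remote_format(url: str, file_format: str | None = None) -> str:
    # An explicit format wins; otherwise a HEAD request decides,
    # and only text/* content types are treated as CSV.
    if file_format is not None:
        return file_format
    r = requests.head(url, timeout=60)
    r.raise_for_status()
    if "text" in r.headers["content-type"]:
        return "csv"
    raise ValueError("Content type not supported.")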
digitalhub/stores/s3/configurator.py
CHANGED
@@ -51,9 +51,10 @@ class S3StoreConfigurator:
         ]:
             configurator.set_credential(*pair)
 
-    def
+    def get_boto3_client_config(self) -> dict:
         """
-        Get
+        Get S3 credentials (access key, secret key,
+        session token and other config).
 
         Returns
         -------
digitalhub/stores/s3/store.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import typing
 from io import BytesIO
 from pathlib import Path
 from typing import Any, Type
@@ -9,13 +10,16 @@ import boto3
 import botocore.client  # pylint: disable=unused-import
 from botocore.exceptions import ClientError
 
-from digitalhub.readers.api import get_reader_by_object
+from digitalhub.readers.data.api import get_reader_by_object
 from digitalhub.stores._base.store import Store
 from digitalhub.stores.s3.configurator import S3StoreConfigurator
 from digitalhub.stores.s3.utils import get_bucket_name
 from digitalhub.utils.exceptions import StoreError
 from digitalhub.utils.file_utils import get_file_info_from_s3, get_file_mime_type
 
+if typing.TYPE_CHECKING:
+    pass
+
 # Type aliases
 S3Client = Type["botocore.client.S3"]
 
@@ -228,12 +232,121 @@ class S3Store(Store):
 
         return infos
 
+    ##############################
+    # Datastore methods
+    ##############################
+
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Read DataFrame from path.
+
+        Parameters
+        ----------
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+
+        # Verify if partition or single file
+        if self.is_partition(path):
+            client, bucket = self._check_factory(path)
+            objects = self._list_objects(client, bucket, path)
+            keys = [self._get_key(o) for o in objects]
+
+        else:
+            if isinstance(path, list):
+                client, bucket = self._check_factory(path[0])
+                keys = [self._get_key(p) for p in path]
+            else:
+                client, bucket = self._check_factory(path)
+                keys = [self._get_key(path)]
+
+        dfs = []
+        for key in keys:
+            file_format = self._get_extension(file_format, key)
+            obj = self._download_fileobject(key, client, bucket)
+            dfs.append(reader.read_df(obj, extension=file_format, **kwargs))
+
+        if len(dfs) == 1:
+            return dfs[0]
+        return reader.concat_dfs(dfs)
+
+    def query(
+        self,
+        query: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
+
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        raise StoreError("S3 store does not support query.")
+
+    def write_df(
+        self,
+        df: Any,
+        dst: str,
+        extension: str | None = None,
+        **kwargs,
+    ) -> str:
+        """
+        Write a dataframe to S3 based storage. Kwargs are passed to df.to_parquet().
+
+        Parameters
+        ----------
+        df : Any
+            The dataframe.
+        dst : str
+            The destination path on S3 based storage.
+        extension : str
+            The extension of the file.
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        str
+            The S3 path where the dataframe was saved.
+        """
+        fileobj = BytesIO()
+        reader = get_reader_by_object(df)
+        reader.write_df(df, fileobj, extension=extension, **kwargs)
+        return self.upload_fileobject(fileobj, dst)
+
     ##############################
     # Private I/O methods
     ##############################
 
+    @staticmethod
     def _download_file(
-        self,
         key: str,
         dst_pth: Path,
         client: S3Client,
@@ -244,8 +357,8 @@ class S3Store(Store):
 
         Parameters
         ----------
-
-            The
+        key : str
+            The key to be downloaded.
         dst_pth : str
             The destination of the files on local filesystem.
         client : S3Client
@@ -258,9 +371,34 @@ class S3Store(Store):
         list[str]
             The list of paths of the downloaded files.
         """
-        # Download file
         client.download_file(bucket, key, dst_pth)
 
+    @staticmethod
+    def _download_fileobject(
+        key: str,
+        client: S3Client,
+        bucket: str,
+    ) -> BytesIO:
+        """
+        Download fileobject from S3 partition.
+
+        Parameters
+        ----------
+        key : str
+            The key of the file.
+        client : S3Client
+            The S3 client object.
+        bucket : str
+            The name of the S3 bucket.
+
+        Returns
+        -------
+        BytesIO
+            The fileobject of the downloaded file.
+        """
+        obj = client.get_object(Bucket=bucket, Key=key)
+        return BytesIO(obj["Body"].read())
+
     def _upload_dir(
         self,
         src: str,
@@ -437,41 +575,6 @@ class S3Store(Store):
         """
         client.put_object(Bucket=bucket, Key=key, Body=fileobj.getvalue())
 
-    ##############################
-    # Datastore methods
-    ##############################
-
-    def write_df(
-        self,
-        df: Any,
-        dst: str,
-        extension: str | None = None,
-        **kwargs,
-    ) -> str:
-        """
-        Write a dataframe to S3 based storage. Kwargs are passed to df.to_parquet().
-
-        Parameters
-        ----------
-        df : Any
-            The dataframe.
-        dst : str
-            The destination path on S3 based storage.
-        extension : str
-            The extension of the file.
-        **kwargs : dict
-            Keyword arguments.
-
-        Returns
-        -------
-        str
-            The S3 path where the dataframe was saved.
-        """
-        fileobj = BytesIO()
-        reader = get_reader_by_object(df)
-        reader.write_df(df, fileobj, extension=extension, **kwargs)
-        return self.upload_fileobject(fileobj, dst)
-
     ##############################
     # Helper methods
     ##############################
@@ -496,7 +599,7 @@ class S3Store(Store):
         S3Client
             Returns a client object that interacts with the S3 storage service.
         """
-        cfg = self._configurator.
+        cfg = self._configurator.get_boto3_client_config()
         return boto3.client("s3", **cfg)
 
     def _check_factory(self, root: str) -> tuple[S3Client, str]:
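S3Store.read_df now streams each object into memory via _download_fileobject and hands the buffer straight to the reader instead of staging files on disk. A rough equivalent with plain boto3 and pandas; bucket, key and credentials are placeholders, whereas the real code resolves them through _check_factory and the configurator:

from io import BytesIO

import boto3
import pandas as pd

client = boto3.client("s3")  # credentials resolved from the environment in this sketch

# Same pattern as _download_fileobject: fetch the object body into a buffer...
obj = client.get_object(Bucket="my-bucket", Key="datasets/sample.parquet")
buffer = BytesIO(obj["Body"].read())

# ...then let the dataframe engine parse the in-memory buffer.
df = pd.read_parquet(buffer)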
digitalhub/stores/sql/store.py
CHANGED
@@ -10,7 +10,7 @@ from sqlalchemy import MetaData, Table, create_engine, select
 from sqlalchemy.engine import Engine
 from sqlalchemy.exc import SQLAlchemyError
 
-from digitalhub.readers.api import get_reader_by_object
+from digitalhub.readers.data.api import get_reader_by_object
 from digitalhub.stores._base.store import Store
 from digitalhub.stores.sql.configurator import SqlStoreConfigurator
 from digitalhub.utils.exceptions import StoreError
@@ -119,50 +119,72 @@ class SqlStore(Store):
         return []
 
     ##############################
-    #
+    # Datastore methods
     ##############################
 
-    def
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
         """
-
+        Read DataFrame from path.
 
         Parameters
         ----------
-
-
-
-
-
-
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
 
         Returns
         -------
-
-
+        Any
+            DataFrame.
         """
-
+        reader = self._get_reader(engine)
+        schema = self._get_schema(path)
+        table = self._get_table_name(path)
+        sql_engine = self._check_factory(schema=schema)
 
-
-        sa_table = Table(table, MetaData(), autoload_with=engine)
+        sa_table = Table(table, MetaData(), autoload_with=sql_engine)
         stm = select(sa_table)
-        with engine.begin() as conn:
-            result: list[Row] = conn.execute(stm).fetchall()
 
-
-        data = {col: [row[idx] for row in result] for idx, col in enumerate(sa_table.columns.keys())}
-
-        # Convert the result to a pyarrow table and
-        # write the pyarrow table to a Parquet file
-        arrow_table = pa.Table.from_pydict(data)
-        pq.write_table(arrow_table, dst)
+        return reader.read_table(stm, sql_engine, **kwargs)
 
-
+    def query(
+        self,
+        query: str,
+        path: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
 
-
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        path : str
+            Path to the database.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
 
-
-
-
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+        schema = self._get_schema(path)
+        sql_engine = self._check_factory(schema=schema)
+        return reader.read_table(query, sql_engine)
 
     def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
         """
@@ -187,9 +209,47 @@ class SqlStore(Store):
         return self._upload_table(df, schema, table, **kwargs)
 
     ##############################
-    # Private
+    # Private I/O methods
     ##############################
 
+    def _download_table(self, schema: str, table: str, dst: str) -> str:
+        """
+        Download a table from SQL based storage.
+
+        Parameters
+        ----------
+        schema : str
+            The origin schema.
+        table : str
+            The origin table.
+        dst : str
+            The destination path.
+
+        Returns
+        -------
+        str
+            The destination path.
+        """
+        engine = self._check_factory(schema=schema)
+
+        # Read the table from the database
+        sa_table = Table(table, MetaData(), autoload_with=engine)
+        stm = select(sa_table)
+        with engine.begin() as conn:
+            result: list[Row] = conn.execute(stm).fetchall()
+
+        # Parse the result
+        data = {col: [row[idx] for row in result] for idx, col in enumerate(sa_table.columns.keys())}
+
+        # Convert the result to a pyarrow table and
+        # write the pyarrow table to a Parquet file
+        arrow_table = pa.Table.from_pydict(data)
+        pq.write_table(arrow_table, dst)
+
+        engine.dispose()
+
+        return dst
+
     def _upload_table(self, df: Any, schema: str, table: str, **kwargs) -> str:
         """
         Upload a table to SQL based storage.