digitalhub 0.10.0b1__py3-none-any.whl → 0.10.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of digitalhub was flagged as potentially problematic.

@@ -4,7 +4,7 @@ import shutil
 from pathlib import Path
 from typing import Any
 
-from digitalhub.readers.api import get_reader_by_object
+from digitalhub.readers.data.api import get_reader_by_object
 from digitalhub.stores._base.store import Store
 from digitalhub.utils.exceptions import StoreError
 from digitalhub.utils.file_utils import get_file_info_from_local
@@ -82,6 +82,106 @@ class LocalStore(Store):
         """
         return [get_file_info_from_local(p) for p in paths]
 
+    ##############################
+    # Datastore methods
+    ##############################
+
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Read DataFrame from path.
+
+        Parameters
+        ----------
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+
+        dfs = []
+        if isinstance(path, list):
+            for p in path:
+                file_format = self._get_extension(file_format, p)
+                dfs.append(reader.read_df(p, file_format, **kwargs))
+        elif Path(path).is_dir():
+            import glob
+
+            paths = glob.glob(f"{path}/*")
+            for p in paths:
+                file_format = self._get_extension(file_format, p)
+                dfs.append(reader.read_df(p, file_format, **kwargs))
+        else:
+            file_format = self._get_extension(file_format, path)
+            dfs.append(reader.read_df(path, file_format, **kwargs))
+
+        if len(dfs) == 1:
+            return dfs[0]
+
+        return reader.concat_dfs(dfs)
+
+    def query(
+        self,
+        query: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
+
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        raise StoreError("Local store does not support query.")
+
+    def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
+        """
+        Method to write a dataframe to a file. Kwargs are passed to df.to_parquet().
+        If destination is not provided, the dataframe is written to the default
+        store path with generated name.
+
+        Parameters
+        ----------
+        df : Any
+            The dataframe to write.
+        dst : str
+            The destination of the dataframe.
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        str
+            Path of written dataframe.
+        """
+        self._check_local_dst(dst)
+        reader = get_reader_by_object(df)
+        reader.write_df(df, dst, extension=extension, **kwargs)
+        return dst
+
     ##############################
     # Private I/O methods
     ##############################
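
As a point of reference (not part of the diff itself), the new LocalStore.read_df accepts a single path, a list of paths, or a directory, reads each file through the pluggable reader, and concatenates the results. A minimal usage sketch, assuming `store` is an already-configured LocalStore instance obtained through the library's usual factory (store construction is not shown in this diff):

    # Hypothetical usage; paths and engine name are placeholders.
    df = store.read_df("data/part-0.parquet", engine="pandas")

    # A list of files (or a directory) is read file by file and the frames
    # are merged via reader.concat_dfs().
    df_all = store.read_df(
        ["data/part-0.parquet", "data/part-1.parquet"],
        file_format="parquet",
        engine="pandas",
    )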
@@ -183,73 +283,3 @@ class LocalStore(Store):
         dst = dst / src
         self._build_path(dst)
         return dst
-
-    ##############################
-    # Datastore methods
-    ##############################
-
-    def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
-        """
-        Method to write a dataframe to a file. Kwargs are passed to df.to_parquet().
-        If destination is not provided, the dataframe is written to the default
-        store path with generated name.
-
-        Parameters
-        ----------
-        df : Any
-            The dataframe to write.
-        dst : str
-            The destination of the dataframe.
-        **kwargs : dict
-            Keyword arguments.
-
-        Returns
-        -------
-        str
-            Path of written dataframe.
-        """
-        self._check_local_dst(dst)
-        reader = get_reader_by_object(df)
-        reader.write_df(df, dst, extension=extension, **kwargs)
-        return dst
-
-    ##############################
-    # Helper methods
-    ##############################
-
-    @staticmethod
-    def is_partition_or_dir(path: str) -> bool:
-        """
-        Check if path is a directory or a partition.
-
-        Parameters
-        ----------
-        path : str
-            The path to check.
-
-        Returns
-        -------
-        bool
-        """
-        return Path(path).is_dir()
-
-    @staticmethod
-    def build_object_path(root: str, paths: str | list[str]) -> list[str]:
-        """
-        Method to build object path.
-
-        Parameters
-        ----------
-        root : str
-            The root of the object path.
-        paths : str | list[str]
-            The path to build.
-
-        Returns
-        -------
-        list[str]
-            Returns the path of the object.
-        """
-        if isinstance(paths, str):
-            paths = [paths]
-        return [str(Path(root) / path) for path in paths]
@@ -97,6 +97,58 @@ class RemoteStore(Store):
     # Datastore methods
     ##############################
 
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Read DataFrame from path.
+
+        Parameters
+        ----------
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+        extension = self._head_extension(path, file_format)
+        return reader.read_df(path, extension, **kwargs)
+
+    def query(
+        self,
+        query: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
+
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        raise StoreError("Remote store does not support query.")
+
     def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
         """
         Method to write a dataframe to a file. Note that this method is not implemented
@@ -160,3 +212,32 @@ class RemoteStore(Store):
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
         return str(dst)
+
+    def _head_extension(self, url: str, file_format: str | None = None) -> str:
+        """
+        Method to get the extension of a file from a given url.
+
+        Parameters
+        ----------
+        url : str
+            The url of the file to get the extension.
+        file_format : str
+            The file format to check.
+
+        Returns
+        -------
+        str
+            File extension.
+        """
+        if file_format is not None:
+            return file_format
+        try:
+            r = requests.head(url, timeout=60)
+            r.raise_for_status()
+            content_type = r.headers["content-type"]
+            if "text" in content_type:
+                return "csv"
+            else:
+                raise ValueError("Content type not supported.")
+        except Exception as e:
+            raise e
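
The new RemoteStore._head_extension only trusts an explicit file_format; otherwise it issues an HTTP HEAD request and accepts only text content, mapping it to csv. An illustrative standalone sketch of that probe (not part of the package; the function name and URL handling are placeholders):

    import requests

    def probe_extension(url: str, file_format: str | None = None) -> str:
        # Explicit format wins; otherwise a HEAD request decides, and only
        # text/* content types are accepted (treated as CSV).
        if file_format is not None:
            return file_format
        r = requests.head(url, timeout=60)
        r.raise_for_status()
        if "text" in r.headers["content-type"]:
            return "csv"
        raise ValueError("Content type not supported.")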
@@ -51,9 +51,10 @@ class S3StoreConfigurator:
         ]:
             configurator.set_credential(*pair)
 
-    def get_s3_creds(self) -> dict:
+    def get_boto3_client_config(self) -> dict:
         """
-        Get endpoint, access key and secret key.
+        Get S3 credentials (access key, secret key,
+        session token and other config).
 
         Returns
         -------
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import typing
 from io import BytesIO
 from pathlib import Path
 from typing import Any, Type
@@ -9,13 +10,16 @@ import boto3
 import botocore.client  # pylint: disable=unused-import
 from botocore.exceptions import ClientError
 
-from digitalhub.readers.api import get_reader_by_object
+from digitalhub.readers.data.api import get_reader_by_object
 from digitalhub.stores._base.store import Store
 from digitalhub.stores.s3.configurator import S3StoreConfigurator
 from digitalhub.stores.s3.utils import get_bucket_name
 from digitalhub.utils.exceptions import StoreError
 from digitalhub.utils.file_utils import get_file_info_from_s3, get_file_mime_type
 
+if typing.TYPE_CHECKING:
+    pass
+
 # Type aliases
 S3Client = Type["botocore.client.S3"]
 
@@ -228,12 +232,121 @@ class S3Store(Store):
 
         return infos
 
+    ##############################
+    # Datastore methods
+    ##############################
+
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
+        """
+        Read DataFrame from path.
+
+        Parameters
+        ----------
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+
+        # Verify if partition or single file
+        if self.is_partition(path):
+            client, bucket = self._check_factory(path)
+            objects = self._list_objects(client, bucket, path)
+            keys = [self._get_key(o) for o in objects]
+
+        else:
+            if isinstance(path, list):
+                client, bucket = self._check_factory(path[0])
+                keys = [self._get_key(p) for p in path]
+            else:
+                client, bucket = self._check_factory(path)
+                keys = [self._get_key(path)]
+
+        dfs = []
+        for key in keys:
+            file_format = self._get_extension(file_format, key)
+            obj = self._download_fileobject(key, client, bucket)
+            dfs.append(reader.read_df(obj, extension=file_format, **kwargs))
+
+        if len(dfs) == 1:
+            return dfs[0]
+        return reader.concat_dfs(dfs)
+
+    def query(
+        self,
+        query: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
+
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        raise StoreError("S3 store does not support query.")
+
+    def write_df(
+        self,
+        df: Any,
+        dst: str,
+        extension: str | None = None,
+        **kwargs,
+    ) -> str:
+        """
+        Write a dataframe to S3 based storage. Kwargs are passed to df.to_parquet().
+
+        Parameters
+        ----------
+        df : Any
+            The dataframe.
+        dst : str
+            The destination path on S3 based storage.
+        extension : str
+            The extension of the file.
+        **kwargs : dict
+            Keyword arguments.
+
+        Returns
+        -------
+        str
+            The S3 path where the dataframe was saved.
+        """
+        fileobj = BytesIO()
+        reader = get_reader_by_object(df)
+        reader.write_df(df, fileobj, extension=extension, **kwargs)
+        return self.upload_fileobject(fileobj, dst)
+
     ##############################
     # Private I/O methods
     ##############################
 
+    @staticmethod
     def _download_file(
-        self,
         key: str,
         dst_pth: Path,
         client: S3Client,
@@ -244,8 +357,8 @@ class S3Store(Store):
 
         Parameters
         ----------
-        keys : str
-            The list of keys to be downloaded.
+        key : str
+            The key to be downloaded.
         dst_pth : str
             The destination of the files on local filesystem.
         client : S3Client
@@ -258,9 +371,34 @@ class S3Store(Store):
         list[str]
             The list of paths of the downloaded files.
         """
-        # Download file
         client.download_file(bucket, key, dst_pth)
 
+    @staticmethod
+    def _download_fileobject(
+        key: str,
+        client: S3Client,
+        bucket: str,
+    ) -> BytesIO:
+        """
+        Download fileobject from S3 partition.
+
+        Parameters
+        ----------
+        key : str
+            The key of the file.
+        client : S3Client
+            The S3 client object.
+        bucket : str
+            The name of the S3 bucket.
+
+        Returns
+        -------
+        BytesIO
+            The fileobject of the downloaded file.
+        """
+        obj = client.get_object(Bucket=bucket, Key=key)
+        return BytesIO(obj["Body"].read())
+
     def _upload_dir(
         self,
         src: str,
@@ -437,41 +575,6 @@ class S3Store(Store):
         """
         client.put_object(Bucket=bucket, Key=key, Body=fileobj.getvalue())
 
-    ##############################
-    # Datastore methods
-    ##############################
-
-    def write_df(
-        self,
-        df: Any,
-        dst: str,
-        extension: str | None = None,
-        **kwargs,
-    ) -> str:
-        """
-        Write a dataframe to S3 based storage. Kwargs are passed to df.to_parquet().
-
-        Parameters
-        ----------
-        df : Any
-            The dataframe.
-        dst : str
-            The destination path on S3 based storage.
-        extension : str
-            The extension of the file.
-        **kwargs : dict
-            Keyword arguments.
-
-        Returns
-        -------
-        str
-            The S3 path where the dataframe was saved.
-        """
-        fileobj = BytesIO()
-        reader = get_reader_by_object(df)
-        reader.write_df(df, fileobj, extension=extension, **kwargs)
-        return self.upload_fileobject(fileobj, dst)
-
     ##############################
     # Helper methods
     ##############################
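
The relocated S3Store.write_df keeps the same in-memory strategy: the dataframe is serialized into a BytesIO and pushed with put_object, so no temporary file is written. A rough standalone sketch of that path with boto3 and pandas standing in for the pluggable reader (bucket and key are placeholders, not part of the package):

    import boto3
    import pandas as pd
    from io import BytesIO

    df = pd.DataFrame({"a": [1, 2, 3]})
    buffer = BytesIO()
    df.to_parquet(buffer)  # serialize in memory; requires pyarrow or fastparquet

    client = boto3.client("s3")
    client.put_object(Bucket="my-bucket", Key="data/out.parquet", Body=buffer.getvalue())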
@@ -496,7 +599,7 @@ class S3Store(Store):
         S3Client
             Returns a client object that interacts with the S3 storage service.
         """
-        cfg = self._configurator.get_s3_creds()
+        cfg = self._configurator.get_boto3_client_config()
         return boto3.client("s3", **cfg)
 
     def _check_factory(self, root: str) -> tuple[S3Client, str]:
@@ -10,7 +10,7 @@ from sqlalchemy import MetaData, Table, create_engine, select
 from sqlalchemy.engine import Engine
 from sqlalchemy.exc import SQLAlchemyError
 
-from digitalhub.readers.api import get_reader_by_object
+from digitalhub.readers.data.api import get_reader_by_object
 from digitalhub.stores._base.store import Store
 from digitalhub.stores.sql.configurator import SqlStoreConfigurator
 from digitalhub.utils.exceptions import StoreError
@@ -119,50 +119,72 @@ class SqlStore(Store):
         return []
 
     ##############################
-    # Private I/O methods
+    # Datastore methods
    ##############################
 
-    def _download_table(self, schema: str, table: str, dst: str) -> str:
+    def read_df(
+        self,
+        path: str | list[str],
+        file_format: str | None = None,
+        engine: str | None = None,
+        **kwargs,
+    ) -> Any:
         """
-        Download a table from SQL based storage.
+        Read DataFrame from path.
 
         Parameters
         ----------
-        schema : str
-            The origin schema.
-        table : str
-            The origin table.
-        dst : str
-            The destination path.
+        path : str | list[str]
+            Path(s) to read DataFrame from.
+        file_format : str
+            Extension of the file.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
+        **kwargs : dict
+            Keyword arguments.
 
         Returns
         -------
-        str
-            The destination path.
+        Any
+            DataFrame.
         """
-        engine = self._check_factory(schema=schema)
+        reader = self._get_reader(engine)
+        schema = self._get_schema(path)
+        table = self._get_table_name(path)
+        sql_engine = self._check_factory(schema=schema)
 
-        # Read the table from the database
-        sa_table = Table(table, MetaData(), autoload_with=engine)
+        sa_table = Table(table, MetaData(), autoload_with=sql_engine)
         stm = select(sa_table)
-        with engine.begin() as conn:
-            result: list[Row] = conn.execute(stm).fetchall()
 
-        # Parse the result
-        data = {col: [row[idx] for row in result] for idx, col in enumerate(sa_table.columns.keys())}
-
-        # Convert the result to a pyarrow table and
-        # write the pyarrow table to a Parquet file
-        arrow_table = pa.Table.from_pydict(data)
-        pq.write_table(arrow_table, dst)
+        return reader.read_table(stm, sql_engine, **kwargs)
 
-        engine.dispose()
+    def query(
+        self,
+        query: str,
+        path: str,
+        engine: str | None = None,
+    ) -> Any:
+        """
+        Query data from database.
 
-        return dst
+        Parameters
+        ----------
+        query : str
+            The query to execute.
+        path : str
+            Path to the database.
+        engine : str
+            Dataframe engine (pandas, polars, etc.).
 
-    ##############################
-    # Datastore methods
-    ##############################
+        Returns
+        -------
+        Any
+            DataFrame.
+        """
+        reader = self._get_reader(engine)
+        schema = self._get_schema(path)
+        sql_engine = self._check_factory(schema=schema)
+        return reader.read_table(query, sql_engine)
 
     def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
         """
@@ -187,9 +209,47 @@ class SqlStore(Store):
         return self._upload_table(df, schema, table, **kwargs)
 
     ##############################
-    # Private Datastore methods
+    # Private I/O methods
     ##############################
 
+    def _download_table(self, schema: str, table: str, dst: str) -> str:
+        """
+        Download a table from SQL based storage.
+
+        Parameters
+        ----------
+        schema : str
+            The origin schema.
+        table : str
+            The origin table.
+        dst : str
+            The destination path.
+
+        Returns
+        -------
+        str
+            The destination path.
+        """
+        engine = self._check_factory(schema=schema)
+
+        # Read the table from the database
+        sa_table = Table(table, MetaData(), autoload_with=engine)
+        stm = select(sa_table)
+        with engine.begin() as conn:
+            result: list[Row] = conn.execute(stm).fetchall()
+
+        # Parse the result
+        data = {col: [row[idx] for row in result] for idx, col in enumerate(sa_table.columns.keys())}
+
+        # Convert the result to a pyarrow table and
+        # write the pyarrow table to a Parquet file
+        arrow_table = pa.Table.from_pydict(data)
+        pq.write_table(arrow_table, dst)
+
+        engine.dispose()
+
+        return dst
+
     def _upload_table(self, df: Any, schema: str, table: str, **kwargs) -> str:
         """
         Upload a table to SQL based storage.
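
With this reorganization, SqlStore exposes two read paths: read_df reflects a whole table (the path is parsed into schema and table, a SELECT over the reflected table is built, and the statement is handed to reader.read_table), while query forwards an arbitrary SQL string to the same entry point. A rough sketch of the SQLAlchemy reflection step it relies on (connection URL, schema, and table name are placeholders; pandas stands in for the pluggable reader):

    import pandas as pd
    from sqlalchemy import MetaData, Table, create_engine, select

    engine = create_engine("postgresql+psycopg2://user:pass@host:5432/db")
    sa_table = Table("my_table", MetaData(), autoload_with=engine, schema="public")
    stm = select(sa_table)
    # The store delegates to reader.read_table(stm, engine); with pandas that
    # is roughly equivalent to:
    df = pd.read_sql(stm, engine)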