digitalhub 0.10.0b1__py3-none-any.whl → 0.10.0b2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of digitalhub might be problematic. Click here for more details.

@@ -1,13 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import typing
4
- from pathlib import Path
5
4
 
6
5
  from digitalhub.entities._base.material.entity import MaterialEntity
7
6
  from digitalhub.entities._commons.enums import EntityTypes
8
- from digitalhub.entities.dataitem.utils import DEFAULT_EXTENSION
9
- from digitalhub.utils.exceptions import EntityError
10
- from digitalhub.utils.uri_utils import has_sql_scheme
11
7
 
12
8
  if typing.TYPE_CHECKING:
13
9
  from digitalhub.entities._base.entity.metadata import Metadata
@@ -36,40 +32,3 @@ class Dataitem(MaterialEntity):
36
32
  super().__init__(project, name, uuid, kind, metadata, spec, status, user)
37
33
  self.spec: DataitemSpec
38
34
  self.status: DataitemStatus
39
-
40
- ##############################
41
- # Helper methods
42
- ##############################
43
-
44
- @staticmethod
45
- def _get_extension(path: str, file_format: str | None = None) -> str:
46
- """
47
- Get extension of path.
48
-
49
- Parameters
50
- ----------
51
- path : str
52
- Path to get extension from.
53
- file_format : str
54
- File format.
55
-
56
- Returns
57
- -------
58
- str
59
- File extension.
60
-
61
- Raises
62
- ------
63
- EntityError
64
- If file format is not supported.
65
- """
66
- if file_format is not None:
67
- return file_format
68
-
69
- if has_sql_scheme(path):
70
- return DEFAULT_EXTENSION
71
-
72
- ext = Path(path).suffix[1:]
73
- if ext is not None:
74
- return ext
75
- raise EntityError("Unknown file format. Only csv and parquet are supported.")
@@ -7,7 +7,7 @@ from typing import Any
7
7
 
8
8
  from digitalhub.entities.dataitem._base.entity import Dataitem
9
9
  from digitalhub.stores.api import get_store
10
- from digitalhub.utils.uri_utils import has_local_scheme
10
+ from digitalhub.utils.uri_utils import has_sql_scheme
11
11
 
12
12
  if typing.TYPE_CHECKING:
13
13
  from digitalhub.entities._base.entity.metadata import Metadata
@@ -36,19 +36,39 @@ class DataitemTable(Dataitem):
36
36
  self.spec: DataitemSpecTable
37
37
  self.status: DataitemStatusTable
38
38
 
39
+ self._query: str | None = None
40
+
41
+ def query(self, query: str) -> DataitemTable:
42
+ """
43
+ Set query to execute.
44
+
45
+ Parameters
46
+ ----------
47
+ query : str
48
+ Query to execute.
49
+
50
+ Returns
51
+ -------
52
+ DataitemTable
53
+ Self object.
54
+ """
55
+ # to remove in future
56
+ if not has_sql_scheme(self.spec.path):
57
+ raise ValueError(
58
+ f"Dataitem path is not a SQL scheme: {self.spec.path}",
59
+ " Query can be made only on a SQL scheme.",
60
+ )
61
+ self._query = query
62
+ return self
63
+
39
64
  def as_df(
40
65
  self,
41
66
  file_format: str | None = None,
42
- engine: str | None = None,
43
- clean_tmp_path: bool = True,
67
+ engine: str | None = "pandas",
44
68
  **kwargs,
45
69
  ) -> Any:
46
70
  """
47
71
  Read dataitem file (csv or parquet) as a DataFrame from spec.path.
48
- If the dataitem is not local, it will be downloaded to a temporary
49
- folder named tmp_dir in the project context folder.
50
- If clean_tmp_path is True, the temporary folder will be deleted after the
51
- method is executed.
52
72
  It's possible to pass additional arguments to this function. These
53
73
  keyword arguments will be passed to the DataFrame reader function such as
54
74
  pandas's read_csv or read_parquet.
@@ -56,11 +76,10 @@ class DataitemTable(Dataitem):
56
76
  Parameters
57
77
  ----------
58
78
  file_format : str
59
- Format of the file. (Supported csv and parquet).
79
+ Format of the file to read. By default, it will be inferred from
80
+ the extension of the file.
60
81
  engine : str
61
82
  Dataframe framework, by default pandas.
62
- clean_tmp_path : bool
63
- If True, the temporary folder will be deleted.
64
83
  **kwargs : dict
65
84
  Keyword arguments passed to the read_df function.
66
85
 
@@ -69,30 +88,20 @@ class DataitemTable(Dataitem):
69
88
  Any
70
89
  DataFrame.
71
90
  """
72
- try:
73
- if has_local_scheme(self.spec.path):
74
- tmp_dir = None
75
- data_path = self.spec.path
76
- else:
77
- tmp_dir = self._context().root / "tmp_data"
78
- tmp_dir.mkdir(parents=True, exist_ok=True)
79
- data_path = self.download(destination=str(tmp_dir), overwrite=True)
80
-
81
- if Path(data_path).is_dir():
82
- files = [str(i) for i in Path(data_path).rglob("*") if i.is_file()]
83
- checker = files[0]
84
- else:
85
- checker = data_path
86
-
87
- extension = self._get_extension(checker, file_format)
88
- return get_store(self.project, "").read_df(data_path, extension, engine, **kwargs)
89
-
90
- except Exception as e:
91
- raise e
92
-
93
- finally:
94
- # Delete tmp folder
95
- self._clean_tmp_path(tmp_dir, clean_tmp_path)
91
+ if self._query is not None:
92
+ df = get_store(self.project, self.spec.path).query(
93
+ self._query,
94
+ self.spec.path,
95
+ engine,
96
+ )
97
+ self._query = None
98
+ return df
99
+ return get_store(self.project, self.spec.path).read_df(
100
+ self.spec.path,
101
+ file_format,
102
+ engine,
103
+ **kwargs,
104
+ )
96
105
 
97
106
  def write_df(
98
107
  self,
@@ -119,7 +128,12 @@ class DataitemTable(Dataitem):
119
128
  str
120
129
  Path to the written dataframe.
121
130
  """
122
- return get_store(self.project, self.spec.path).write_df(df, self.spec.path, extension=extension, **kwargs)
131
+ return get_store(self.project, self.spec.path).write_df(
132
+ df,
133
+ self.spec.path,
134
+ extension=extension,
135
+ **kwargs,
136
+ )
123
137
 
124
138
  @staticmethod
125
139
  def _clean_tmp_path(pth: Path | None, clean: bool) -> None:
@@ -8,7 +8,7 @@ from digitalhub.context.api import get_context
8
8
  from digitalhub.entities._base.entity._constructors.uuid import build_uuid
9
9
  from digitalhub.entities._base.material.utils import build_log_path_from_source, eval_local_source
10
10
  from digitalhub.entities._commons.enums import EntityKinds, EntityTypes
11
- from digitalhub.readers.api import get_reader_by_object
11
+ from digitalhub.readers.data.api import get_reader_by_object
12
12
  from digitalhub.utils.generic_utils import slugify_string
13
13
 
14
14
  if typing.TYPE_CHECKING:
@@ -6,7 +6,7 @@ from abc import abstractmethod
6
6
  from digitalhub.utils.exceptions import BuilderError
7
7
 
8
8
  if typing.TYPE_CHECKING:
9
- from digitalhub.readers._base.reader import DataframeReader
9
+ from digitalhub.readers.data._base.reader import DataframeReader
10
10
 
11
11
 
12
12
  class ReaderBuilder:
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from abc import abstractmethod
4
- from typing import Any
4
+ from typing import IO, Any
5
5
 
6
6
 
7
7
  class DataframeReader:
@@ -13,11 +13,16 @@ class DataframeReader:
13
13
  # Read methods
14
14
  ##############################
15
15
 
16
- @staticmethod
17
16
  @abstractmethod
18
- def read_df(path: str | list[str], extension: str, **kwargs) -> Any:
17
+ def read_df(self, path_or_buffer: str | IO, extension: str, **kwargs) -> Any:
18
+ """
19
+ Read DataFrame from path or buffer.
20
+ """
21
+
22
+ @abstractmethod
23
+ def read_table(self, *args, **kwargs) -> Any:
19
24
  """
20
- Read DataFrame from path.
25
+ Read table from db.
21
26
  """
22
27
 
23
28
  ##############################
@@ -68,3 +73,10 @@ class DataframeReader:
68
73
  """
69
74
  Get preview.
70
75
  """
76
+
77
+ @staticmethod
78
+ @abstractmethod
79
+ def concat_dfs(dfs: list[Any]) -> Any:
80
+ """
81
+ Concatenate a list of DataFrames.
82
+ """
@@ -3,11 +3,11 @@ from __future__ import annotations
3
3
  import typing
4
4
  from typing import Any
5
5
 
6
- from digitalhub.readers.factory import factory
6
+ from digitalhub.readers.data.factory import factory
7
7
  from digitalhub.utils.exceptions import ReaderError
8
8
 
9
9
  if typing.TYPE_CHECKING:
10
- from digitalhub.readers._base.reader import DataframeReader
10
+ from digitalhub.readers.data._base.reader import DataframeReader
11
11
 
12
12
 
13
13
  def get_reader_by_engine(engine: str | None = None) -> DataframeReader:
@@ -5,8 +5,8 @@ import typing
5
5
  from digitalhub.utils.exceptions import BuilderError
6
6
 
7
7
  if typing.TYPE_CHECKING:
8
- from digitalhub.readers._base.builder import ReaderBuilder
9
- from digitalhub.readers._base.reader import DataframeReader
8
+ from digitalhub.readers.data._base.builder import ReaderBuilder
9
+ from digitalhub.readers.data._base.reader import DataframeReader
10
10
 
11
11
 
12
12
  class ReaderFactory:
@@ -126,7 +126,7 @@ class ReaderFactory:
126
126
  factory = ReaderFactory()
127
127
 
128
128
  try:
129
- from digitalhub.readers.pandas.builder import ReaderBuilderPandas
129
+ from digitalhub.readers.data.pandas.builder import ReaderBuilderPandas
130
130
 
131
131
  factory.add_builder(
132
132
  ReaderBuilderPandas.ENGINE,
File without changes
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from digitalhub.readers._base.builder import ReaderBuilder
4
- from digitalhub.readers.pandas.reader import DataframeReaderPandas
3
+ from digitalhub.readers.data._base.builder import ReaderBuilder
4
+ from digitalhub.readers.data.pandas.reader import DataframeReaderPandas
5
5
 
6
6
 
7
7
  class ReaderBuilderPandas(ReaderBuilder):
@@ -2,15 +2,15 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  from io import BytesIO
5
- from typing import Any
5
+ from typing import IO, Any
6
6
 
7
7
  import numpy as np
8
8
  import pandas as pd
9
9
  from pandas.errors import ParserError
10
10
 
11
11
  from digitalhub.entities.dataitem.table.utils import check_preview_size, finalize_preview, prepare_data, prepare_preview
12
- from digitalhub.readers._base.reader import DataframeReader
13
- from digitalhub.readers.pandas.enums import Extensions
12
+ from digitalhub.readers.data._base.reader import DataframeReader
13
+ from digitalhub.readers.data.pandas.enums import Extensions
14
14
  from digitalhub.utils.exceptions import ReaderError
15
15
  from digitalhub.utils.generic_utils import CustomJsonEncoder
16
16
 
@@ -24,14 +24,14 @@ class DataframeReaderPandas(DataframeReader):
24
24
  # Read methods
25
25
  ##############################
26
26
 
27
- def read_df(self, path: str | list[str], extension: str, **kwargs) -> pd.DataFrame:
27
+ def read_df(self, path_or_buffer: str | IO, extension: str, **kwargs) -> pd.DataFrame:
28
28
  """
29
- Read DataFrame from path.
29
+ Read DataFrame from path or buffer.
30
30
 
31
31
  Parameters
32
32
  ----------
33
- path : str | list[str]
34
- Path(s) to read DataFrame from.
33
+ path_or_buffer : str | IO
34
+ Path or buffer to read DataFrame from.
35
35
  extension : str
36
36
  Extension of the file.
37
37
  **kwargs : dict
@@ -43,25 +43,40 @@ class DataframeReaderPandas(DataframeReader):
43
43
  Pandas DataFrame.
44
44
  """
45
45
  if extension == Extensions.CSV.value:
46
- method = pd.read_csv
47
- elif extension == Extensions.PARQUET.value:
48
- method = pd.read_parquet
49
- elif extension == Extensions.JSON.value:
50
- method = pd.read_json
51
- elif extension in (Extensions.EXCEL.value, Extensions.EXCEL_OLD.value):
52
- method = pd.read_excel
53
- elif extension in (Extensions.TXT.value, Extensions.FILE.value):
46
+ return pd.read_csv(path_or_buffer, **kwargs)
47
+ if extension == Extensions.PARQUET.value:
48
+ return pd.read_parquet(path_or_buffer, **kwargs)
49
+ if extension == Extensions.JSON.value:
50
+ return pd.read_json(path_or_buffer, **kwargs)
51
+ if extension in (Extensions.EXCEL.value, Extensions.EXCEL_OLD.value):
52
+ return pd.read_excel(path_or_buffer, **kwargs)
53
+ if extension in (Extensions.TXT.value, Extensions.FILE.value):
54
54
  try:
55
- return self.read_df(path, Extensions.CSV.value, **kwargs)
55
+ return self.read_df(path_or_buffer, Extensions.CSV.value, **kwargs)
56
56
  except ParserError:
57
- raise ReaderError(f"Unable to read from {path}.")
57
+ raise ReaderError(f"Unable to read from {path_or_buffer}.")
58
58
  else:
59
59
  raise ReaderError(f"Unsupported extension '{extension}' for reading.")
60
60
 
61
- if isinstance(path, list):
62
- dfs = [method(p, **kwargs) for p in path]
63
- return pd.concat(dfs)
64
- return method(path, **kwargs)
61
+ def read_table(self, sql: str, engine: Any, **kwargs) -> pd.DataFrame:
62
+ """
63
+ Read table from db.
64
+
65
+ Parameters
66
+ ----------
67
+ sql : str
68
+ SQL query.
69
+ engine : Any
70
+ SQL Engine.
71
+ **kwargs
72
+ Keyword arguments.
73
+
74
+ Returns
75
+ -------
76
+ pd.DataFrame
77
+ Pandas DataFrame.
78
+ """
79
+ return pd.read_sql(sql=sql, con=engine, **kwargs)
65
80
 
66
81
  ##############################
67
82
  # Write methods
@@ -92,7 +107,7 @@ class DataframeReaderPandas(DataframeReader):
92
107
  """
93
108
  if extension == Extensions.CSV.value:
94
109
  return self.write_csv(df, dst, **kwargs)
95
- elif extension == Extensions.PARQUET.value:
110
+ if extension == Extensions.PARQUET.value:
96
111
  return self.write_parquet(df, dst, **kwargs)
97
112
  raise ReaderError(f"Unsupported extension '{extension}' for writing.")
98
113
 
@@ -137,7 +152,7 @@ class DataframeReaderPandas(DataframeReader):
137
152
  df.to_parquet(dst, index=False, **kwargs)
138
153
 
139
154
  @staticmethod
140
- def write_table(df: pd.DataFrame, table: str, engine: Any, schema: str, **kwargs) -> None:
155
+ def write_table(df: pd.DataFrame, table: str, engine: Any, schema: str | None = None, **kwargs) -> None:
141
156
  """
142
157
  Write DataFrame as table.
143
158
 
@@ -148,7 +163,7 @@ class DataframeReaderPandas(DataframeReader):
148
163
  table : str
149
164
  The destination table.
150
165
  engine : Any
151
- The SQLAlchemy engine.
166
+ SQL Engine.
152
167
  schema : str
153
168
  The destination schema.
154
169
  **kwargs : dict
@@ -164,6 +179,23 @@ class DataframeReaderPandas(DataframeReader):
164
179
  # Utils
165
180
  ##############################
166
181
 
182
+ @staticmethod
183
+ def concat_dfs(dfs: list[pd.DataFrame]) -> pd.DataFrame:
184
+ """
185
+ Concatenate a list of DataFrames.
186
+
187
+ Parameters
188
+ ----------
189
+ dfs : list[pd.DataFrame]
190
+ The DataFrames to concatenate.
191
+
192
+ Returns
193
+ -------
194
+ pd.DataFrame
195
+ The concatenated DataFrame.
196
+ """
197
+ return pd.concat(dfs, ignore_index=True)
198
+
167
199
  @staticmethod
168
200
  def get_schema(df: pd.DataFrame) -> Any:
169
201
  """
File without changes
@@ -1,14 +1,18 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import typing
3
4
  from abc import abstractmethod
4
5
  from pathlib import Path
5
6
  from tempfile import mkdtemp
6
7
  from typing import Any
7
8
 
8
- from digitalhub.readers.api import get_reader_by_engine
9
+ from digitalhub.readers.data.api import get_reader_by_engine
9
10
  from digitalhub.utils.exceptions import StoreError
10
11
  from digitalhub.utils.uri_utils import has_local_scheme
11
12
 
13
+ if typing.TYPE_CHECKING:
14
+ from digitalhub.readers.data._base.reader import DataframeReader
15
+
12
16
 
13
17
  class Store:
14
18
  """
@@ -52,11 +56,6 @@ class Store:
52
56
  ##############################
53
57
 
54
58
  @abstractmethod
55
- def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
56
- """
57
- Write DataFrame as parquet or csv.
58
- """
59
-
60
59
  def read_df(
61
60
  self,
62
61
  path: str | list[str],
@@ -66,25 +65,23 @@ class Store:
66
65
  ) -> Any:
67
66
  """
68
67
  Read DataFrame from path.
68
+ """
69
69
 
70
- Parameters
71
- ----------
72
- path : str | list[str]
73
- Path(s) to read DataFrame from.
74
- extension : str
75
- Extension of the file.
76
- engine : str
77
- Dataframe engine (pandas, polars, etc.).
78
- **kwargs : dict
79
- Keyword arguments.
70
+ @abstractmethod
71
+ def query(
72
+ self,
73
+ query: str,
74
+ engine: str | None = None,
75
+ ) -> Any:
76
+ """
77
+ Query data from database.
78
+ """
80
79
 
81
- Returns
82
- -------
83
- Any
84
- DataFrame.
80
+ @abstractmethod
81
+ def write_df(self, df: Any, dst: str, extension: str | None = None, **kwargs) -> str:
82
+ """
83
+ Write DataFrame as parquet or csv.
85
84
  """
86
- reader = get_reader_by_engine(engine)
87
- return reader.read_df(path, extension, **kwargs)
88
85
 
89
86
  ##############################
90
87
  # Helpers methods
@@ -187,3 +184,43 @@ class Store:
187
184
  """
188
185
  tmpdir = mkdtemp()
189
186
  return Path(tmpdir)
187
+
188
+ @staticmethod
189
+ def _get_reader(engine: str | None = None) -> DataframeReader:
190
+ """
191
+ Get Dataframe reader.
192
+
193
+ Parameters
194
+ ----------
195
+ engine : str
196
+ Dataframe engine (pandas, polars, etc.).
197
+
198
+ Returns
199
+ -------
200
+ Any
201
+ Reader object.
202
+ """
203
+ return get_reader_by_engine(engine)
204
+
205
+ @staticmethod
206
+ def _get_extension(extension: str | None = None, path: str | None = None) -> str:
207
+ """
208
+ Get extension from path.
209
+
210
+ Parameters
211
+ ----------
212
+ extension : str
213
+ The extension to get.
214
+ path : str
215
+ The path to get the extension from.
216
+
217
+ Returns
218
+ -------
219
+ str
220
+ The extension.
221
+ """
222
+ if extension is not None:
223
+ return extension
224
+ if path is not None:
225
+ return Path(path).suffix.removeprefix(".")
226
+ raise ValueError("Extension or path must be provided.")