sibi-dst 0.3.18__tar.gz → 0.3.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/PKG-INFO +4 -4
  2. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/README.md +3 -3
  3. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/pyproject.toml +1 -1
  4. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/__init__.py +2 -0
  5. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/_df_helper.py +3 -0
  6. sibi_dst-0.3.19/sibi_dst/df_helper/_parquet_reader.py +49 -0
  7. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -1
  8. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +13 -18
  9. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_data_wrapper.py +16 -38
  10. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/__init__.py +0 -0
  11. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  12. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/__init__.py +0 -0
  13. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  14. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/django/_django_db_connection.py +0 -0
  15. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +0 -0
  16. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +0 -0
  17. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  18. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/django/_io_dask_alt.py +0 -0
  19. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  20. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  21. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  22. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +0 -0
  23. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +0 -0
  24. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +0 -0
  25. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
  26. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  27. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +0 -0
  28. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_model/__init__.py +0 -0
  29. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +0 -0
  30. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +0 -0
  31. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/core/__init__.py +0 -0
  32. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/core/_defaults.py +0 -0
  33. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  34. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/core/_params_config.py +0 -0
  35. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/core/_query_config.py +0 -0
  36. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/__init__.py +0 -0
  37. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_airflow_manager.py +0 -0
  38. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_clickhouse_writer.py +0 -0
  39. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_credentials.py +0 -0
  40. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_data_utils.py +0 -0
  41. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_date_utils.py +0 -0
  42. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_df_utils.py +0 -0
  43. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_file_utils.py +0 -0
  44. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_filepath_generator.py +0 -0
  45. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_log_utils.py +0 -0
  46. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_parquet_saver.py +0 -0
  47. {sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_storage_manager.py +0 -0
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.18
+Version: 0.3.19
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -40,13 +40,13 @@ Description-Content-Type: text/markdown
 
 Data Science Toolkit
 ---------------------
-Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, DjangoRestFrameWork
+Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFrameWork
 
 Major Functionality
 --------------------
 1) Build DataCubes, DataSets and DataObjects from different datasources. These include relational databases, parquet files, xlsx, delimited tables, json, json API REST.
-2) Rich set of common dataframe management utilities.
-3) Share Data with client applications by write to Datawarehouses in local filesystems as well as other supported platforms.
+2) Common dataframe management utilities.
+3) Share Data with client applications by write to Data Warehouses in local filesystems as well as other supported platforms.
 4) Build microservices to communicate/share data via API-REST, gRPC.
 
 
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/README.md
@@ -2,12 +2,12 @@
 
 Data Science Toolkit
 ---------------------
-Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, DjangoRestFrameWork
+Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFrameWork
 
 Major Functionality
 --------------------
 1) Build DataCubes, DataSets and DataObjects from different datasources. These include relational databases, parquet files, xlsx, delimited tables, json, json API REST.
-2) Rich set of common dataframe management utilities.
-3) Share Data with client applications by write to Datawarehouses in local filesystems as well as other supported platforms.
+2) Common dataframe management utilities.
+3) Share Data with client applications by write to Data Warehouses in local filesystems as well as other supported platforms.
 4) Build microservices to communicate/share data via API-REST, gRPC.
 
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.18"
+version = "0.3.19"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/__init__.py
@@ -2,8 +2,10 @@ from __future__ import annotations
 
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
+from ._parquet_reader import ParquetReader
 
 __all__=[
     'DfHelper',
     'ParquetArtifact',
+    'ParquetReader',
 ]
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/_df_helper.py
@@ -56,6 +56,9 @@ class DfHelper:
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
+    def __str__(self):
+        return self.__class__.__name__
+
     def post_init(self, **kwargs):
         self.logger.debug(f"backend used: {self.backend}")
         self.backend_query = self.__get_config(QueryConfig, kwargs)
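A small sketch of what the new __str__ buys; OrdersHelper is a hypothetical subclass used only for illustration:

# Hypothetical subclass; any concrete DfHelper works the same way.
class OrdersHelper(DfHelper):
    pass

# Given any constructed instance `helper`, str(helper) / print(helper)
# now yields "OrdersHelper" rather than the default object repr, which
# makes log lines and progress labels easier to attribute.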
sibi_dst-0.3.19/sibi_dst/df_helper/_parquet_reader.py (new file)
@@ -0,0 +1,49 @@
+from typing import Optional
+
+import dask.dataframe as dd
+import fsspec
+
+from sibi_dst.df_helper import DfHelper
+
+class ParquetReader(DfHelper):
+    DEFAULT_CONFIG = {
+        'backend': 'parquet'
+    }
+
+    def __init__(self, filesystem_type="file", filesystem_options=None, **kwargs):
+        self.config = {
+            **self.DEFAULT_CONFIG,
+            **kwargs,
+        }
+        self.df: Optional[dd.DataFrame] = None
+        self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
+        if self.parquet_storage_path is None:
+            raise ValueError('parquet_storage_path must be set')
+        self.parquet_start_date = self.config.setdefault('parquet_start_date', None)
+        if self.parquet_start_date is None:
+            raise ValueError('parquet_start_date must be set')
+
+        self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
+        if self.parquet_end_date is None:
+            raise ValueError('parquet_end_date must be set')
+
+        # Filesystem setup
+        self.filesystem_type = filesystem_type
+        self.filesystem_options = filesystem_options or {}
+        self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+
+        if not self.directory_exists():
+            raise ValueError(f"{self.parquet_storage_path} does not exist")
+
+        super().__init__(**self.config)
+
+    def load(self, **kwargs):
+        self.df = super().load(**kwargs)
+        return self.df
+
+    def directory_exists(self):
+        try:
+            info = self.fs.info(self.parquet_storage_path)
+            return info['type'] == 'directory'
+        except FileNotFoundError:
+            return False
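For orientation, a minimal usage sketch of the new ParquetReader; the storage path and date values are hypothetical, while the keyword names mirror the validation in __init__ above:

from sibi_dst.df_helper import ParquetReader

# Hypothetical path and date range; all three parquet_* keywords are
# required, since __init__ raises ValueError when any is missing.
reader = ParquetReader(
    parquet_storage_path="/data/warehouse/orders",  # must be an existing directory
    parquet_start_date="2024-01-01",
    parquet_end_date="2024-01-31",
    filesystem_type="file",  # any fsspec protocol; credentials go in filesystem_options
)
df = reader.load()  # delegates to DfHelper.load and caches the result on reader.df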
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -52,7 +52,7 @@ class ParquetConfig(BaseModel):
             raise ValueError('Parquet end date must be greater than start date')
 
         # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-        self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path)).generate_file_paths(start_date, end_date)
+        self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), logger=self.logger).generate_file_paths(start_date, end_date)
         self.parquet_size_bytes = self.get_parquet_size_bytes()
         self.load_parquet = True
         #self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py
@@ -9,6 +9,7 @@ Base = declarative_base()
 apps_label = "datacubes"
 
 class SqlAlchemyModelBuilder:
+    _model_cache = {}  # Local cache for model classes
     def __init__(self, engine, table_name):
         """
         Initialize the model builder with a database engine and specific table.
@@ -21,28 +22,21 @@ class SqlAlchemyModelBuilder:
         self.table_name = table_name
         self.metadata = MetaData()
         self.table = None  # Placeholder for the specific table
+        self.class_name = self.normalize_class_name(self.table_name)
 
     def build_model(self) -> type:
-        """
-        Build a SQLAlchemy ORM model for the specified table.
-
-        Returns:
-            type: Dynamically generated SQLAlchemy ORM model class.
-        """
-        # Check if the class is already registered
-        class_name = self.normalize_class_name(self.table_name)
-        mapper_registry = Base.registry
-        if class_name in mapper_registry._class_registry:
-            return mapper_registry._class_registry[class_name]
+        # Check if the model is already registered
+        model = Base.registry._class_registry.get(self.class_name)
+        if model:
+            return model
 
-        # Reflect only the specified table
         self.metadata.reflect(only=[self.table_name], bind=self.engine)
         self.table = self.metadata.tables.get(self.table_name)
-
         if self.table is None:
             raise ValueError(f"Table '{self.table_name}' does not exist in the database.")
 
-        return self.create_model()
+        model = self.create_model()
+        return model
 
     def create_model(self) -> type:
         """
@@ -52,7 +46,6 @@ class SqlAlchemyModelBuilder:
            type: Dynamically generated SQLAlchemy ORM model class.
         """
         # Normalize the class name from the table name
-        class_name = self.normalize_class_name(self.table_name)
         columns = self.get_columns(self.table)
 
         # Define attributes for the model class
@@ -66,9 +59,11 @@
         # Add columns and relationships to the model
         attrs.update(columns)
         #self.add_relationships(attrs, self.table)
-
-        # Dynamically create the model class
-        model = type(class_name, (Base,), attrs)
+        model = Base.registry._class_registry.get(self.class_name)
+        if not model:
+            model = type(self.class_name, (Base,), attrs)
+            # Add the class to Base.registry so it is registered
+            Base.registry._class_registry[self.class_name] = model
         return model
 
     def get_columns(self, table: Table):
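The practical effect of routing lookups through Base.registry._class_registry is that building a model for the same table twice returns the same mapped class instead of tripping SQLAlchemy's duplicate-registration error. A minimal sketch, assuming a reachable database and a hypothetical "orders" table:

from sqlalchemy import create_engine

engine = create_engine("sqlite:///example.db")  # hypothetical connection string

# Two independent builders for the same table...
model_a = SqlAlchemyModelBuilder(engine, "orders").build_model()
model_b = SqlAlchemyModelBuilder(engine, "orders").build_model()

# ...yield the identical class: the second call is served from the
# registry before any reflection happens.
assert model_a is model_b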
{sibi_dst-0.3.18 → sibi_dst-0.3.19}/sibi_dst/utils/_data_wrapper.py
@@ -1,15 +1,12 @@
 import datetime
 from typing import Type, Any, Dict, Optional
-
 import fsspec
 import pandas as pd
 from IPython.display import display
-from tqdm import tqdm
-
 from sibi_dst.utils import Logger
+from tqdm import tqdm
 from sibi_dst.utils import ParquetSaver
 
-
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -46,7 +43,7 @@ class DataWrapper:
         self.reverse_order = reverse_order
         self.overwrite = overwrite
         self.ignore_missing = ignore_missing
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
         self.max_age_minutes = max_age_minutes
         self.history_days_threshold = history_days_threshold
         self.show_progress = show_progress
@@ -96,7 +93,7 @@
 
         date_iterator = dates_to_process
         if self.show_progress:
-            date_iterator = tqdm(date_iterator, desc=description, unit="date")
+            date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
 
         for current_date in date_iterator:
             self.process_date(current_date)
@@ -113,16 +110,14 @@
             )
             current_time = datetime.datetime.now(datetime.timezone.utc)
             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
-
-            if self.verbose:
-                self.logger.debug(
-                    f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
-                    f"(threshold: {self.max_age_minutes} minutes)"
-                )
+            self.logger.info(
+                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                f"(threshold: {self.max_age_minutes} minutes)"
+            )
 
             return file_age_minutes > self.max_age_minutes
         except FileNotFoundError:
-            return True  # Treat missing files as old
+            return True
 
     def process_date(self, date: datetime.date):
         """Process a specific date by regenerating data as necessary."""
@@ -130,16 +125,13 @@
         full_parquet_filename = f"{folder}{self.parquet_filename}"
 
         start_time = datetime.datetime.now()
-
-        if self.verbose:
-            self.logger.debug(f"Processing {full_parquet_filename}...")
+        self.logger.info(f"Processing {full_parquet_filename}...")
 
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
         if len(df.index)==0:
-            if self.verbose:
-                self.logger.debug("No data found for the specified date.")
+            self.logger.error("No data found for the specified date.")
             return
 
         parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -147,11 +139,9 @@
 
         end_time = datetime.datetime.now()
         duration_seconds = (end_time - start_time).total_seconds()
-
-        if self.verbose:
-            self.logger.debug(
-                f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
-            )
+        self.logger.info(
+            f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+        )
 
     def generate_update_plan_with_conditions(self):
         """
@@ -167,7 +157,7 @@
 
         date_range = self.generate_date_range()
         if self.show_progress:
-            date_range = tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")
+            date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
 
         for current_date in date_range:
             folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
@@ -203,25 +193,13 @@
                 "within_history": within_history,
                 "missing_file": missing_file,
                 "update_required": update_required,
-                "update_category": category
+                "update_category": category,
+                "datawrapper class":self.dataclass.__name__
             })
 
         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
 
-
-
-# # Usage:
-# # wrapper = DataWrapper(
-# #     dataclass=YourDataClass,
-# #     date_field="created_at",
-# #     data_path="/path/to/data",
-# #     parquet_filename="data.parquet",
-# #     start_date="2022-01-01",
-# #     end_date="2022-12-31",
-# #     filesystem_type="file",
-# #     verbose=True
-# # )
 # # wrapper.process()
 # # wrapper = DataWrapper(
 # #     dataclass=YourDataClass,
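The commented usage block deleted above still describes the constructor surface, minus the now-unused verbose flag. A minimal sketch based on it; YourDataClass is hypothetical, and the public import path may differ from the module path used here:

from sibi_dst.utils._data_wrapper import DataWrapper

wrapper = DataWrapper(
    dataclass=YourDataClass,  # hypothetical; instantiated via dataclass(**class_params),
                              # then load_period(dt_field=..., start=..., end=...) is called
    date_field="created_at",
    data_path="/path/to/data/",
    parquet_filename="data.parquet",
    start_date="2022-01-01",
    end_date="2022-12-31",
    filesystem_type="file",
    show_progress=True,  # tqdm bars are now labelled with the dataclass name
)
wrapper.process()
plan = wrapper.generate_update_plan_with_conditions()  # DataFrame incl. the new "datawrapper class" column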