sibi-dst 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sibi_dst/df_helper/__init__.py +2 -0
  2. sibi_dst/df_helper/_df_helper.py +52 -51
  3. sibi_dst/df_helper/_parquet_artifact.py +1 -1
  4. sibi_dst/df_helper/_parquet_reader.py +49 -0
  5. sibi_dst/df_helper/{plugins → backends}/django/_django_load_from_db.py +1 -1
  6. sibi_dst/df_helper/{plugins → backends}/parquet/_parquet_options.py +1 -1
  7. sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_io_sqlalchemy_dask.py +2 -3
  8. sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_sqlalchemy_load_from_db.py +5 -2
  9. sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_sqlalchemy_model_builder.py +13 -18
  10. sibi_dst/utils/_data_wrapper.py +16 -38
  11. {sibi_dst-0.3.17.dist-info → sibi_dst-0.3.19.dist-info}/METADATA +4 -4
  12. sibi_dst-0.3.19.dist-info/RECORD +47 -0
  13. sibi_dst-0.3.17.dist-info/RECORD +0 -46
  14. /sibi_dst/df_helper/{plugins → backends}/__init__.py +0 -0
  15. /sibi_dst/df_helper/{plugins → backends}/django/__init__.py +0 -0
  16. /sibi_dst/df_helper/{plugins → backends}/django/_django_db_connection.py +0 -0
  17. /sibi_dst/df_helper/{plugins → backends}/django/_django_sql_model_builder.py +0 -0
  18. /sibi_dst/df_helper/{plugins → backends}/django/_io_dask.py +0 -0
  19. /sibi_dst/df_helper/{plugins → backends}/django/_io_dask_alt.py +0 -0
  20. /sibi_dst/df_helper/{plugins → backends}/http/__init__.py +0 -0
  21. /sibi_dst/df_helper/{plugins → backends}/http/_http_config.py +0 -0
  22. /sibi_dst/df_helper/{plugins → backends}/parquet/__init__.py +0 -0
  23. /sibi_dst/df_helper/{plugins → backends}/parquet/_parquet_filter_handler.py +0 -0
  24. /sibi_dst/df_helper/{plugins → backends}/sql_alchemy/__init__.py +0 -0
  25. /sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
  26. /sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  27. /sibi_dst/df_helper/{plugins → backends}/sql_model/__init__.py +0 -0
  28. /sibi_dst/df_helper/{plugins → backends}/sql_model/_sqlmodel_db_connection.py +0 -0
  29. /sibi_dst/df_helper/{plugins → backends}/sql_model/_sqlmodel_load_from_db.py +0 -0
  30. {sibi_dst-0.3.17.dist-info → sibi_dst-0.3.19.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/__init__.py
@@ -2,8 +2,10 @@ from __future__ import annotations
 
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
+from ._parquet_reader import ParquetReader
 
 __all__=[
     'DfHelper',
     'ParquetArtifact',
+    'ParquetReader',
 ]
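
Both helpers and the new reader are now exported from the package root, so a downstream import can be written as:

    from sibi_dst.df_helper import DfHelper, ParquetArtifact, ParquetReader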
sibi_dst/df_helper/_df_helper.py
@@ -6,17 +6,16 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver, ClickHouseWriter
-from .plugins.django import *
-from .plugins.http import HttpConfig
-from .plugins.parquet import ParquetConfig, ParquetFilterHandler
-from .plugins.sql_alchemy import *
+from .backends.django import *
+from .backends.http import HttpConfig
+from .backends.parquet import ParquetConfig
+from .backends.sql_alchemy import *
 
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
@@ -30,26 +29,25 @@ warnings.filterwarnings(
 
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
-    plugin_django_connection: Optional[DjangoConnectionConfig] = None
-    plugin_query: Optional[QueryConfig] = None
-    plugin_params: Optional[ParamsConfig] = None
-    plugin_parquet: Optional[ParquetConfig] = None
-    plugin_http: Optional[HttpConfig] = None
-    plugin_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
+    backend_django: Optional[DjangoConnectionConfig] = None
+    backend_query: Optional[QueryConfig] = None
+    backend_params: Optional[ParamsConfig] = None
+    backend_parquet: Optional[ParquetConfig] = None
+    backend_http: Optional[HttpConfig] = None
+    backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
     parquet_filename: str = None
     logger: Logger
     default_config: Dict = None
 
-    def __init__(self, source='django_db', **kwargs):
+    def __init__(self, backend='django_db', **kwargs):
         # Ensure default_config is not shared across instances
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
-        self.source = source
+        self.backend = backend
         self.debug = kwargs.setdefault("debug", False)
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger = kwargs.get("logger",Logger.default_logger(logger_name=self.__class__.__name__))
         # Configure logger level
         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
-        # Configure logger level
         self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
@@ -58,19 +56,22 @@ class DfHelper:
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
+    def __str__(self):
+        return self.__class__.__name__
+
     def post_init(self, **kwargs):
-        self.logger.debug(f"Source used: {self.source}")
-        self.plugin_query = self.__get_config(QueryConfig, kwargs)
-        self.plugin_params = self.__get_config(ParamsConfig, kwargs)
-        if self.source == 'django_db':
-            self.plugin_django_connection = self.__get_config(DjangoConnectionConfig, kwargs)
-        elif self.source == 'parquet':
+        self.logger.debug(f"backend used: {self.backend}")
+        self.backend_query = self.__get_config(QueryConfig, kwargs)
+        self.backend_params = self.__get_config(ParamsConfig, kwargs)
+        if self.backend == 'django_db':
+            self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
+        elif self.backend == 'parquet':
             self.parquet_filename = kwargs.setdefault("parquet_filename", None)
-            self.plugin_parquet = ParquetConfig(**kwargs)
-        elif self.source == 'http':
-            self.plugin_http = HttpConfig(**kwargs)
-        elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
+            self.backend_parquet = ParquetConfig(**kwargs)
+        elif self.backend == 'http':
+            self.backend_http = HttpConfig(**kwargs)
+        elif self.backend == 'sqlalchemy':
+            self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
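
The constructor rename is the main breaking change for callers: DfHelper(source=...) becomes DfHelper(backend=...), and a pre-configured logger can now be injected via the logger kwarg. A minimal construction sketch; the connection keywords are illustrative placeholders, since the exact names are consumed by the QueryConfig/ParamsConfig/SqlAlchemyConnectionConfig models and are not shown in this diff:

    from sibi_dst.df_helper import DfHelper

    helper = DfHelper(
        backend='sqlalchemy',                   # was source='sqlalchemy' in 0.3.17
        connection_url='sqlite:///example.db',  # hypothetical kwarg name
        table='sales',                          # hypothetical kwarg name
        debug=True,
    )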
@@ -87,7 +88,7 @@ class DfHelper:
         return model(**model_kwargs)
 
     def load(self, **options):
-        # this will be the universal method to load data from a df irrespective of the source
+        # this will be the universal method to load data from a df irrespective of the backend
        df = self._load(**options)
        if self.as_pandas:
            return df.compute()
@@ -95,15 +96,15 @@
 
     def _load(self, **options):
 
-        if self.source == 'django_db':
-            self.plugin_params.parse_params(options)
+        if self.backend == 'django_db':
+            self.backend_params.parse_params(options)
             return self._load_from_db(**options)
-        elif self.source == 'sqlalchemy':
-            self.plugin_params.parse_params(options)
+        elif self.backend == 'sqlalchemy':
+            self.backend_params.parse_params(options)
             return self._load_from_sqlalchemy(**options)
-        elif self.source == 'parquet':
+        elif self.backend == 'parquet':
             return self._load_from_parquet(**options)
-        elif self.source == 'http':
+        elif self.backend == 'http':
             if asyncio.get_event_loop().is_running():
                 self.logger.debug("Running as a task from an event loop")
                 return asyncio.create_task(self._load_from_http(**options))
@@ -115,9 +116,9 @@
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
-                self.plugin_sqlalchemy,
-                self.plugin_query,
-                self.plugin_params,
+                self.backend_sqlalchemy,
+                self.backend_query,
+                self.backend_params,
                 self.logger,
                 **options
             )
@@ -135,9 +136,9 @@
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
-                self.plugin_django_connection,
-                self.plugin_query,
-                self.plugin_params,
+                self.backend_django,
+                self.backend_query,
+                self.backend_params,
                 self.logger,
                 **options
             )
@@ -152,12 +153,12 @@
         return self.df
 
     async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
-        if not self.plugin_http:
+        """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
+        if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
         try:
-            self.df = await self.plugin_http.fetch_data(**options)
+            self.df = await self.backend_http.fetch_data(**options)
         except Exception as e:
             self.logger.debug(f"Failed to load data from http plugin: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -168,7 +169,7 @@
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
         Optimized for large datasets with Dask compatibility.
         """
-        df_params = self.plugin_params.df_params
+        df_params = self.backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
         datetime_index = df_params.get("datetime_index", False)
@@ -205,7 +206,7 @@
     def _process_loaded_data(self):
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
-            field_map = self.plugin_params.field_map or {}
+            field_map = self.backend_params.field_map or {}
             if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
@@ -237,7 +238,7 @@
         self.logger.debug("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        self.df = self.plugin_parquet.load_files()
+        self.df = self.backend_parquet.load_files()
         if options:
             """
             deprecated specific filter handling to a generic one
@@ -273,20 +274,20 @@
             raise ValueError("The 'start' date cannot be later than the 'end' date.")
 
         # Reverse map to original field name
-        field_map = getattr(self.plugin_params, 'field_map', {}) or {}
+        field_map = getattr(self.backend_params, 'field_map', {}) or {}
         reverse_map = {v: k for k, v in field_map.items()}
         mapped_field = reverse_map.get(dt_field, dt_field)
 
         # Common logic for Django and SQLAlchemy
-        if self.source == 'django_db':
-            model_fields = {field.name: field for field in self.plugin_django_connection.model._meta.get_fields()}
+        if self.backend == 'django_db':
+            model_fields = {field.name: field for field in self.backend_django.model._meta.get_fields()}
             if mapped_field not in model_fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
             field_type = type(model_fields[mapped_field]).__name__
             is_date_field = field_type == 'DateField'
             is_datetime_field = field_type == 'DateTimeField'
-        elif self.source == 'sqlalchemy':
-            model = self.plugin_sqlalchemy.model
+        elif self.backend == 'sqlalchemy':
+            model = self.backend_sqlalchemy.model
             fields = [column.name for column in model.__table__.columns]
             if mapped_field not in fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")
@@ -295,7 +296,7 @@
             is_date_field = field_type == 'DATE'
             is_datetime_field = field_type == 'DATETIME'
         else:
-            raise ValueError(f"Unsupported source '{self.source}'")
+            raise ValueError(f"Unsupported backend '{self.backend}'")
         # Build query filters
         if start == end:
             if is_date_field:
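
These branches back the period-loading path that DataWrapper invokes further below as load_period(dt_field=..., start=..., end=...): the requested dt_field is reverse-mapped through field_map and validated against the active backend's model before the date filters are built. A hedged call sketch; the field name and dates are illustrative:

    # Raises ValueError if 'created_at' is not a field on the backing model
    # (after reverse field-mapping), per the checks above.
    df = helper.load_period(
        dt_field='created_at',
        start='2024-01-01',
        end='2024-01-31',
    )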
sibi_dst/df_helper/_parquet_artifact.py
@@ -8,7 +8,7 @@ from sibi_dst.utils import DateUtils
 
 class ParquetArtifact(DfHelper):
     DEFAULT_CONFIG = {
-        'source': 'parquet'
+        'backend': 'parquet'
     }
 
     def __init__(self, data_wrapper_class, filesystem_type="file", filesystem_options=None, **kwargs):
sibi_dst/df_helper/_parquet_reader.py
@@ -0,0 +1,49 @@
+from typing import Optional
+
+import dask.dataframe as dd
+import fsspec
+
+from sibi_dst.df_helper import DfHelper
+
+class ParquetReader(DfHelper):
+    DEFAULT_CONFIG = {
+        'backend': 'parquet'
+    }
+
+    def __init__(self, filesystem_type="file", filesystem_options=None, **kwargs):
+        self.config = {
+            **self.DEFAULT_CONFIG,
+            **kwargs,
+        }
+        self.df: Optional[dd.DataFrame] = None
+        self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
+        if self.parquet_storage_path is None:
+            raise ValueError('parquet_storage_path must be set')
+        self.parquet_start_date = self.config.setdefault('parquet_start_date', None)
+        if self.parquet_start_date is None:
+            raise ValueError('parquet_start_date must be set')
+
+        self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
+        if self.parquet_end_date is None:
+            raise ValueError('parquet_end_date must be set')
+
+        # Filesystem setup
+        self.filesystem_type = filesystem_type
+        self.filesystem_options = filesystem_options or {}
+        self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+
+        if not self.directory_exists():
+            raise ValueError(f"{self.parquet_storage_path} does not exist")
+
+        super().__init__(**self.config)
+
+    def load(self, **kwargs):
+        self.df = super().load(**kwargs)
+        return self.df
+
+    def directory_exists(self):
+        try:
+            info = self.fs.info(self.parquet_storage_path)
+            return info['type'] == 'directory'
+        except FileNotFoundError:
+            return False
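
ParquetReader is a thin wrapper: it validates its storage path and date window, checks that the target directory exists on the configured fsspec filesystem, and then delegates loading to DfHelper's parquet backend. An illustrative usage sketch; the path and dates are placeholders:

    from sibi_dst.df_helper import ParquetReader

    reader = ParquetReader(
        parquet_storage_path='/data/warehouse/sales',  # must be an existing directory
        parquet_start_date='2024-01-01',
        parquet_end_date='2024-01-31',
        filesystem_type='file',                        # any fsspec protocol, e.g. 's3'
    )
    df = reader.load()  # a dask DataFrame unless as_pandas is set on the helper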
sibi_dst/df_helper/{plugins → backends}/django/_django_load_from_db.py
@@ -5,7 +5,7 @@ import pandas as pd
 from IPython.core.hooks import deprecated
 from django.db.models import Q
 
-from sibi_dst.df_helper.plugins.django import ReadFrameDask
+from sibi_dst.df_helper.backends.django import ReadFrameDask
 from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import django_field_conversion_map_dask
 
sibi_dst/df_helper/{plugins → backends}/parquet/_parquet_options.py
@@ -52,7 +52,7 @@ class ParquetConfig(BaseModel):
             raise ValueError('Parquet end date must be greater than start date')
 
         # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-        self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path)).generate_file_paths(start_date, end_date)
+        self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), logger=self.logger).generate_file_paths(start_date, end_date)
         self.parquet_size_bytes = self.get_parquet_size_bytes()
         self.load_parquet = True
         #self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_io_sqlalchemy_dask.py
@@ -1,14 +1,13 @@
 import itertools
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
 
-from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
-from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import FilterHandler
+from sibi_dst.utils import Logger
+
 
 class SQLAlchemyDask:
     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -29,6 +29,7 @@ class SqlAlchemyLoadFromDb:
         self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
+        self.chunk_size = kwargs.pop("chunk_size", 1000)
 
     def build_and_load(self) -> dd.DataFrame:
         """
@@ -45,8 +46,10 @@
             filters=self.params_config.filters,
             engine_url=self.engine.url,
             logger=self.logger,
-            chunk_size=1000,
-            debug=self.debug).read_frame()
+            chunk_size=self.chunk_size,
+            debug=self.debug
+        ).read_frame()
+
         if self.df is None or len(self.df.head().index) == 0:
             self.logger.debug("Query returned no results.")
             dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
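
With these two hunks, the SQLAlchemy chunk size is no longer hard-coded: SqlAlchemyLoadFromDb pops chunk_size from its kwargs (default 1000) and forwards it to SQLAlchemyDask. Since DfHelper passes **options through to the loader, a per-call override along these lines should work, though this assumes chunk_size survives ParamsConfig.parse_params(), which is not shown in this diff:

    # Sketch: helper was constructed with backend='sqlalchemy'.
    # chunk_size flows DfHelper.load() -> SqlAlchemyLoadFromDb -> SQLAlchemyDask.
    df = helper.load(chunk_size=50_000)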
sibi_dst/df_helper/{plugins → backends}/sql_alchemy/_sqlalchemy_model_builder.py
@@ -9,6 +9,7 @@ Base = declarative_base()
 apps_label = "datacubes"
 
 class SqlAlchemyModelBuilder:
+    _model_cache = {}  # Local cache for model classes
     def __init__(self, engine, table_name):
         """
         Initialize the model builder with a database engine and specific table.
@@ -21,28 +22,21 @@ class SqlAlchemyModelBuilder:
         self.table_name = table_name
         self.metadata = MetaData()
         self.table = None  # Placeholder for the specific table
+        self.class_name = self.normalize_class_name(self.table_name)
 
     def build_model(self) -> type:
-        """
-        Build a SQLAlchemy ORM model for the specified table.
-
-        Returns:
-            type: Dynamically generated SQLAlchemy ORM model class.
-        """
-        # Check if the class is already registered
-        class_name = self.normalize_class_name(self.table_name)
-        mapper_registry = Base.registry
-        if class_name in mapper_registry._class_registry:
-            return mapper_registry._class_registry[class_name]
+        # Check if the model is already registered
+        model = Base.registry._class_registry.get(self.class_name)
+        if model:
+            return model
 
-        # Reflect only the specified table
         self.metadata.reflect(only=[self.table_name], bind=self.engine)
         self.table = self.metadata.tables.get(self.table_name)
-
         if self.table is None:
             raise ValueError(f"Table '{self.table_name}' does not exist in the database.")
 
-        return self.create_model()
+        model = self.create_model()
+        return model
 
     def create_model(self) -> type:
         """
@@ -52,7 +46,6 @@
             type: Dynamically generated SQLAlchemy ORM model class.
         """
         # Normalize the class name from the table name
-        class_name = self.normalize_class_name(self.table_name)
         columns = self.get_columns(self.table)
 
         # Define attributes for the model class
@@ -66,9 +59,11 @@
         # Add columns and relationships to the model
         attrs.update(columns)
         #self.add_relationships(attrs, self.table)
-
-        # Dynamically create the model class
-        model = type(class_name, (Base,), attrs)
+        model = Base.registry._class_registry.get(self.class_name)
+        if not model:
+            model = type(self.class_name, (Base,), attrs)
+            # Add the class to Base.registry so it is registered
+            Base.registry._class_registry[self.class_name] = model
         return model
 
     def get_columns(self, table: Table):
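
Model construction is now deduplicated through SQLAlchemy's declarative registry: build_model() consults Base.registry._class_registry first and only reflects and defines a class on a miss. (The class-level _model_cache dict added above is not referenced in the hunks shown here.) An illustrative sketch of repeated builds against the same table; the engine URL and table name are placeholders:

    from sqlalchemy import create_engine

    engine = create_engine('sqlite:///example.db')  # illustrative URL
    Sales = SqlAlchemyModelBuilder(engine, 'sales').build_model()
    # A second builder for the same table returns the registered class
    # instead of redefining it, avoiding duplicate-class mapper errors.
    assert SqlAlchemyModelBuilder(engine, 'sales').build_model() is Sales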
sibi_dst/utils/_data_wrapper.py
@@ -1,15 +1,12 @@
 import datetime
 from typing import Type, Any, Dict, Optional
-
 import fsspec
 import pandas as pd
 from IPython.display import display
-from tqdm import tqdm
-
 from sibi_dst.utils import Logger
+from tqdm import tqdm
 from sibi_dst.utils import ParquetSaver
 
-
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -46,7 +43,7 @@ class DataWrapper:
         self.reverse_order = reverse_order
         self.overwrite = overwrite
         self.ignore_missing = ignore_missing
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
         self.max_age_minutes = max_age_minutes
         self.history_days_threshold = history_days_threshold
         self.show_progress = show_progress
@@ -96,7 +93,7 @@
 
         date_iterator = dates_to_process
         if self.show_progress:
-            date_iterator = tqdm(date_iterator, desc=description, unit="date")
+            date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
 
         for current_date in date_iterator:
            self.process_date(current_date)
@@ -113,16 +110,14 @@
             )
             current_time = datetime.datetime.now(datetime.timezone.utc)
             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
-
-            if self.verbose:
-                self.logger.debug(
-                    f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
-                    f"(threshold: {self.max_age_minutes} minutes)"
-                )
+            self.logger.info(
+                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                f"(threshold: {self.max_age_minutes} minutes)"
+            )
 
             return file_age_minutes > self.max_age_minutes
         except FileNotFoundError:
-            return True  # Treat missing files as old
+            return True
 
     def process_date(self, date: datetime.date):
         """Process a specific date by regenerating data as necessary."""
@@ -130,16 +125,13 @@
         full_parquet_filename = f"{folder}{self.parquet_filename}"
 
         start_time = datetime.datetime.now()
-
-        if self.verbose:
-            self.logger.debug(f"Processing {full_parquet_filename}...")
+        self.logger.info(f"Processing {full_parquet_filename}...")
 
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
         if len(df.index)==0:
-            if self.verbose:
-                self.logger.debug("No data found for the specified date.")
+            self.logger.error("No data found for the specified date.")
             return
 
         parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -147,11 +139,9 @@
 
         end_time = datetime.datetime.now()
         duration_seconds = (end_time - start_time).total_seconds()
-
-        if self.verbose:
-            self.logger.debug(
-                f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
-            )
+        self.logger.info(
+            f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+        )
 
     def generate_update_plan_with_conditions(self):
         """
@@ -167,7 +157,7 @@
 
         date_range = self.generate_date_range()
         if self.show_progress:
-            date_range = tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")
+            date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
 
         for current_date in date_range:
            folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
@@ -203,25 +193,13 @@
                 "within_history": within_history,
                 "missing_file": missing_file,
                 "update_required": update_required,
-                "update_category": category
+                "update_category": category,
+                "datawrapper class":self.dataclass.__name__
             })
 
         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
 
-
-
-# # Usage:
-# # wrapper = DataWrapper(
-# #     dataclass=YourDataClass,
-# #     date_field="created_at",
-# #     data_path="/path/to/data",
-# #     parquet_filename="data.parquet",
-# #     start_date="2022-01-01",
-# #     end_date="2022-12-31",
-# #     filesystem_type="file",
-# #     verbose=True
-# # )
 # # wrapper.process()
 # # wrapper = DataWrapper(
 # #     dataclass=YourDataClass,
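
The commented-out usage block at the end of the module survives only partially (the fragment from "# # wrapper.process()" onward). For reference, a DataWrapper call consistent with the removed comment and with this release's renamed log and progress labels; the dataclass and paths are placeholders:

    wrapper = DataWrapper(
        dataclass=YourDataClass,      # a DfHelper-style class exposing load_period()
        date_field="created_at",
        data_path="/path/to/data/",   # partitioned as YYYY/MM/DD/ per process_date()
        parquet_filename="data.parquet",
        start_date="2022-01-01",
        end_date="2022-12-31",
        filesystem_type="file",
        show_progress=True,           # tqdm bars now read "<desc>:YourDataClass"
    )
    wrapper.process()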
{sibi_dst-0.3.17.dist-info → sibi_dst-0.3.19.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.17
+Version: 0.3.19
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -40,13 +40,13 @@ Description-Content-Type: text/markdown
 
 Data Science Toolkit
 ---------------------
-Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, DjangoRestFrameWork
+Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFrameWork
 
 Major Functionality
 --------------------
 1) Build DataCubes, DataSets and DataObjects from different datasources. These include relational databases, parquet files, xlsx, delimited tables, json, json API REST.
-2) Rich set of common dataframe management utilities.
-3) Share Data with client applications by write to Datawarehouses in local filesystems as well as other supported platforms.
+2) Common dataframe management utilities.
+3) Share Data with client applications by write to Data Warehouses in local filesystems as well as other supported platforms.
 4) Build microservices to communicate/share data via API-REST, gRPC.
 
 
sibi_dst-0.3.19.dist-info/RECORD
@@ -0,0 +1,47 @@
+sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
+sibi_dst/df_helper/__init__.py,sha256=rbTr9CqwbJhu8pbZabwfcOqhm-5hm2iXk0vVBtK01bA,231
+sibi_dst/df_helper/_df_helper.py,sha256=e6e32CRTCKjFVvYMytWTuBVpwB1VcnVQ1T4Rg8KXWvY,13917
+sibi_dst/df_helper/_parquet_artifact.py,sha256=ctISmwxP9icFCXsELBjbPiz-FK3CEojN7yNIlStdOWw,4974
+sibi_dst/df_helper/_parquet_reader.py,sha256=A8qWuWQiaiS7pk4sD5EDAvGs-qz7VfziINXpSA7o00U,1683
+sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sibi_dst/df_helper/backends/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
+sibi_dst/df_helper/backends/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
+sibi_dst/df_helper/backends/django/_django_load_from_db.py,sha256=E_6ptiouluyLziXkNy_MztRi36qqW7-3AvlafL78Sug,5592
+sibi_dst/df_helper/backends/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
+sibi_dst/df_helper/backends/django/_io_dask.py,sha256=P3WmkuFzmWRzFchjsVD2OElIR3stuevwDH9G6Mu8IWE,9080
+sibi_dst/df_helper/backends/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
+sibi_dst/df_helper/backends/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
+sibi_dst/df_helper/backends/http/_http_config.py,sha256=NN3bol7NgBTDv70yOX7hJkazt1-dAAdFWVkYyHdIXsI,2128
+sibi_dst/df_helper/backends/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
+sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=09b9yLPREvx6ebs62B9qEqJt1cCKJz97plGW82i4630,4414
+sibi_dst/df_helper/backends/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
+sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py,sha256=YYhjt5rL1yomcrby4i4bD5wPVDzRJpZZbxHp5CM40tQ,5414
+sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py,sha256=KShsLJYGVxN0ps9Wot7fF0nR0wW9WzcPIcWZ9f5vdBo,4654
+sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
+sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=QkR-_S4zqJpwH9dJ5cqXW8iy9XoAFUXmcsgUSm3PbLo,2251
+sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py,sha256=RjtKEk-i8EmX98rwqkq1Bg7IgPwYDduL967gsl9T73c,4401
+sibi_dst/df_helper/backends/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
+sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
+sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py,sha256=jYwkIz7_E9Z6Mqw1a9TCWKWD146Tbx7mcQFxIpmKgKU,3686
+sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
+sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
+sibi_dst/df_helper/core/_filter_handler.py,sha256=SYZqpX4Vt6GAGR0L0LohlDOdjLLWQXJDiWWqFG-lSu0,8563
+sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
+sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
+sibi_dst/utils/__init__.py,sha256=TV229dPIIEzU5qCLI1G6fnCZW-VirUwSuffp7z7OTFg,783
+sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
+sibi_dst/utils/_clickhouse_writer.py,sha256=JcnWN2635ATCOaFiB6NYglNXDwqKw0jC7Urs9WOZE20,8571
+sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
+sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
+sibi_dst/utils/_data_wrapper.py,sha256=cvUkGRiPfCyLD4XcoX7FWLYzM8gnHBGR1pJ08PMneCk,9010
+sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
+sibi_dst/utils/_df_utils.py,sha256=pjEfkof9hggXQgYerG0p4DXrwBeIRynJFg4IX3Yrb4c,10919
+sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
+sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
+sibi_dst/utils/_log_utils.py,sha256=rPp8z1UglwvqzBOOAvMOct0syQZ-54gGYafnJDRYZN4,2313
+sibi_dst/utils/_parquet_saver.py,sha256=3BK0XXgMOOAdIw4OzbwMxmDrzDw3_MKi8RTpulIVUe0,4367
+sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
+sibi_dst-0.3.19.dist-info/METADATA,sha256=IDeMqZZHRsAV-v5TngSTKaB7y7SQhMjfEduHozqhOsk,2134
+sibi_dst-0.3.19.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.19.dist-info/RECORD,,
sibi_dst-0.3.17.dist-info/RECORD
@@ -1,46 +0,0 @@
-sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
-sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=yOapAc3MLQnylGKs0TG4Nmf8gaLdM7Nvzt4H1bEp8ik,13898
-sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
-sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
-sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
-sibi_dst/df_helper/core/_filter_handler.py,sha256=SYZqpX4Vt6GAGR0L0LohlDOdjLLWQXJDiWWqFG-lSu0,8563
-sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
-sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
-sibi_dst/df_helper/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
-sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
-sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NSPNCNzvEAR-4AuTSPQWJsidBvT9zRHAN6L3JC1xRV0,5591
-sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
-sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=P3WmkuFzmWRzFchjsVD2OElIR3stuevwDH9G6Mu8IWE,9080
-sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
-sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
-sibi_dst/df_helper/plugins/http/_http_config.py,sha256=NN3bol7NgBTDv70yOX7hJkazt1-dAAdFWVkYyHdIXsI,2128
-sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
-sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
-sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
-sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=fna8xZL8Ij6uCM_tZINO8vPdpJZaXs41gGzR4xn5zd8,5531
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=KShsLJYGVxN0ps9Wot7fF0nR0wW9WzcPIcWZ9f5vdBo,4654
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=DBIM4kk86GxWkyiEZ4dSl_DdKa9SMvANCbympfzOqgQ,2169
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=IQK2jOXMNJRQOSD0VQ0p11BeDGlvxD8NfFRilw9Go80,4466
-sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
-sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
-sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=jYwkIz7_E9Z6Mqw1a9TCWKWD146Tbx7mcQFxIpmKgKU,3686
-sibi_dst/utils/__init__.py,sha256=TV229dPIIEzU5qCLI1G6fnCZW-VirUwSuffp7z7OTFg,783
-sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
-sibi_dst/utils/_clickhouse_writer.py,sha256=JcnWN2635ATCOaFiB6NYglNXDwqKw0jC7Urs9WOZE20,8571
-sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
-sibi_dst/utils/_data_wrapper.py,sha256=_hLZhKqSxcfXe8IyWM2paBxtW2JlOCq2jYhNGcInPi4,9406
-sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
-sibi_dst/utils/_df_utils.py,sha256=pjEfkof9hggXQgYerG0p4DXrwBeIRynJFg4IX3Yrb4c,10919
-sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
-sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
-sibi_dst/utils/_log_utils.py,sha256=rPp8z1UglwvqzBOOAvMOct0syQZ-54gGYafnJDRYZN4,2313
-sibi_dst/utils/_parquet_saver.py,sha256=3BK0XXgMOOAdIw4OzbwMxmDrzDw3_MKi8RTpulIVUe0,4367
-sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.17.dist-info/METADATA,sha256=Aw__Wr7myZwJfWGRNFy0Ye5FLXUnGf6b14GW5KBDGtE,2133
-sibi_dst-0.3.17.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sibi_dst-0.3.17.dist-info/RECORD,,
The remaining files (14-30 in the list above: the plugins → backends renames and the WHEEL file) moved without content changes.