sibi-dst 2025.1.7.tar.gz → 2025.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/PKG-INFO +1 -1
  2. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/pyproject.toml +1 -1
  3. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/data_wrapper.py +33 -30
  4. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/parquet_saver.py +0 -4
  5. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/update_planner.py +5 -7
  6. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/README.md +0 -0
  7. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/__init__.py +0 -0
  8. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/__init__.py +0 -0
  9. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
  10. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/_df_helper.py +0 -0
  11. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  12. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  13. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/__init__.py +0 -0
  14. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  15. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  16. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  17. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  18. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  19. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  20. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  21. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  22. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  23. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  24. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/core/__init__.py +0 -0
  25. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/core/_defaults.py +0 -0
  26. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  27. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/core/_params_config.py +0 -0
  28. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/core/_query_config.py +0 -0
  29. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/df_helper/data_cleaner.py +0 -0
  30. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/geopy_helper/__init__.py +0 -0
  31. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  32. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/geopy_helper/utils.py +0 -0
  33. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/osmnx_helper/__init__.py +0 -0
  34. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  35. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  36. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  37. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  38. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/osmnx_helper/utils.py +0 -0
  39. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/tests/__init__.py +0 -0
  40. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  41. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/__init__.py +0 -0
  42. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/base.py +0 -0
  43. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/clickhouse_writer.py +0 -0
  44. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/credentials.py +0 -0
  45. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/data_from_http_source.py +0 -0
  46. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/data_utils.py +0 -0
  47. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/date_utils.py +0 -0
  48. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/df_utils.py +0 -0
  49. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/file_utils.py +0 -0
  50. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/filepath_generator.py +0 -0
  51. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/log_utils.py +0 -0
  52. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/manifest_manager.py +0 -0
  53. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/phone_formatter.py +0 -0
  54. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/storage_config.py +0 -0
  55. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/storage_manager.py +0 -0
  56. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/utils/webdav_client.py +0 -0
  57. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/__init__.py +0 -0
  58. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/__init__.py +0 -0
  59. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  60. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  61. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  62. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  63. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  64. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  65. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  66. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  67. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  68. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  69. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  70. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  71. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  72. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  73. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  74. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  75. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/utils/__init__.py +0 -0
  76. {sibi_dst-2025.1.7 → sibi_dst-2025.1.9}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.1.7
3
+ Version: 2025.1.9
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "2025.1.7"
3
+ version = "2025.1.9"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -1,16 +1,13 @@
1
1
  import datetime
2
- import logging
3
2
  import threading
4
3
  import time
5
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
6
5
  from typing import Type, Any, Dict, Optional, Union, List, ClassVar
7
6
 
8
- import fsspec
9
7
  import pandas as pd
10
8
  from tqdm import tqdm
11
9
 
12
10
  from . import ManagedResource
13
- from .log_utils import Logger
14
11
  from .parquet_saver import ParquetSaver
15
12
 
16
13
 
@@ -60,8 +57,7 @@ class DataWrapper(ManagedResource):
60
57
  self.processed_dates: List[datetime.date] = []
61
58
  self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
62
59
  self.mmanifest = kwargs.get("mmanifest", None)
63
- self.update_planner=kwargs.get("update_planner", None)
64
-
60
+ self.update_planner = kwargs.get("update_planner", None)
65
61
 
66
62
  def __exit__(self, exc_type, exc_val, exc_tb):
67
63
  """Context manager exit"""
@@ -104,7 +100,6 @@ class DataWrapper(ManagedResource):
104
100
  if self.update_planner.show_progress:
105
101
  self.show_benchmark_summary()
106
102
 
107
-
108
103
  def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
109
104
  """Executes a single batch of tasks (dates) using a thread pool."""
110
105
  desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
@@ -135,7 +130,7 @@ class DataWrapper(ManagedResource):
135
130
  time.sleep(2 ** attempt) # Exponential backoff
136
131
  else:
137
132
  self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
138
- #raise
133
+ # raise
139
134
 
140
135
  def _process_single_date(self, date: datetime.date):
141
136
  """Core date processing logic with load/save timing and thread reporting"""
@@ -146,8 +141,8 @@ class DataWrapper(ManagedResource):
146
141
  return
147
142
  full_path = f"{path}{self.parquet_filename}"
148
143
 
149
- #thread_name = threading.current_thread().name
150
- #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
144
+ # thread_name = threading.current_thread().name
145
+ # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
151
146
 
152
147
  overall_start = time.perf_counter()
153
148
  try:
@@ -159,35 +154,43 @@ class DataWrapper(ManagedResource):
159
154
  local_load_params = self.load_params.copy()
160
155
  local_load_params.update(date_filter)
161
156
  local_class_instance = self.dataclass(**self.class_params)
162
- df=local_class_instance.load(**local_load_params)
157
+ df = local_class_instance.load(**local_load_params)
163
158
  load_time = time.perf_counter() - load_start
164
159
 
165
160
  if hasattr(local_class_instance, "total_records"):
166
- self.logger.debug(f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
161
+ self.logger.debug(
162
+ f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
167
163
  if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
168
164
  if self.mmanifest:
169
165
  self.mmanifest.record(
170
166
  full_path=path
171
167
  )
172
- self.logger.info(f"No data found for {date}. Logged to missing manifest.")
173
- return
174
- save_start = time.perf_counter()
175
- with ParquetSaver(
176
- df_result=df,
177
- parquet_storage_path=path,
178
- fs=self.fs,
179
- logger=self.logger
180
- ) as ps:
181
- ps.save_to_parquet(self.parquet_filename, overwrite=True)
182
- save_time = time.perf_counter() - save_start
183
-
184
- total_time = time.perf_counter() - overall_start
185
- self.benchmarks[date] = {
186
- "load_duration": load_time,
187
- "save_duration": save_time,
188
- "total_duration": total_time
189
- }
190
- self._log_success(date, total_time, full_path)
168
+ self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
169
+ elif int(local_class_instance.total_records) < 0:
170
+ self.logger.warning(
171
+ f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
172
+ "This may indicate an error in the data loading process."
173
+ )
174
+ else:
175
+ save_start = time.perf_counter()
176
+ parquet_params ={
177
+ "df_result": df,
178
+ "parquet_storage_path": path,
179
+ "fs": self.fs,
180
+ "logger": self.logger,
181
+ "debug": self.debug,
182
+ }
183
+ with ParquetSaver(**parquet_params) as ps:
184
+ ps.save_to_parquet(self.parquet_filename, overwrite=True)
185
+ save_time = time.perf_counter() - save_start
186
+
187
+ total_time = time.perf_counter() - overall_start
188
+ self.benchmarks[date] = {
189
+ "load_duration": load_time,
190
+ "save_duration": save_time,
191
+ "total_duration": total_time
192
+ }
193
+ self._log_success(date, total_time, full_path)
191
194
  except Exception as e:
192
195
  self._log_failure(date, e)
193
196
  raise
@@ -26,10 +26,6 @@ class ParquetSaver(ManagedResource):
26
26
  super().__init__(**kwargs)
27
27
  self.df_result = df_result
28
28
  self.parquet_storage_path = parquet_storage_path.rstrip("/")
29
- #self.debug = debug
30
- #self.logger = logger or Logger.default_logger(self.__class__.__name__)
31
- #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
32
- #self.fs = fs
33
29
  # Determine protocol for special handling (e.g., 's3')
34
30
  if not self.fs:
35
31
  raise ValueError("File system (fs) must be provided to ParquetSaver.")
@@ -72,9 +72,6 @@ class UpdatePlanner(ManagedResource):
72
72
  data_path: str,
73
73
  filename: str,
74
74
  description: str = "Update Planner",
75
- #fs: Optional[fsspec.AbstractFileSystem] = None,
76
- #filesystem_type: str = "file",
77
- #filesystem_options: Optional[Dict] = None,
78
75
  reference_date: Union[str, datetime.date] = None,
79
76
  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
80
77
  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
@@ -199,11 +196,12 @@ class UpdatePlanner(ManagedResource):
199
196
  for _, row in self.plan.iterrows():
200
197
  table.add_row(*(str(item) for item in row))
201
198
 
202
- console.print(table)
199
+ console = Console()
200
+ with console.capture() as capture:
201
+ console.print(table)
202
+ plan_string = capture.get()
203
203
 
204
- plan_string = console.export_text()
205
-
206
- self.logger.info(f"Full Update Plan:\n{plan_string}")
204
+ self.logger.info(f"Full Update Plan:\n{plan_string.strip()}")
207
205
 
208
206
  def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[datetime.date]]]:
209
207
  """Yields batches of dates to be processed, grouped and sorted by priority."""
Remaining files: no content changes.