sibi-dst 0.3.38__tar.gz → 0.3.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/data_wrapper.py +51 -49
  4. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/README.md +0 -0
  5. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/__init__.py +0 -0
  6. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/__init__.py +0 -0
  7. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_df_helper.py +0 -0
  8. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  9. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  10. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/__init__.py +0 -0
  11. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  12. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  13. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  14. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  15. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  16. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  17. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  18. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  19. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  20. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  21. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  22. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  23. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  24. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  25. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  26. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  27. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/__init__.py +0 -0
  28. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_defaults.py +0 -0
  29. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  30. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_params_config.py +0 -0
  31. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_query_config.py +0 -0
  32. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/data_cleaner.py +0 -0
  33. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/__init__.py +0 -0
  34. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  35. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/utils.py +0 -0
  36. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/__init__.py +0 -0
  37. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  38. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  39. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  40. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  41. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/utils.py +0 -0
  42. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  44. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/__init__.py +0 -0
  45. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/airflow_manager.py +0 -0
  46. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/clickhouse_writer.py +0 -0
  47. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/credentials.py +0 -0
  48. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/data_utils.py +0 -0
  49. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/date_utils.py +0 -0
  50. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/df_utils.py +0 -0
  51. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/file_utils.py +0 -0
  52. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/filepath_generator.py +0 -0
  53. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/log_utils.py +0 -0
  54. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/parquet_saver.py +0 -0
  55. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/storage_manager.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.38
+Version: 0.3.40
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.38"
+version = "0.3.40"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
sibi_dst/utils/data_wrapper.py
@@ -91,14 +91,14 @@ class DataWrapper:
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                  show_progress: bool = False,
-                 timeout: float = 300):
+                 timeout: float = 60):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.fs = fs
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}
@@ -116,6 +116,10 @@ class DataWrapper:
         self._lock = Lock()
         self.processed_dates = []
         self.date_utils = DateUtils(logger=self.logger)
+        if self.fs is None:
+            with self._lock:
+                if self.fs is None:
+                    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
 
     @staticmethod
     def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
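
Taken together, the two hunks above replace eager filesystem construction in the constructor with lazy, double-checked initialization behind the instance lock, so fsspec.filesystem is only invoked when the caller did not supply an fs handle (the constructor's default timeout also drops from 300 to 60 alongside this). A minimal standalone sketch of the same pattern; class and method names here are illustrative, not part of the package:

    import threading

    import fsspec


    class LazyFilesystem:
        def __init__(self, filesystem_type="file", **filesystem_options):
            self.fs = None
            self._lock = threading.Lock()
            self._type = filesystem_type
            self._options = filesystem_options

        def get_fs(self):
            # Cheap unlocked check first; the locked re-check prevents two
            # threads from each constructing their own filesystem instance.
            if self.fs is None:
                with self._lock:
                    if self.fs is None:
                        self.fs = fsspec.filesystem(self._type, **self._options)
            return self.fs

In CPython the double check mostly guards against redundant construction rather than memory-visibility bugs; the practical effect is that the default filesystem is built once, after the lock exists, instead of eagerly during attribute assignment.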
@@ -154,16 +158,16 @@ class DataWrapper:
         """
         update_plan_table = self.generate_update_plan_with_conditions()
 
-        # Display the update plan table to the user if requested
-        if self.show_progress:
-            display(update_plan_table)
-
         # Filter out rows that do not require updates (priority 0 means skip)
         with self._lock:
             update_plan_table = update_plan_table[
                 (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
             ]
-
+        # Display the update plan table to the user if requested
+        if len(update_plan_table.index) == 0:
+            return
+        if self.show_progress:
+            display(update_plan_table)
         # Group by priority
         with self._lock:
             priorities = sorted(update_plan_table["update_priority"].unique())
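
The reordering above filters the plan first, returns early when nothing needs updating, and only then displays the table, so users no longer see rows that will be skipped. The early return also protects the executor below: an empty plan yields zero priorities, and ThreadPoolExecutor(max_workers=0) raises ValueError. A small sketch of the guard with a dummy workload:

    from concurrent.futures import ThreadPoolExecutor


    def run(priorities):
        if not priorities:
            # Mirrors the new early return: never reach max_workers=0,
            # which ThreadPoolExecutor rejects with ValueError.
            return
        with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
            list(executor.map(print, sorted(priorities)))


    run([])      # returns immediately, no pool created
    run([2, 1])  # prints 1 and 2 from worker threads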
@@ -172,21 +176,20 @@ class DataWrapper:
         # Each thread will handle all dates associated with that priority.
         def process_priority(priority):
             # Extract dates for the current priority
-            with self._lock:
-                dates_to_process = update_plan_table[
-                    update_plan_table["update_priority"] == priority
-                ]["date"].tolist()
+            dates_to_process = update_plan_table[
+                update_plan_table["update_priority"] == priority
+            ]["date"].tolist()
 
-                # If show_progress is True, wrap in a progress bar
-                date_iterator = dates_to_process
-                if self.show_progress:
-                    date_iterator = tqdm(date_iterator,
-                                         desc=f"Processing priority {priority}:{self.dataclass.__name__}",
-                                         unit="date")
+            # If show_progress is True, wrap in a progress bar
+            date_iterator = dates_to_process
+            if self.show_progress:
+                date_iterator = tqdm(date_iterator,
+                                     desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                     unit="date")
 
-                # Process each date for this priority
-                for current_date in date_iterator:
-                    self.process_date(current_date)
+            # Process each date for this priority
+            for current_date in date_iterator:
+                self.process_date(current_date)
 
         # Launch a separate thread for each priority
         with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
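
Dropping the lock inside process_priority is sound because by this point update_plan_table is only read: the filtering in process() already happened under the lock, and concurrent boolean-mask selections on a DataFrame that nobody mutates need no serialization. An illustrative sketch (table contents are made up):

    from concurrent.futures import ThreadPoolExecutor

    import pandas as pd

    plan = pd.DataFrame({
        "date": ["2024-01-01", "2024-01-02", "2024-01-03"],
        "update_priority": [1, 1, 2],
    })


    def dates_for(priority):
        # Read-only selection; safe without a lock once the table is frozen.
        return plan[plan["update_priority"] == priority]["date"].tolist()


    with ThreadPoolExecutor(max_workers=2) as executor:
        print(list(executor.map(dates_for, [1, 2])))
        # [['2024-01-01', '2024-01-02'], ['2024-01-03']]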
@@ -232,21 +235,21 @@ class DataWrapper:
         :type date: datetime.date
         :return: None
         """
-        with self._lock:
-            folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
-            full_parquet_filename = f"{folder}{self.parquet_filename}"
+        folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
+        full_parquet_filename = f"{folder}{self.parquet_filename}"
 
-            start_time = datetime.datetime.now()
-            self.logger.info(f"Processing date: {date}")
-            self.logger.info(f"Processing {full_parquet_filename}...")
+        start_time = datetime.datetime.now()
+        self.logger.info(f"Processing date: {date}")
+        self.logger.info(f"Processing {full_parquet_filename}...")
 
-            data_object = self.dataclass(**self.class_params)
-            df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
+        data_object = self.dataclass(**self.class_params)
+        df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
-            if len(df.index) == 0:
-                self.logger.error("No data found for the specified date.")
-                return
+        if len(df.index) == 0:
+            self.logger.error("No data found for the specified date.")
+            return
 
+        with self._lock:
             parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
             parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
 
@@ -255,8 +258,9 @@ class DataWrapper:
             self.logger.info(
                 f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
             )
+
         self.processed_dates.append(date)
-            self.logger.info(f"Finished processing date: {date}")
+        self.logger.info(f"Finished processing date: {date}")
 
     def generate_update_plan_with_conditions(self):
         """
@@ -294,29 +298,33 @@ class DataWrapper:
             within_history = history_start_date <= current_date <= today
             missing_file = not file_exists and not self.ignore_missing
             category = None
+            update_required = False
 
             # Hierarchy 1: Overwrite
             if self.overwrite:
                 category = "overwrite"
                 update_required = True
-            # Hierarchy 2: History threshold evaluation
+            elif missing_file and current_date < today:
+                category = "missing_files"
+                update_required = True
+
             elif within_history:
-                if self.date_utils.is_file_older_than(
+                if file_exists:
+                    if self.date_utils.is_file_older_than(
                         full_parquet_filename,
                         max_age_minutes=self.max_age_minutes,
                         fs=self.fs,
                         ignore_missing=self.ignore_missing,
                         verbose=self.verbose
-                ):
-                    category = "history_days"
-                    update_required = True
+                    ):
+                        category = "history_days"
+                        update_required = True
+                    else:
+                        category = "file is recent"
+                        update_required = False
                 else:
-                    category = "file is recent"
-                    update_required = False
-            # Hierarchy 3: Missing files
-            elif missing_file and current_date <= today:
-                category = "missing_files"
-                update_required = True
+                    category = "missing_files"
+                    update_required = True
             else:
                 category = "No Update Required"
                 update_required = False
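
The rewritten hierarchy sets update_required to a defensive default up front and now evaluates, in order: overwrite, then files missing for strictly past dates (previously current_date <= today), then the history window split on whether the file exists, with a final fall-through. A condensed restatement of the new decision logic; every parameter is an illustrative boolean, not the package's API:

    def classify(overwrite, missing_file, is_past, within_history,
                 file_exists, is_stale):
        # Returns (update_category, update_required) per the new hierarchy.
        if overwrite:
            return "overwrite", True
        if missing_file and is_past:
            return "missing_files", True
        if within_history:
            if file_exists:
                return ("history_days", True) if is_stale else ("file is recent", False)
            return "missing_files", True
        return "No Update Required", False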
@@ -334,12 +342,6 @@ class DataWrapper:
             }
             rows.append(row)
 
-
-        for row in rows:
-            category = row.get("update_category")
-            # Default to None if no category assigned (no update required)
-            row["update_priority"] = priority_map.get(category, 0)
-
         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
 
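With the post-pass over rows removed, update_priority must be populated when each row dict is built (the row construction closes just above this hunk and is unchanged here), presumably with the same priority_map.get(category, 0) lookup. A hypothetical inline version of that fold:

    priority_map = {"overwrite": 1, "missing_files": 2, "history_days": 3}


    def build_row(date, category, update_required):
        # Hypothetical helper: assign the priority while building the row
        # rather than patching it in a second pass; categories outside the
        # map default to 0, which process() filters out as "skip".
        return {
            "date": date,
            "update_category": category,
            "update_required": update_required,
            "update_priority": priority_map.get(category, 0),
        }

The priority_map values shown are invented for the example; only the get(category, 0) default is visible in the removed code.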
File without changes