sibi-dst 0.3.38__tar.gz → 0.3.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/data_wrapper.py +41 -38
  4. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/README.md +0 -0
  5. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/__init__.py +0 -0
  6. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/__init__.py +0 -0
  7. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/_df_helper.py +0 -0
  8. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  9. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  10. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/__init__.py +0 -0
  11. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  12. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  13. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  14. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  15. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  16. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  17. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  18. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  19. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  20. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  21. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  22. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  23. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  24. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  25. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  26. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  27. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/__init__.py +0 -0
  28. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_defaults.py +0 -0
  29. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  30. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_params_config.py +0 -0
  31. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_query_config.py +0 -0
  32. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/df_helper/data_cleaner.py +0 -0
  33. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/geopy_helper/__init__.py +0 -0
  34. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  35. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/geopy_helper/utils.py +0 -0
  36. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/__init__.py +0 -0
  37. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  38. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  39. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  40. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  41. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/utils.py +0 -0
  42. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  44. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/__init__.py +0 -0
  45. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/airflow_manager.py +0 -0
  46. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/clickhouse_writer.py +0 -0
  47. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/credentials.py +0 -0
  48. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/data_utils.py +0 -0
  49. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/date_utils.py +0 -0
  50. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/df_utils.py +0 -0
  51. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/file_utils.py +0 -0
  52. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/filepath_generator.py +0 -0
  53. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/log_utils.py +0 -0
  54. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/parquet_saver.py +0 -0
  55. {sibi_dst-0.3.38 → sibi_dst-0.3.39}/sibi_dst/utils/storage_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.38
3
+ Version: 0.3.39
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "0.3.38"
3
+ version = "0.3.39"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -98,7 +98,7 @@ class DataWrapper:
98
98
  self.parquet_filename = parquet_filename
99
99
  self.filesystem_type = filesystem_type
100
100
  self.filesystem_options = filesystem_options or {}
101
- self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
101
+ self.fs = fs
102
102
  self.verbose = verbose
103
103
  self.class_params = class_params or {}
104
104
  self.load_params = load_params or {}
@@ -116,6 +116,10 @@ class DataWrapper:
116
116
  self._lock = Lock()
117
117
  self.processed_dates = []
118
118
  self.date_utils = DateUtils(logger=self.logger)
119
+ if self.fs is None:
120
+ with self._lock:
121
+ if self.fs is None:
122
+ self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
119
123
 
120
124
  @staticmethod
121
125
  def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
@@ -172,21 +176,20 @@ class DataWrapper:
172
176
  # Each thread will handle all dates associated with that priority.
173
177
  def process_priority(priority):
174
178
  # Extract dates for the current priority
175
- with self._lock:
176
- dates_to_process = update_plan_table[
177
- update_plan_table["update_priority"] == priority
178
- ]["date"].tolist()
179
+ dates_to_process = update_plan_table[
180
+ update_plan_table["update_priority"] == priority
181
+ ]["date"].tolist()
179
182
 
180
- # If show_progress is True, wrap in a progress bar
181
- date_iterator = dates_to_process
182
- if self.show_progress:
183
- date_iterator = tqdm(date_iterator,
184
- desc=f"Processing priority {priority}:{self.dataclass.__name__}",
185
- unit="date")
183
+ # If show_progress is True, wrap in a progress bar
184
+ date_iterator = dates_to_process
185
+ if self.show_progress:
186
+ date_iterator = tqdm(date_iterator,
187
+ desc=f"Processing priority {priority}:{self.dataclass.__name__}",
188
+ unit="date")
186
189
 
187
- # Process each date for this priority
188
- for current_date in date_iterator:
189
- self.process_date(current_date)
190
+ # Process each date for this priority
191
+ for current_date in date_iterator:
192
+ self.process_date(current_date)
190
193
 
191
194
  # Launch a separate thread for each priority
192
195
  with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
@@ -232,21 +235,21 @@ class DataWrapper:
232
235
  :type date: datetime.date
233
236
  :return: None
234
237
  """
235
- with self._lock:
236
- folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
237
- full_parquet_filename = f"{folder}{self.parquet_filename}"
238
+ folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
239
+ full_parquet_filename = f"{folder}{self.parquet_filename}"
238
240
 
239
- start_time = datetime.datetime.now()
240
- self.logger.info(f"Processing date: {date}")
241
- self.logger.info(f"Processing {full_parquet_filename}...")
241
+ start_time = datetime.datetime.now()
242
+ self.logger.info(f"Processing date: {date}")
243
+ self.logger.info(f"Processing {full_parquet_filename}...")
242
244
 
243
- data_object = self.dataclass(**self.class_params)
244
- df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
245
+ data_object = self.dataclass(**self.class_params)
246
+ df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
245
247
 
246
- if len(df.index) == 0:
247
- self.logger.error("No data found for the specified date.")
248
- return
248
+ if len(df.index) == 0:
249
+ self.logger.error("No data found for the specified date.")
250
+ return
249
251
 
252
+ with self._lock:
250
253
  parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
251
254
  parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
252
255
 
@@ -255,8 +258,9 @@ class DataWrapper:
255
258
  self.logger.info(
256
259
  f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
257
260
  )
261
+
258
262
  self.processed_dates.append(date)
259
- self.logger.info(f"Finished processing date: {date}")
263
+ self.logger.info(f"Finished processing date: {date}")
260
264
 
261
265
  def generate_update_plan_with_conditions(self):
262
266
  """
@@ -301,18 +305,23 @@ class DataWrapper:
301
305
  update_required = True
302
306
  # Hierarchy 2: History threshold evaluation
303
307
  elif within_history:
304
- if self.date_utils.is_file_older_than(
308
+ if file_exists:
309
+ if self.date_utils.is_file_older_than(
305
310
  full_parquet_filename,
306
311
  max_age_minutes=self.max_age_minutes,
307
312
  fs=self.fs,
308
313
  ignore_missing=self.ignore_missing,
309
314
  verbose=self.verbose
310
- ):
311
- category = "history_days"
312
- update_required = True
315
+ ):
316
+ category = "history_days"
317
+ update_required = True
318
+ else:
319
+ category = "file is recent"
320
+ update_required = False
313
321
  else:
314
- category = "file is recent"
315
- update_required = False
322
+ category = "missing_files"
323
+ update_required = True
324
+
316
325
  # Hierarchy 3: Missing files
317
326
  elif missing_file and current_date <= today:
318
327
  category = "missing_files"
@@ -334,12 +343,6 @@ class DataWrapper:
334
343
  }
335
344
  rows.append(row)
336
345
 
337
-
338
- for row in rows:
339
- category = row.get("update_category")
340
- # Default to None if no category assigned (no update required)
341
- row["update_priority"] = priority_map.get(category, 0)
342
-
343
346
  update_plan_table = pd.DataFrame(rows)
344
347
  return update_plan_table
345
348
 
File without changes