sibi-dst 0.3.38__tar.gz → 0.3.40__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/PKG-INFO +1 -1
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/pyproject.toml +1 -1
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/data_wrapper.py +51 -49
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/README.md +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/storage_manager.py +0 -0
@@ -91,14 +91,14 @@ class DataWrapper:
|
|
91
91
|
max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
|
92
92
|
history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
|
93
93
|
show_progress: bool = False,
|
94
|
-
timeout: float =
|
94
|
+
timeout: float = 60):
|
95
95
|
self.dataclass = dataclass
|
96
96
|
self.date_field = date_field
|
97
97
|
self.data_path = self.ensure_forward_slash(data_path)
|
98
98
|
self.parquet_filename = parquet_filename
|
99
99
|
self.filesystem_type = filesystem_type
|
100
100
|
self.filesystem_options = filesystem_options or {}
|
101
|
-
self.fs = fs
|
101
|
+
self.fs = fs
|
102
102
|
self.verbose = verbose
|
103
103
|
self.class_params = class_params or {}
|
104
104
|
self.load_params = load_params or {}
|
@@ -116,6 +116,10 @@ class DataWrapper:
|
|
116
116
|
self._lock = Lock()
|
117
117
|
self.processed_dates = []
|
118
118
|
self.date_utils = DateUtils(logger=self.logger)
|
119
|
+
if self.fs is None:
|
120
|
+
with self._lock:
|
121
|
+
if self.fs is None:
|
122
|
+
self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
|
119
123
|
|
120
124
|
@staticmethod
|
121
125
|
def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
|
@@ -154,16 +158,16 @@ class DataWrapper:
|
|
154
158
|
"""
|
155
159
|
update_plan_table = self.generate_update_plan_with_conditions()
|
156
160
|
|
157
|
-
# Display the update plan table to the user if requested
|
158
|
-
if self.show_progress:
|
159
|
-
display(update_plan_table)
|
160
|
-
|
161
161
|
# Filter out rows that do not require updates (priority 0 means skip)
|
162
162
|
with self._lock:
|
163
163
|
update_plan_table = update_plan_table[
|
164
164
|
(update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
|
165
165
|
]
|
166
|
-
|
166
|
+
# Display the update plan table to the user if requested
|
167
|
+
if len(update_plan_table.index) == 0:
|
168
|
+
return
|
169
|
+
if self.show_progress:
|
170
|
+
display(update_plan_table)
|
167
171
|
# Group by priority
|
168
172
|
with self._lock:
|
169
173
|
priorities = sorted(update_plan_table["update_priority"].unique())
|
@@ -172,21 +176,20 @@ class DataWrapper:
|
|
172
176
|
# Each thread will handle all dates associated with that priority.
|
173
177
|
def process_priority(priority):
|
174
178
|
# Extract dates for the current priority
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
]["date"].tolist()
|
179
|
+
dates_to_process = update_plan_table[
|
180
|
+
update_plan_table["update_priority"] == priority
|
181
|
+
]["date"].tolist()
|
179
182
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
183
|
+
# If show_progress is True, wrap in a progress bar
|
184
|
+
date_iterator = dates_to_process
|
185
|
+
if self.show_progress:
|
186
|
+
date_iterator = tqdm(date_iterator,
|
187
|
+
desc=f"Processing priority {priority}:{self.dataclass.__name__}",
|
188
|
+
unit="date")
|
186
189
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
+
# Process each date for this priority
|
191
|
+
for current_date in date_iterator:
|
192
|
+
self.process_date(current_date)
|
190
193
|
|
191
194
|
# Launch a separate thread for each priority
|
192
195
|
with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
|
@@ -232,21 +235,21 @@ class DataWrapper:
|
|
232
235
|
:type date: datetime.date
|
233
236
|
:return: None
|
234
237
|
"""
|
235
|
-
|
236
|
-
|
237
|
-
full_parquet_filename = f"{folder}{self.parquet_filename}"
|
238
|
+
folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
|
239
|
+
full_parquet_filename = f"{folder}{self.parquet_filename}"
|
238
240
|
|
239
|
-
|
240
|
-
|
241
|
-
|
241
|
+
start_time = datetime.datetime.now()
|
242
|
+
self.logger.info(f"Processing date: {date}")
|
243
|
+
self.logger.info(f"Processing {full_parquet_filename}...")
|
242
244
|
|
243
|
-
|
244
|
-
|
245
|
+
data_object = self.dataclass(**self.class_params)
|
246
|
+
df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
|
245
247
|
|
246
|
-
|
247
|
-
|
248
|
-
|
248
|
+
if len(df.index) == 0:
|
249
|
+
self.logger.error("No data found for the specified date.")
|
250
|
+
return
|
249
251
|
|
252
|
+
with self._lock:
|
250
253
|
parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
|
251
254
|
parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
|
252
255
|
|
@@ -255,8 +258,9 @@ class DataWrapper:
|
|
255
258
|
self.logger.info(
|
256
259
|
f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
|
257
260
|
)
|
261
|
+
|
258
262
|
self.processed_dates.append(date)
|
259
|
-
|
263
|
+
self.logger.info(f"Finished processing date: {date}")
|
260
264
|
|
261
265
|
def generate_update_plan_with_conditions(self):
|
262
266
|
"""
|
@@ -294,29 +298,33 @@ class DataWrapper:
|
|
294
298
|
within_history = history_start_date <= current_date <= today
|
295
299
|
missing_file = not file_exists and not self.ignore_missing
|
296
300
|
category = None
|
301
|
+
update_required = False
|
297
302
|
|
298
303
|
# Hierarchy 1: Overwrite
|
299
304
|
if self.overwrite:
|
300
305
|
category = "overwrite"
|
301
306
|
update_required = True
|
302
|
-
|
307
|
+
elif missing_file and current_date < today:
|
308
|
+
category = "missing_files"
|
309
|
+
update_required = True
|
310
|
+
|
303
311
|
elif within_history:
|
304
|
-
if
|
312
|
+
if file_exists:
|
313
|
+
if self.date_utils.is_file_older_than(
|
305
314
|
full_parquet_filename,
|
306
315
|
max_age_minutes=self.max_age_minutes,
|
307
316
|
fs=self.fs,
|
308
317
|
ignore_missing=self.ignore_missing,
|
309
318
|
verbose=self.verbose
|
310
|
-
|
311
|
-
|
312
|
-
|
319
|
+
):
|
320
|
+
category = "history_days"
|
321
|
+
update_required = True
|
322
|
+
else:
|
323
|
+
category = "file is recent"
|
324
|
+
update_required = False
|
313
325
|
else:
|
314
|
-
category = "
|
315
|
-
update_required =
|
316
|
-
# Hierarchy 3: Missing files
|
317
|
-
elif missing_file and current_date <= today:
|
318
|
-
category = "missing_files"
|
319
|
-
update_required = True
|
326
|
+
category = "missing_files"
|
327
|
+
update_required = True
|
320
328
|
else:
|
321
329
|
category = "No Update Required"
|
322
330
|
update_required = False
|
@@ -334,12 +342,6 @@ class DataWrapper:
|
|
334
342
|
}
|
335
343
|
rows.append(row)
|
336
344
|
|
337
|
-
|
338
|
-
for row in rows:
|
339
|
-
category = row.get("update_category")
|
340
|
-
# Default to None if no category assigned (no update required)
|
341
|
-
row["update_priority"] = priority_map.get(category, 0)
|
342
|
-
|
343
345
|
update_plan_table = pd.DataFrame(rows)
|
344
346
|
return update_plan_table
|
345
347
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
File without changes
|
{sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|