sibi-dst 0.3.40__py3-none-any.whl → 0.3.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sibi_dst/df_helper/__init__.py +2 -0
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +262 -0
  3. sibi_dst/df_helper/_df_helper.py +5 -2
  4. sibi_dst/df_helper/_parquet_artifact.py +8 -2
  5. sibi_dst/df_helper/_parquet_reader.py +5 -1
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +1 -0
  7. sibi_dst/osmnx_helper/__init__.py +2 -2
  8. sibi_dst/osmnx_helper/v1/basemaps/__init__.py +0 -0
  9. sibi_dst/osmnx_helper/{basemaps → v1/basemaps}/router_plotter.py +85 -30
  10. sibi_dst/osmnx_helper/v2/__init__.py +0 -0
  11. sibi_dst/osmnx_helper/v2/base_osm_map.py +153 -0
  12. sibi_dst/osmnx_helper/v2/basemaps/__init__.py +0 -0
  13. sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
  14. sibi_dst/utils/__init__.py +3 -0
  15. sibi_dst/utils/data_utils.py +66 -25
  16. sibi_dst/utils/data_wrapper.py +222 -285
  17. sibi_dst/utils/date_utils.py +118 -113
  18. sibi_dst/utils/df_utils.py +7 -0
  19. sibi_dst/utils/log_utils.py +57 -18
  20. sibi_dst/utils/parquet_saver.py +4 -2
  21. sibi_dst/utils/phone_formatter.py +127 -0
  22. sibi_dst/utils/storage_manager.py +14 -7
  23. sibi_dst-0.3.43.dist-info/METADATA +194 -0
  24. {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.43.dist-info}/RECORD +29 -22
  25. sibi_dst-0.3.40.dist-info/METADATA +0 -62
  26. /sibi_dst/osmnx_helper/{basemaps → v1}/__init__.py +0 -0
  27. /sibi_dst/osmnx_helper/{base_osm_map.py → v1/base_osm_map.py} +0 -0
  28. /sibi_dst/osmnx_helper/{basemaps → v1/basemaps}/calendar_html.py +0 -0
  29. /sibi_dst/osmnx_helper/{utils.py → v1/utils.py} +0 -0
  30. {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.43.dist-info}/WHEEL +0 -0
@@ -1,73 +1,24 @@
  import datetime
- from concurrent.futures import ThreadPoolExecutor
- from typing import Type, Any, Dict, Optional, Union
+ import logging
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Type, Any, Dict, Optional, Union, List, Tuple

  from threading import Lock
  import fsspec
  import pandas as pd
  from IPython.display import display
  from tqdm import tqdm

- from sibi_dst.utils import Logger, DateUtils
- from sibi_dst.utils import ParquetSaver
+ from sibi_dst.utils import Logger, FileAgeChecker, ParquetSaver


  class DataWrapper:
-     """
-     Utility class for handling file-based operations, including processing and saving data
-     in Parquet format, while managing a hierarchy of conditions such as overwrite, history
-     threshold, and missing file detection.
-
-     This class aims to simplify the process of managing large datasets stored in a filesystem.
-     It allows for controlled updates to data files based on parameters set by the user, with
-     support for different filesystem types and options.
-
-     It also provides features like logging actions, managing processing threads, generating
-     update plans, checking file age, and dynamically creating date ranges for data operations.
-
-     The design supports flexible integration with user-defined classes (dataclasses) to define
-     custom loading and processing behavior.
-
-     :ivar dataclass: The user-defined class for data processing.
-     :type dataclass: Type
-     :ivar date_field: The name of the date field in the user-defined class.
-     :type date_field: str
-     :ivar data_path: Base path for the dataset storage.
-     :type data_path: str
-     :ivar parquet_filename: File name for the Parquet file.
-     :type parquet_filename: str
-     :ivar start_date: Start date for processing.
-     :type start_date: datetime.date
-     :ivar end_date: End date for processing.
-     :type end_date: datetime.date
-     :ivar fs: File system object for managing files.
-     :type fs: Optional[fsspec.AbstractFileSystem]
-     :ivar filesystem_type: Type of the filesystem (e.g., "file", "s3").
-     :type filesystem_type: str
-     :ivar filesystem_options: Additional options for initializing the filesystem.
-     :type filesystem_options: Optional[Dict]
-     :ivar verbose: Flag to enable verbose logging.
-     :type verbose: bool
-     :ivar class_params: Parameters to initialize the dataclass.
-     :type class_params: Optional[Dict]
-     :ivar load_params: Additional parameters for loading functions.
-     :type load_params: Optional[Dict]
-     :ivar reverse_order: Flag to reverse the order of date range generation.
-     :type reverse_order: bool
-     :ivar overwrite: Whether to overwrite all files during processing.
-     :type overwrite: bool
-     :ivar ignore_missing: Whether to ignore missing files.
-     :type ignore_missing: bool
-     :ivar logger: Logger instance for logging information.
-     :type logger: Optional[Logger]
-     :ivar max_age_minutes: Maximum file age threshold in minutes.
-     :type max_age_minutes: int
-     :ivar history_days_threshold: Number of days for the history threshold.
-     :type history_days_threshold: int
-     :ivar show_progress: Flag to enable progress display.
-     :type show_progress: bool
-     :ivar timeout: Timeout in seconds for processing tasks with threads.
-     :type timeout: Optional[int]
-     """
+     DEFAULT_PRIORITY_MAP = {
+         "overwrite": 1,
+         "missing_in_history": 2,
+         "existing_but_stale": 3,
+         "missing_outside_history": 4,
+         "file_is_recent": 0
+     }
      DEFAULT_MAX_AGE_MINUTES = 1440
      DEFAULT_HISTORY_DAYS_THRESHOLD = 30
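The verbose reStructuredText docstring is superseded by a class-level priority map: lower numbers run earlier in `process()`, and `file_is_recent` rows are never queued because their `update_required` flag stays `False` (see `_create_plan_row` further down). A minimal standalone sketch of that ordering, not library code:

```python
# Sketch only: reproduces the ordering semantics of DEFAULT_PRIORITY_MAP.
priority_map = {
    "overwrite": 1,
    "missing_in_history": 2,
    "existing_but_stale": 3,
    "missing_outside_history": 4,
    "file_is_recent": 0,  # never processed; filtered out via update_required
}
# process() walks groups in ascending priority order:
for category in sorted(priority_map, key=priority_map.get):
    print(priority_map[category], category)
```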
 
@@ -81,6 +32,7 @@ class DataWrapper:
                   fs: Optional[fsspec.AbstractFileSystem] = None,
                   filesystem_type: str = "file",
                   filesystem_options: Optional[Dict] = None,
+                  debug: bool = False,
                   verbose: bool = False,
                   class_params: Optional[Dict] = None,
                   load_params: Optional[Dict] = None,
@@ -91,14 +43,17 @@ class DataWrapper:
                   max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                   history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                   show_progress: bool = False,
-                  timeout: float = 60):
+                  timeout: float = 60,
+                  reference_date: datetime.date = None,
+                  custom_priority_map: Dict[str, int] = None):
          self.dataclass = dataclass
          self.date_field = date_field
-         self.data_path = self.ensure_forward_slash(data_path)
+         self.data_path = self._ensure_forward_slash(data_path)
          self.parquet_filename = parquet_filename
          self.filesystem_type = filesystem_type
          self.filesystem_options = filesystem_options or {}
-         self.fs = fs
+         self.fs = fs or self._init_filesystem()
+         self.debug = debug
          self.verbose = verbose
          self.class_params = class_params or {}
          self.load_params = load_params or {}
@@ -106,23 +61,26 @@ class DataWrapper:
          self.overwrite = overwrite
          self.ignore_missing = ignore_missing
          self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
+         self.logger.set_level(logging.DEBUG if debug else logging.INFO)
          self.max_age_minutes = max_age_minutes
          self.history_days_threshold = history_days_threshold
          self.show_progress = show_progress
          self.timeout = timeout
+         self.reference_date = reference_date or datetime.date.today()
+         self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP

-         self.start_date = self.convert_to_date(start_date)
-         self.end_date = self.convert_to_date(end_date)
+         self.start_date = self._convert_to_date(start_date)
+         self.end_date = self._convert_to_date(end_date)
          self._lock = Lock()
          self.processed_dates = []
-         self.date_utils = DateUtils(logger=self.logger)
-         if self.fs is None:
-             with self._lock:
-                 if self.fs is None:
-                     self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+         self.age_checker = FileAgeChecker(logger=self.logger)
+
+     def _init_filesystem(self) -> fsspec.AbstractFileSystem:
+         with self._lock:
+             return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)

      @staticmethod
-     def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
+     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
          if isinstance(date, datetime.date):
              return date
          try:
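Taken together, the constructor changes give callers three new levers: `debug` (switches the logger to DEBUG level), `reference_date` (anchors the history window instead of implicitly using today), and `custom_priority_map`. A hypothetical usage sketch; `EventsData` is a placeholder dataclass and the availability of `DataWrapper` as a top-level import is an assumption, while the parameter names come from the diff above:

```python
import datetime

# Hypothetical: EventsData must expose load_period(dt_field=..., start=..., end=...).
wrapper = DataWrapper(
    dataclass=EventsData,                        # placeholder, not part of the package
    date_field="created_at",
    data_path="s3://bucket/events",              # _ensure_forward_slash appends the "/"
    parquet_filename="events.parquet",
    start_date="2025-01-01",
    end_date="2025-01-31",
    filesystem_type="s3",
    debug=True,                                  # logger level becomes logging.DEBUG
    reference_date=datetime.date(2025, 1, 31),   # anchors the 30-day history window
)
```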
@@ -131,234 +89,213 @@ class DataWrapper:
              raise ValueError(f"Error converting {date} to datetime: {e}")

      @staticmethod
-     def ensure_forward_slash(path: str) -> str:
-         return path if path.endswith('/') else path + '/'
+     def _ensure_forward_slash(path: str) -> str:
+         return path.rstrip('/') + '/'
+
+     def generate_date_range(self) -> List[datetime.date]:
+         """Generate ordered date range with future date handling"""
+         date_range = pd.date_range(
+             start=self.start_date,
+             end=self.end_date,
+             freq='D'
+         ).date.tolist()

-     def generate_date_range(self):
-         """Generate a range of dates between start_date and end_date."""
-         date_range = pd.date_range(self.start_date, self.end_date, freq='D')
          if self.reverse_order:
-             date_range = date_range[::-1]
-         for date in date_range:
-             yield date.date()
+             date_range.reverse()
+
+         return [
+             d for d in date_range
+             if d <= self.reference_date or self.overwrite
+         ]
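The rewritten `generate_date_range` returns an eager list instead of a generator and drops dates after `reference_date` unless `overwrite` is set. A runnable sketch of the same rule with illustrative values:

```python
import datetime
import pandas as pd

# Standalone illustration of the new date-range rule (values are made up).
start, end = datetime.date(2025, 1, 1), datetime.date(2025, 1, 10)
reference_date = datetime.date(2025, 1, 5)
overwrite = False

dates = pd.date_range(start=start, end=end, freq='D').date.tolist()
dates = [d for d in dates if d <= reference_date or overwrite]
print(dates[0], dates[-1])  # 2025-01-01 2025-01-05: future dates were dropped
```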
 
      def process(self, max_retries: int = 3):
-         """
-         Processes update tasks by generating an update plan, filtering required updates, and distributing
-         the workload across threads based on priority levels.
-
-         This method operates by assessing required updates through generated conditions,
-         grouping them by priority levels, and processing them in parallel threads.
-         Each thread handles the updates for a specific priority level, ensuring a streamlined approach
-         to handling the updates efficiently.
-
-         :param max_retries: Maximum number of retries for a task after a timeout. Defaults to 3.
-         :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
-         :return: None
-         """
-         update_plan_table = self.generate_update_plan_with_conditions()
-
-         # Filter out rows that do not require updates (priority 0 means skip)
-         with self._lock:
-             update_plan_table = update_plan_table[
-                 (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
-             ]
-         # Display the update plan table to the user if requested
-         if len(update_plan_table.index) == 0:
+         """Process updates with priority-based execution and retries"""
+         update_plan = self.generate_update_plan()
+
+         if update_plan.empty:
+             self.logger.info("No updates required")
              return
+         # Filter for required updates first
+         update_plan = update_plan[update_plan["update_required"] == True]
+
          if self.show_progress:
-             display(update_plan_table)
-         # Group by priority
-         with self._lock:
-             priorities = sorted(update_plan_table["update_priority"].unique())
-
-         # We will process each priority level in its own thread.
-         # Each thread will handle all dates associated with that priority.
-         def process_priority(priority):
-             # Extract dates for the current priority
-             dates_to_process = update_plan_table[
-                 update_plan_table["update_priority"] == priority
-             ]["date"].tolist()
-
-             # If show_progress is True, wrap in a progress bar
-             date_iterator = dates_to_process
-             if self.show_progress:
-                 date_iterator = tqdm(date_iterator,
-                                      desc=f"Processing priority {priority}:{self.dataclass.__name__}",
-                                      unit="date")
-
-             # Process each date for this priority
-             for current_date in date_iterator:
-                 self.process_date(current_date)
-
-         # Launch a separate thread for each priority
-         with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
-             futures = {executor.submit(process_priority, p): p for p in priorities}
-             retries = {p: 0 for p in priorities} # Track retry counts for each priority
-
-             while futures:
-                 for future in list(futures.keys()):
-                     try:
-                         future.result(timeout=self.timeout)
-                         del futures[future] # Remove completed future
-                     except TimeoutError:
-                         priority = futures[future]
-                         retries[priority] += 1
-
-                         if retries[priority] <= max_retries:
-                             self.logger.warning(
-                                 f"Thread for priority {priority} timed out. Retrying ({retries[priority]}/{max_retries})..."
-                             )
-                             new_future = executor.submit(process_priority, priority)
-                             futures[new_future] = priority
-                         else:
-                             self.logger.error(
-                                 f"Thread for priority {priority} timed out. Max retries ({max_retries}) exceeded. Skipping."
-                             )
-                             del futures[future] # Remove the timed-out future
-                     except Exception as e:
-                         self.logger.error(f"Error processing priority {futures[future]}: {e}")
-                         del futures[future] # Remove the failed future
-
-     def process_date(self, date: datetime.date):
-         """
-         Processes data for a given date and saves it as a Parquet file.
-
-         This method processes data for the specified date by loading the data
-         corresponding to that day, saving it into a structured storage format
-         (Parquet), and logging relevant information such as processing time
-         and errors that may occur during the process. It uses provided
-         dataclass and parameters to operate and ensures the data is stored
-         in a structured folder hierarchy.
-
-         :param date: The specific date for which data processing and saving should occur
-         :type date: datetime.date
-         :return: None
-         """
-         folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
-         full_parquet_filename = f"{folder}{self.parquet_filename}"
+             #display(self._enhanced_display_table(update_plan))
+             display(update_plan)
+
+         for priority in sorted(update_plan["update_priority"].unique()):
+             self._process_priority_group(update_plan, priority, max_retries)
+
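`process()` now filters and groups the plan instead of spawning one long-lived thread per priority. A toy illustration of that scheduling in plain pandas, not library code:

```python
import pandas as pd

# Rows that need no update are dropped first, then groups run in ascending
# priority order. Data here is fabricated for illustration.
plan = pd.DataFrame({
    "date": ["2025-01-01", "2025-01-02", "2025-01-03"],
    "update_required": [True, False, True],
    "update_priority": [2, 0, 1],
})
plan = plan[plan["update_required"] == True]
for priority in sorted(plan["update_priority"].unique()):
    print(priority, plan[plan["update_priority"] == priority]["date"].tolist())
# 1 ['2025-01-03']
# 2 ['2025-01-01']
```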
+     def _process_priority_group(self,
+                                 update_plan: pd.DataFrame,
+                                 priority: int,
+                                 max_retries: int):
+         """Process a single priority group with parallel execution"""
+         dates = update_plan[update_plan["update_priority"] == priority]["date"].tolist()
+         if not dates:
+             return

-         start_time = datetime.datetime.now()
-         self.logger.info(f"Processing date: {date}")
-         self.logger.info(f"Processing {full_parquet_filename}...")
+         desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
+         self.logger.info(f"Starting {desc.lower()}")

-         data_object = self.dataclass(**self.class_params)
-         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
+         with ThreadPoolExecutor() as executor:
+             futures = {
+                 executor.submit(self._process_date_with_retry, date, max_retries): date
+                 for date in dates
+             }

-         if len(df.index) == 0:
-             self.logger.error("No data found for the specified date.")
-             return
+             for future in tqdm(as_completed(futures),
+                                total=len(futures),
+                                desc=desc,
+                                disable=not self.show_progress):
+                 date = futures[future]
+                 try:
+                     future.result(timeout=self.timeout)
+                 except Exception as e:
+                     self.logger.error(f"Permanent failure processing {date}: {str(e)}")
+
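One subtlety in `_process_priority_group`: `as_completed` only yields futures that are already finished, so `future.result(timeout=self.timeout)` cannot actually block at that point; per-date retries happen inside `_process_date_with_retry`. A wall-clock bound on the whole group would instead go on `as_completed` itself, as in this standalone sketch:

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Sketch: bounding total wait with as_completed(timeout=...), which raises
# concurrent.futures.TimeoutError if the deadline passes before all finish.
with ThreadPoolExecutor() as pool:
    futures = {pool.submit(time.sleep, 0.1): i for i in range(3)}
    for fut in as_completed(futures, timeout=5):
        fut.result()  # future is already done here; this returns immediately
```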
+     def _priority_label(self, priority: int) -> str:
+         """Get human-readable label for priority level"""
+         return next(
+             (k for k, v in self.priority_map.items() if v == priority),
+             f"Unknown Priority {priority}"
+         )
+
+     def _enhanced_display_table(self, df: pd.DataFrame) -> pd.DataFrame.style:
+         """Format the update plan table for better readability"""
+         return df.style \
+             .bar(subset=["file_age_minutes"], color="#5fba7d") \
+             .background_gradient(subset=["update_priority"], cmap="YlOrBr") \
+             .set_caption(f"Update Plan: {self.dataclass.__name__}")
+
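`_enhanced_display_table` returns a pandas Styler (the `pd.DataFrame.style` annotation resolves to a property object, which Python accepts but type checkers will not) and is only referenced by the commented-out call in `process()` above. Re-enabling it in a notebook might look like this sketch, assuming a constructed `wrapper`:

```python
from IPython.display import display

# Sketch: `wrapper` is assumed to be a DataWrapper instance.
# Note: Styler.background_gradient requires matplotlib to be installed.
plan = wrapper.generate_update_plan()
display(wrapper._enhanced_display_table(plan))
```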
+     def generate_update_plan(self) -> pd.DataFrame:
+         """Generate update plan with parallel file status checks"""
+         dates = self.generate_date_range()
+         history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
+         rows = []

-         with self._lock:
-             parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
-             parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
+         with ThreadPoolExecutor() as executor:
+             future_to_date = {
+                 executor.submit(self._get_file_status, date): date
+                 for date in dates
+             }
+
+             for future in tqdm(as_completed(future_to_date),
+                                total=len(future_to_date),
+                                desc=f"Analyzing files for {self.dataclass.__name__} ",
+                                disable=not self.show_progress):
+                 current_date = future_to_date[future]
+                 file_exists, file_age = future.result()
+                 rows.append(self._create_plan_row(
+                     current_date,
+                     history_start,
+                     file_exists,
+                     file_age
+                 ))
+
+         return pd.DataFrame(rows).sort_values("update_priority")
+
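Because `generate_update_plan` returns one row per date, the plan can be inspected or filtered before committing to `process()`. A sketch, again assuming a constructed `wrapper`:

```python
# Sketch: the plan is a plain DataFrame; column order follows _create_plan_row.
plan = wrapper.generate_update_plan()
print(plan.columns.tolist())
# ['date', 'file_exists', 'file_age_minutes', 'age_threshold', 'within_history',
#  'ignore_missing', 'update_category', 'update_priority', 'update_required', 'class']
stale = plan[plan["update_category"] == "existing_but_stale"]
```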
+     def _get_file_status(self, date: datetime.date) -> Tuple[bool, float]:
+         """Get file existence and age with error handling"""
+         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.parquet_filename}"
+         try:
+             exists = self.fs.exists(path)
+             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
+             return exists, age
+         except Exception as e:
+             self.logger.warning(f"Error checking {path}: {str(e)}")
+             return False, None
+
+     def _create_plan_row(self,
+                          date: datetime.date,
+                          history_start: datetime.date,
+                          file_exists: bool,
+                          file_age: float) -> dict:
+         """Create a row for the update plan DataFrame"""
+         within_history = history_start <= date <= self.reference_date
+         category, update_required = "file_is_recent", False
+
+         if self.overwrite:
+             category, update_required = "overwrite", True
+         elif within_history:
+             if not file_exists:
+                 category, update_required = "missing_in_history", True
+             elif file_age > self.max_age_minutes:
+                 category, update_required = "existing_but_stale", True
+         elif not file_exists and not self.ignore_missing:
+             category, update_required = "missing_outside_history", True
+
+         return {
+             "date": date,
+             "file_exists": file_exists,
+             "file_age_minutes": file_age,
+             "age_threshold": self.max_age_minutes,
+             "within_history": within_history,
+             "ignore_missing": self.ignore_missing,
+             "update_category": category,
+             "update_priority": self.priority_map[category],
+             "update_required": update_required,
+             "class": self.dataclass.__name__
+         }
+
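The decision ladder in `_create_plan_row` is worth a worked example. This standalone rework mirrors the diff's logic under the default thresholds (a 30-day history window and a 1440-minute max age); the function and values here are illustrative, not library code:

```python
import datetime

def categorize(date, reference_date, file_exists, file_age,
               overwrite=False, ignore_missing=False,
               history_days=30, max_age_minutes=1440):
    # Mirrors _create_plan_row: overwrite wins, then the history window,
    # then missing-outside-history; everything else is "file_is_recent".
    history_start = reference_date - datetime.timedelta(days=history_days)
    within_history = history_start <= date <= reference_date
    if overwrite:
        return "overwrite"
    if within_history:
        if not file_exists:
            return "missing_in_history"
        if file_age > max_age_minutes:
            return "existing_but_stale"
    elif not file_exists and not ignore_missing:
        return "missing_outside_history"
    return "file_is_recent"

today = datetime.date(2025, 1, 31)
print(categorize(datetime.date(2025, 1, 20), today, True, 2000))   # existing_but_stale
print(categorize(datetime.date(2024, 11, 1), today, False, None))  # missing_outside_history
```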
+     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
+         """Process a date with retry logic"""
+         for attempt in range(1, max_retries + 1):
+             try:
+                 self._process_single_date(date)
+                 return
+             except Exception as e:
+                 if attempt < max_retries:
+                     self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {str(e)}")
+                 else:
+                     raise RuntimeError(f"Failed processing {date} after {max_retries} attempts") from e
+
+     def _process_single_date(self, date: datetime.date):
+         """Core date processing logic"""
+         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+         full_path = f"{path}{self.parquet_filename}"

-         end_time = datetime.datetime.now()
-         duration_seconds = (end_time - start_time).total_seconds()
-         self.logger.info(
-             f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+         self.logger.info(f"Processing {date} ({full_path})")
+         start_time = datetime.datetime.now()
+
+         try:
+             self.logger.debug(f"Class Params: {self.class_params}")
+             self.logger.debug(f"Load Params: {self.load_params}")
+             data = self.dataclass(**self.class_params)
+             df = data.load_period(
+                 dt_field=self.date_field,
+                 start=date,
+                 end=date,
+                 **self.load_params
          )

-         self.processed_dates.append(date)
-         self.logger.info(f"Finished processing date: {date}")
+             if len(df.index)==0:
+                 self.logger.warning(f"No data found for {date}")
+                 return

-     def generate_update_plan_with_conditions(self):
-         """
-         Generates an update plan for data files based on specific conditions. The function evaluates the need for updating or
-         overwriting data files for a given date range. Conditions include file existence, whether the file falls within a
-         specified historical threshold, and the necessity to overwrite or handle missing files. A priority map is utilized to
-         assign priority levels to update categories.
+             with self._lock:
+                 ParquetSaver(
+                     df_result=df,
+                     parquet_storage_path=path,
+                     fs=self.fs,
+                     logger=self.logger
+                 ).save_to_parquet(self.parquet_filename)

-         :raises FileNotFoundError: If any file is referenced that does not exist and the ``ignore_missing`` property is set to False.
-         :raises AttributeError: If any required attribute like ``fs``, ``dataclass``, or others are not properly set or initialized.
+             duration = (datetime.datetime.now() - start_time).total_seconds()
+             self._log_success(date, duration, full_path)
+
+         except Exception as e:
+             self._log_failure(date, e)
+             raise
+
+     def _log_success(self, date: datetime.date, duration: float, path: str):
+         """Handle successful processing logging"""
+         msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
+         self.logger.info(msg)
+         self.processed_dates.append(date)
+
+     def _log_failure(self, date: datetime.date, error: Exception):
+         """Handle error logging"""
+         msg = f"Failed processing {date}: {str(error)}"
+         self.logger.error(msg)

-         :return: A Pandas DataFrame representing the update plan, where each row contains information about a date, the conditions
-         evaluated for that date, and the determined update priority.
-         :rtype: pandas.DataFrame
-         """
-         rows = []

-         today = datetime.date.today()
-         history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
-         priority_map = {
-             "file is recent":0,
-             "overwrite": 1,
-             "history_days": 2,
-             "missing_files": 3
-         }
-         date_range = self.generate_date_range()
-         if self.show_progress:
-             date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
-
-         for current_date in date_range:
-             folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
-             full_parquet_filename = f"{folder}{self.parquet_filename}"
-
-             file_exists = self.fs.exists(full_parquet_filename)
-             within_history = history_start_date <= current_date <= today
-             missing_file = not file_exists and not self.ignore_missing
-             category = None
-             update_required = False
-
-             # Hierarchy 1: Overwrite
-             if self.overwrite:
-                 category = "overwrite"
-                 update_required = True
-             elif missing_file and current_date < today:
-                 category = "missing_files"
-                 update_required = True
-
-             elif within_history:
-                 if file_exists:
-                     if self.date_utils.is_file_older_than(
-                         full_parquet_filename,
-                         max_age_minutes=self.max_age_minutes,
-                         fs=self.fs,
-                         ignore_missing=self.ignore_missing,
-                         verbose=self.verbose
-                     ):
-                         category = "history_days"
-                         update_required = True
-                     else:
-                         category = "file is recent"
-                         update_required = False
-                 else:
-                     category = "missing_files"
-                     update_required = True
-             else:
-                 category = "No Update Required"
-                 update_required = False
-
-             # Collect condition descriptions for the update plan table
-             row = {
-                 "date": current_date,
-                 "file_exists": file_exists,
-                 "within_history": within_history,
-                 "missing_file": missing_file,
-                 "update_required": update_required,
-                 "update_category": category,
-                 "datawrapper class": self.dataclass.__name__,
-                 "update_priority": priority_map.get(category, 0)
-             }
-             rows.append(row)
-
-         update_plan_table = pd.DataFrame(rows)
-         return update_plan_table
-
- # # wrapper.process()
- # # wrapper = DataWrapper(
- # #     dataclass=YourDataClass,
- # #     date_field="created_at",
- # #     data_path="s3://your-bucket-name/path/to/data",
- # #     parquet_filename="data.parquet",
- # #     start_date="2022-01-01",
- # #     end_date="2022-12-31",
- # #     filesystem_type="s3",
- # #     filesystem_options={
- # #         "key": "your_aws_access_key",
- # #         "secret": "your_aws_secret_key",
- # #         "client_kwargs": {"endpoint_url": "https://s3.amazonaws.com"}
- # #     },
- # #     verbose=True
- # #)
- # #wrapper.process()
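The commented-out sample deleted above still describes the intended call pattern. Here is the same example updated for the 0.3.43 signature, as a hedged sketch: `YourDataClass` and the credential values are placeholders, and the import path assumes `DataWrapper` is exported from `sibi_dst.utils`:

```python
from sibi_dst.utils import DataWrapper  # assumed export location

wrapper = DataWrapper(
    dataclass=YourDataClass,            # placeholder; must provide load_period(dt_field=..., start=..., end=...)
    date_field="created_at",
    data_path="s3://your-bucket-name/path/to/data",
    parquet_filename="data.parquet",
    start_date="2022-01-01",
    end_date="2022-12-31",
    filesystem_type="s3",
    filesystem_options={
        "key": "your_aws_access_key",
        "secret": "your_aws_secret_key",
        "client_kwargs": {"endpoint_url": "https://s3.amazonaws.com"},
    },
    show_progress=True,
)
wrapper.process(max_retries=3)  # retries now happen per date, not per priority thread
```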