sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,301 @@
1
+ import datetime as _dt
2
+ from typing import Tuple, List, Optional, Dict, Any
3
+
4
+ import fsspec
5
+
6
+ from .log_utils import Logger
7
+
8
+
9
class FileAgeChecker:
    """
    Check file/directory "age" (minutes since last modification) using fsspec.

    Backward compatible methods:
    - is_file_older_than(file_path, max_age_minutes, fs=None, ignore_missing=False, verbose=False) -> bool
    - get_file_or_dir_age_minutes(file_path, fs=None) -> float

    Enhancements:
    - dir_policy: 'oldest' | 'newest' | 'mean' when evaluating directories
    - recursive: recurse into subdirectories using fs.find()
    - robust mtime extraction for local/S3/FTP-like backends
    - grace_minutes: optional slack for threshold comparisons
    """

    _UTC = _dt.timezone.utc

    def __init__(
        self,
        debug: bool = False,
        logger: Optional[Logger] = None,
        *,
        dir_policy: str = "oldest",  # 'oldest' (legacy), 'newest', or 'mean'
        recursive_default: bool = False,
    ):
        """
        :param debug: When True the logger level is set to DEBUG, otherwise INFO.
        :param logger: Logger to use; a default logger named after the class is created when omitted.
        :param dir_policy: Default directory policy: 'oldest', 'newest' or 'mean'.
        :param recursive_default: Default for the ``recursive`` option of the public methods.
        :raises ValueError: If ``dir_policy`` is not one of the supported values.
        """
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)

        if dir_policy not in {"oldest", "newest", "mean"}:
            raise ValueError("dir_policy must be one of: 'oldest', 'newest', 'mean'")
        self.dir_policy = dir_policy
        self.recursive_default = recursive_default

    # ---------------- Public API ----------------

    def is_file_older_than(
        self,
        file_path: str,
        max_age_minutes: float,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        ignore_missing: bool = False,
        verbose: bool = False,
        *,
        recursive: Optional[bool] = None,
        dir_policy: Optional[str] = None,
        grace_minutes: float = 0.0,
    ) -> bool:
        """
        Return True if file/dir age (in minutes) is greater than the threshold.

        :param file_path: Path to the file or directory.
        :param max_age_minutes: Maximum allowed "age" in minutes.
        :param fs: Filesystem object (defaults to local 'file' fs).
        :param ignore_missing: If True, missing paths are treated as NOT old.
        :param verbose: Log extra details.
        :param recursive: Recurse into subdirectories (defaults to instance setting).
        :param dir_policy: 'oldest' | 'newest' | 'mean' (defaults to instance policy).
        :param grace_minutes: Threshold slack; if provided, we compare against
                              (max_age_minutes - grace_minutes), floored at 0.
        :return: True when the path is older than the threshold, when it is missing
                 (unless ``ignore_missing``), or when any error occurs during the check.
        """
        fs = fs or fsspec.filesystem("file")
        use_recursive = self.recursive_default if recursive is None else bool(recursive)
        policy = dir_policy or self.dir_policy

        try:
            if not fs.exists(file_path):
                if verbose:
                    self.logger.debug(f"Path not found: {file_path}")
                return not ignore_missing

            age = self.get_file_or_dir_age_minutes(
                file_path, fs=fs, recursive=use_recursive, dir_policy=policy, verbose=verbose
            )
            threshold = max(0.0, float(max_age_minutes) - float(grace_minutes))
            if verbose:
                self.logger.debug(
                    f"Age check for {file_path}: age={age:.2f} min, "
                    f"threshold={threshold:.2f} min (policy={policy}, recursive={use_recursive})"
                )
            return age > threshold

        except Exception as e:
            # On errors, be conservative and consider it old (legacy behavior)
            self.logger.warning(f"Error checking {file_path}: {e}")
            return True

    def get_file_or_dir_age_minutes(
        self,
        file_path: str,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        *,
        recursive: Optional[bool] = None,
        dir_policy: Optional[str] = None,
        verbose: bool = False,
    ) -> float:
        """
        Compute the age (minutes since last modification timestamp) for a file or directory.

        For directories, applies `dir_policy`:
        - 'oldest' : age of the OLDEST file (max age) [legacy default]
        - 'newest' : age since the MOST RECENT file update (min age)
        - 'mean'   : average age across files

        Returns float('inf') for missing paths, invalid path types, or on errors
        (an unknown `dir_policy` on a directory is one such error).
        """
        fs = fs or fsspec.filesystem("file")
        use_recursive = self.recursive_default if recursive is None else bool(recursive)
        policy = dir_policy or self.dir_policy

        try:
            if not fs.exists(file_path):
                if verbose:
                    self.logger.debug(f"Path not found: {file_path}")
                return float("inf")

            if fs.isdir(file_path):
                return self._get_directory_age_minutes(
                    file_path, fs, verbose=verbose, recursive=use_recursive, policy=policy
                )
            if fs.isfile(file_path):
                return self._get_file_age_minutes(file_path, fs, verbose=verbose)

            self.logger.warning(f"Invalid path type (neither file nor dir): {file_path}")
            return float("inf")

        except Exception as e:
            self.logger.warning(f"Error getting age for {file_path}: {e}")
            return float("inf")

    # ---------------- Internals ----------------

    def _now_utc(self) -> _dt.datetime:
        """Return the current time as a timezone-aware UTC datetime."""
        return _dt.datetime.now(self._UTC)

    def _get_directory_age_minutes(
        self,
        dir_path: str,
        fs: fsspec.AbstractFileSystem,
        *,
        verbose: bool,
        recursive: bool,
        policy: str,
    ) -> float:
        """
        Compute directory age using the chosen policy.

        Returns float('inf') when listing fails, the directory has no files, or no
        file yields a usable mtime. Raises ValueError for an unknown policy.
        """
        try:
            paths = self._list_files(dir_path, fs, recursive=recursive)
        except Exception as e:
            self.logger.warning(f"Error listing {dir_path}: {e}")
            return float("inf")

        if not paths:
            if verbose:
                self.logger.debug(f"Empty directory: {dir_path}")
            return float("inf")

        # One reference instant for the whole directory so every file is measured
        # against the same clock reading.
        now = self._now_utc()
        ages: List[float] = []
        for p in paths:
            try:
                mt = self._extract_mtime_utc(fs.info(p), p)
                if mt is None:
                    continue
                ages.append((now - mt).total_seconds() / 60.0)
            except Exception as e:
                # Skip problem files but continue
                self.logger.debug(f"Skipping {p}: {e}")

        if not ages:
            self.logger.warning(f"No valid files with mtime in {dir_path}")
            return float("inf")

        if policy == "oldest":
            chosen = max(ages)  # age of oldest file
        elif policy == "newest":
            chosen = min(ages)  # since most recent update
        elif policy == "mean":
            chosen = sum(ages) / len(ages)
        else:
            raise ValueError(f"Unknown dir_policy: {policy}")

        if verbose:
            self.logger.debug(
                f"Directory age ({policy}) for {dir_path}: {chosen:.2f} minutes "
                f"from {len(ages)} files (recursive={recursive})"
            )
        return chosen

    def _get_file_age_minutes(
        self,
        file_path: str,
        fs: fsspec.AbstractFileSystem,
        *,
        verbose: bool,
    ) -> float:
        """Age for a single file in minutes; float('inf') when no usable mtime is found."""
        info = fs.info(file_path)
        mt = self._extract_mtime_utc(info, file_path)
        if mt is None:
            if verbose:
                self.logger.debug(f"Missing/invalid mtime for {file_path} (info: {info})")
            return float("inf")
        age = (self._now_utc() - mt).total_seconds() / 60.0
        if verbose:
            self.logger.debug(f"File age for {file_path}: {age:.2f} minutes")
        return age

    def _list_files(
        self,
        dir_path: str,
        fs: fsspec.AbstractFileSystem,
        *,
        recursive: bool,
    ) -> List[str]:
        """
        Return a list of file paths inside dir_path.
        Uses fs.find() if recursive else fs.ls(); filters out directories.
        """
        if recursive and hasattr(fs, "find"):
            # Some fs.find implementations return only files; still filter defensively
            entries = fs.find(dir_path)
        else:
            # Ask for plain paths explicitly: the default of `detail` differs between
            # backends, and detail dicts would all be rejected by the file filter below.
            entries = fs.ls(dir_path, detail=False)

        # Tolerate backends that hand back detail dicts anyway by reading their 'name'.
        names = (e.get("name") if isinstance(e, dict) else e for e in (entries or []))
        return [n for n in names if isinstance(n, str) and self._is_file(fs, n)]

    def _is_file(self, fs: fsspec.AbstractFileSystem, path: str) -> bool:
        """Return True if `path` is a file; never raises."""
        try:
            return fs.isfile(path)
        except Exception:
            # Some backends: rely on info['type']
            try:
                info = fs.info(path)
                return info.get("type") == "file"
            except Exception:
                return False

    def _to_utc(self, value: _dt.datetime) -> _dt.datetime:
        """Return `value` as an aware UTC datetime; naive values are taken to be UTC."""
        if value.utcoffset() is None:
            return value.replace(tzinfo=self._UTC)
        return value.astimezone(self._UTC)

    def _parse_iso_utc(self, text: str) -> Optional[_dt.datetime]:
        """Parse an ISO-8601 string (a trailing 'Z' is honored); None if it does not parse."""
        candidate = text[:-1] + "+00:00" if text.endswith("Z") else text
        try:
            return self._to_utc(_dt.datetime.fromisoformat(candidate))
        except ValueError:
            return None

    def _extract_mtime_utc(self, info: Dict[str, Any], path: str) -> Optional[_dt.datetime]:
        """
        Normalize an mtime from fsspec info to a timezone-aware UTC datetime.
        Supports common keys across local/S3/FTP-ish backends.

        Keys are tried in this order: 'LastModified', 'mtime', then 'modified',
        'last_modified' and 'updated'. Returns None when none of them holds a
        usable value.
        """
        # 1) S3-like
        last_modified = info.get("LastModified")
        if isinstance(last_modified, _dt.datetime):
            return self._to_utc(last_modified)
        if isinstance(last_modified, str):
            parsed = self._parse_iso_utc(last_modified)
            if parsed is not None:
                return parsed

        # 2) Local/posix fsspec
        if "mtime" in info:
            mtime = info["mtime"]
            # Some backends expose mtime as a datetime rather than epoch seconds
            if isinstance(mtime, _dt.datetime):
                return self._to_utc(mtime)
            try:
                # fsspec local often returns float seconds
                return _dt.datetime.fromtimestamp(float(mtime), tz=self._UTC)
            except (TypeError, ValueError, OverflowError, OSError):
                # Not a usable epoch value; sometimes mtime is an ISO string
                if isinstance(mtime, str):
                    parsed = self._parse_iso_utc(mtime)
                    if parsed is not None:
                        return parsed

        # 3) FTP/SSH style
        for key in ("modified", "last_modified", "updated"):
            value = info.get(key)
            if isinstance(value, _dt.datetime):
                return self._to_utc(value)
            if not isinstance(value, str):
                continue
            # Try common "%Y-%m-%d %H:%M:%S" first, then ISO
            try:
                return _dt.datetime.strptime(value, "%Y-%m-%d %H:%M:%S").replace(tzinfo=self._UTC)
            except ValueError:
                parsed = self._parse_iso_utc(value)
                if parsed is not None:
                    return parsed

        # If nothing matched, log once at debug level
        self.logger.debug(f"No usable mtime in info for {path}: {info}")
        return None
301
+
@@ -8,8 +8,9 @@ from .log_utils import Logger
8
8
 
9
9
 
10
10
  class FileUtils:
11
- def __init__(self, logger=None):
12
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
11
def __init__(self, **kwargs):
    """
    Initialise the helper from keyword options.

    :param logger: Optional logger; when it is missing or None, a default logger
                   named after the class is created.
    :param debug: Optional flag stored on the instance (defaults to False).
    """
    # Build the default logger only when the caller did not provide a usable one.
    # An explicit ``logger=None`` also falls back to the default, matching the
    # earlier ``logger or ...`` form of this constructor.
    self.logger = kwargs.get('logger') or Logger.default_logger(logger_name=self.__class__.__name__)
    self.debug = kwargs.get('debug', False)
13
14
 
14
15
  @staticmethod
15
16
  def ensure_directory_exists(directory_path, clear_existing=False):