sibi-dst 2025.1.12-py3-none-any.whl → 2025.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/__init__.py +1 -0
  13. sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
  14. sibi_dst/osmnx_helper/route_path_builder.py +97 -0
  15. sibi_dst/osmnx_helper/utils.py +2 -0
  16. sibi_dst/utils/base.py +302 -96
  17. sibi_dst/utils/clickhouse_writer.py +472 -206
  18. sibi_dst/utils/data_utils.py +139 -186
  19. sibi_dst/utils/data_wrapper.py +317 -73
  20. sibi_dst/utils/date_utils.py +1 -0
  21. sibi_dst/utils/df_utils.py +193 -213
  22. sibi_dst/utils/file_utils.py +3 -2
  23. sibi_dst/utils/filepath_generator.py +314 -152
  24. sibi_dst/utils/log_utils.py +581 -242
  25. sibi_dst/utils/manifest_manager.py +60 -76
  26. sibi_dst/utils/parquet_saver.py +33 -27
  27. sibi_dst/utils/phone_formatter.py +88 -95
  28. sibi_dst/utils/update_planner.py +180 -178
  29. sibi_dst/utils/webdav_client.py +116 -166
  30. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  31. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
  32. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/filepath_generator.py
@@ -1,187 +1,349 @@
-import datetime
+import datetime as dt
 import re
+from pathlib import PurePosixPath
+from typing import Iterable, List, Optional
 
 import fsspec
+from fsspec.utils import infer_storage_options
 
 from .log_utils import Logger
 
 
 class FilePathGenerator:
     """
-    Dynamically generates file paths by scanning directories starting from the base path
-    and determining the innermost directory structure.
-
-    Now supports generating appropriate paths for both pandas and Dask.
+    Scans date-partitioned directories base/YYYY/MM/DD and returns paths for pandas or Dask.
+    Works with any fsspec filesystem.
     """
 
-    def __init__(self, base_path='', fs=None, logger=None, **kwargs):
-        """
-        Initialize the FilePathGenerator.
-
-        Parameters:
-            base_path (str): Base directory path where data files are stored.
-            fs (fsspec.AbstractFileSystem, optional): Filesystem object to use for file operations.
-            logger (Logger, optional): Logger instance for logging information.
-            **kwargs: Additional keyword arguments.
-                - debug (bool): If True, enables debug logging.
-                - storage_options (dict): Options for the filesystem (e.g., credentials, tokens).
-                - exclude_patterns (list): List of regex patterns to exclude from file paths.
-                - file_extension (str): File extension to look for (default: 'parquet').
-        """
-        self.base_path = base_path.rstrip('/')
-        self.fs = fs  # Filesystem object
+    def __init__(
+        self,
+        base_path: str = "",
+        *,
+        fs=None,
+        logger: Optional[Logger] = None,
+        debug: bool = False,
+        storage_options: Optional[dict] = None,
+        exclude_patterns: Optional[Iterable[str]] = None,
+        file_extension: str = "parquet",
+    ):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.debug = kwargs.get('debug', False)
-        self.storage_options = kwargs.get('storage_options', {})
-        self.exclude_patterns = kwargs.get('exclude_patterns', [])
-        self.file_extension = kwargs.get('file_extension', 'parquet').lstrip('.')
+        self.debug = debug
+        self.storage_options = storage_options or {}
+        self.file_extension = file_extension.lstrip(".")
+        self._compiled_exclusions = [re.compile(p) for p in (exclude_patterns or [])]
+
+        # Normalize base path & derive protocol + root path
+        opts = infer_storage_options(base_path or ".")
+        proto = opts.get("protocol") or "file"
+        root = opts.get("path") or ""  # protocol-stripped
+
+        # If no fs given, make one from base_path
+        if fs is None:
+            self.fs, resolved_root = fsspec.core.url_to_fs(base_path or ".", **self.storage_options)
+            # Prefer resolved_root (already stripped and normalized by fsspec)
+            root = resolved_root or root
+        else:
+            self.fs = fs
 
-        # If fs is not provided, initialize it based on base_path and storage_options
-        if self.fs is None:
-            self.fs, _ = fsspec.core.url_to_fs(self.base_path, **self.storage_options)
+        self._protocol = proto if isinstance(proto, str) else (proto[0] if proto else "file")
+        self._root = self._ensure_no_trailing_slash(self._to_posix(root))
 
-    def generate_file_paths(self, start_date, end_date, engine='dask'):
-        """
-        Generate paths dynamically for files within the date range by scanning directories.
-        Returns a list of file paths compatible with the specified engine.
+        if self.debug:
+            self.logger.debug(
+                f"FilePathGenerator init: protocol={self._protocol!r}, root={self._root!r}, fs={type(self.fs).__name__}"
+            )
 
-        Parameters:
-            start_date (str or datetime): Start date in 'YYYY-MM-DD' format or datetime object.
-            end_date (str or datetime): End date in 'YYYY-MM-DD' format or datetime object.
-            engine (str): 'pandas' or 'dask' to specify which library the paths are intended for.
+    # ------------------------- public API -------------------------
 
-        Returns:
-            list: List of file paths.
+    def generate_file_paths(self, start_date, end_date, engine: str = "dask") -> List[str]:
         """
-        start_date = self._convert_to_datetime(start_date)
-        end_date = self._convert_to_datetime(end_date)
-
-        paths = []
-        curr_date = start_date
-
-        while curr_date <= end_date:
-            year, month, day = curr_date.year, curr_date.month, curr_date.day
-            day_paths = self._collect_paths(year, month, day, engine)
-            if day_paths:
-                paths.extend(day_paths)
-            curr_date += datetime.timedelta(days=1)
+        Return a list of file (engine='dask') or dataset directory (engine='pandas') paths
+        for all dates in [start_date, end_date].
+        """
+        sd = self._to_date(start_date)
+        ed = self._to_date(end_date)
+        if sd > ed:
+            sd, ed = ed, sd  # be forgiving on reversed dates
+
+        paths: List[str] = []
+        current = sd
+        while current <= ed:
+            y, m, d = current.year, current.month, current.day
+            paths.extend(self._collect_paths_for_day(y, m, d, engine))
+            current += dt.timedelta(days=1)
 
+        if self.debug:
+            self.logger.debug(f"Generated {len(paths)} path(s) for {sd}..{ed} (engine={engine})")
         return paths
 
-    def _collect_paths(self, year, month, day, engine):
-        """
-        Collect appropriate paths for a given date, depending on the engine.
+    # ------------------------- internals -------------------------
 
-        Parameters:
-            year (int): Year component of the date.
-            month (int): Month component of the date.
-            day (int): Day component of the date.
-            engine (str): 'pandas' or 'dask'.
+    def _collect_paths_for_day(self, year: int, month: int, day: int, engine: str) -> List[str]:
+        # IMPORTANT: use protocol-stripped paths with fs methods
+        day_dir = self._join(self._root, f"{year:04d}", f"{month:02d}", f"{day:02d}")
 
-        Returns:
-            list: List of file or directory paths.
-        """
-        base_dir = f"{self.base_path}/{year}/{str(month).zfill(2)}/{str(day).zfill(2)}"
-
-        if not self.fs.exists(base_dir):
+        if not self.fs.exists(day_dir):
             if self.debug:
-                self.logger.debug(f"Directory does not exist: {base_dir}")
+                self.logger.debug(f"Directory does not exist: {day_dir}")
             return []
 
-        if engine == 'dask':
-            # Collect individual file paths
-            file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-            all_paths = self.fs.glob(file_pattern)
-
-            if not all_paths and self.debug:
-                self.logger.debug(f"No files found with pattern: {file_pattern}")
-
-            # Exclude unwanted files and directories
-            filtered_paths = self._exclude_unwanted_paths(all_paths)
-
-            # Filter out directories
-            file_paths = [path for path in filtered_paths if not self.fs.isdir(path)]
-
-        elif engine == 'pandas':
-            # Collect dataset directories
-            # Assume that the base_dir is a Parquet dataset
-            if self.fs.isdir(base_dir):
-                file_paths = [base_dir]
-            else:
-                file_paths = []
-
+        if engine == "dask":
+            # Try recursive glob first
+            pattern = self._join(day_dir, "**", f"*.{self.file_extension}")
+            all_paths = self.fs.glob(pattern) or []
+
+            # Some filesystems don’t support recursive glob well; fallback to find()
+            if not all_paths:
+                try:
+                    found = self.fs.find(day_dir)  # recursive listing
+                except Exception:
+                    found = []
+                all_paths = [p for p in found if p.endswith(f".{self.file_extension}")]
+
+            # Filter out dirs & excluded patterns
+            file_paths = [
+                p for p in all_paths
+                if not self._is_dir(p) and not self._is_excluded(p)
+            ]
+
+        elif engine == "pandas":
+            # For pandas, return the dataset directory for the day (if not excluded)
+            file_paths = [day_dir] if self._is_dir(day_dir) and not self._is_excluded(day_dir) else []
         else:
-            raise ValueError("Engine must be 'pandas' or 'dask'.")
+            raise ValueError("engine must be 'pandas' or 'dask'.")
 
-        protocol = self.fs.protocol if isinstance(self.fs.protocol, str) else self.fs.protocol[0]
+        # Reattach protocol ONLY for returned paths
+        return [self._with_protocol(p) for p in file_paths]
 
-        # Ensure the protocol is included in the paths
-        file_paths = [
-            f"{protocol}://{path}" if not path.startswith(f"{protocol}://") else path
-            for path in file_paths
-        ]
+    def _is_dir(self, path: str) -> bool:
+        try:
+            return bool(self.fs.isdir(path))
+        except Exception:
+            # Robust fallback via info()
+            try:
+                return (self.fs.info(path).get("type") == "directory")
+            except Exception:
+                return False
 
-        if self.debug:
-            self.logger.debug(f"Collected {len(file_paths)} paths from {base_dir} for engine '{engine}'")
-
-        return file_paths
-
-    def _exclude_unwanted_paths(self, paths):
-        """
-        Exclude paths that match any of the exclusion patterns.
-        """
-        # Combine default patterns with user-provided patterns
-        exclude_patterns = self.exclude_patterns
+    def _is_excluded(self, path: str) -> bool:
+        return any(pat.search(path) for pat in self._compiled_exclusions)
 
-        # Compile regex patterns for efficiency
-        compiled_patterns = [re.compile(pattern) for pattern in exclude_patterns]
-
-        # Filter out paths matching any of the exclude patterns
-        filtered_paths = [
-            path for path in paths
-            if not any(pattern.match(path) for pattern in compiled_patterns)
-        ]
-
-        return filtered_paths
+    # ------------------------- helpers -------------------------
 
     @staticmethod
-    def _convert_to_datetime(date):
-        """Convert a date string or datetime object into a datetime object."""
-        if isinstance(date, str):
-            return datetime.datetime.strptime(date, '%Y-%m-%d')
-        return date
-
-
-"""
-Usage:
-    # Initialize the generator
-    generator = FilePathGenerator(
-        base_path='/Users/lvalverdeb/TeamDev/sibi-dst/IbisDataWH/logistics_storage/products/tracking',
-        debug=True
-    )
+    def _to_date(x) -> dt.date:
+        if isinstance(x, dt.datetime):
+            return x.date()
+        if isinstance(x, dt.date):
+            return x
+        return dt.datetime.strptime(str(x), "%Y-%m-%d").date()
 
-    # Generate dataset paths for Dask
-    dataset_paths = generator.generate_file_paths('2024-01-01', '2024-01-05', engine='dask')
-
-    # Read data with Dask
-    import dask.dataframe as dd
-
-    df = dd.read_parquet(dataset_paths)
-
-    # Now you can use df as a Dask DataFrame
-    print(df.head())
-
-    # Generate file paths for pandas
-    file_paths = generator.generate_file_paths('2024-01-01', '2024-01-05', engine='pandas')
-
-    # Read data with pandas
-    import pandas as pd
+    @staticmethod
+    def _to_posix(path: str) -> str:
+        return PurePosixPath(path).as_posix()
 
-    dataframes = []
-    for fp in file_paths:
-        df = pd.read_parquet(fp)
-        dataframes.append(df)
+    @staticmethod
+    def _ensure_no_trailing_slash(path: str) -> str:
+        return path[:-1] if path.endswith("/") else path
 
-    df_pandas = pd.concat(dataframes, ignore_index=True)
-    print(df_pandas.head())
-"""
+    @staticmethod
+    def _join(*parts: str) -> str:
+        p = PurePosixPath(parts[0])
+        for part in parts[1:]:
+            p = p / part
+        return p.as_posix()
+
+    def _with_protocol(self, path: str) -> str:
+        # If path already has a scheme, leave it
+        if "://" in path:
+            return path
+        # For local file, return absolute-like path without scheme or keep 'file://'? Keep scheme for consistency.
+        return f"{self._protocol}://{path}"
+
[New-file lines 163-349 (the remainder of this hunk) retain the entire previous implementation as a commented-out block: the old imports, the old FilePathGenerator class, and its usage example, duplicated verbatim from the removed lines shown above.]