sibi-dst 2025.9.10.tar.gz → 2025.9.11.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/PKG-INFO +1 -1
  2. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/pyproject.toml +1 -1
  3. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -3
  4. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/business_days.py +19 -51
  5. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/filepath_generator.py +1 -154
  6. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/README.md +0 -0
  7. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/__init__.py +0 -0
  8. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/__init__.py +0 -0
  9. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  10. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  11. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/_df_helper.py +0 -0
  12. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  13. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  14. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/__init__.py +0 -0
  15. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  16. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  17. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  18. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  19. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  20. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  21. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  22. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  23. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  24. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  25. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/core/__init__.py +0 -0
  26. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/core/_defaults.py +0 -0
  27. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  28. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/core/_params_config.py +0 -0
  29. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/core/_query_config.py +0 -0
  30. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/df_helper/data_cleaner.py +0 -0
  31. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/geopy_helper/__init__.py +0 -0
  32. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  33. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/geopy_helper/utils.py +0 -0
  34. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/__init__.py +0 -0
  35. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  36. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  37. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  38. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  39. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  40. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  41. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/osmnx_helper/utils.py +0 -0
  42. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/tests/test_baseclass.py +0 -0
  44. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  45. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/__init__.py +0 -0
  46. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/async_utils.py +0 -0
  47. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/base.py +0 -0
  48. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/__init__.py +0 -0
  49. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
  50. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  51. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
  52. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
  53. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/base_pipeline.py +0 -0
  54. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/base_pipeline_template.py +0 -0
  55. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
  56. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/clickhouse_writer.py +0 -0
  57. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/credentials.py +0 -0
  58. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/dask_utils.py +0 -0
  59. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/data_from_http_source.py +0 -0
  60. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/data_utils.py +0 -0
  61. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/data_wrapper.py +0 -0
  62. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/date_utils.py +0 -0
  63. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/df_utils.py +0 -0
  64. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/file_age_checker.py +0 -0
  65. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/file_utils.py +0 -0
  66. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/iceberg_saver.py +0 -0
  67. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/log_utils.py +0 -0
  68. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/manifest_manager.py +0 -0
  69. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/parquet_saver.py +0 -0
  70. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/periods.py +0 -0
  71. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/phone_formatter.py +0 -0
  72. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/progress/__init__.py +0 -0
  73. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/progress/jobs.py +0 -0
  74. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/progress/sse_runner.py +0 -0
  75. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/storage_config.py +0 -0
  76. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/storage_hive.py +0 -0
  77. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/storage_manager.py +0 -0
  78. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/update_planner.py +0 -0
  79. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/webdav_client.py +0 -0
  80. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/utils/write_gatekeeper.py +0 -0
  81. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/__init__.py +0 -0
  82. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/__init__.py +0 -0
  83. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  84. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  85. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  86. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  87. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  88. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  89. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  90. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  91. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  92. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  93. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  94. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  95. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  96. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  97. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  98. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  99. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/utils/__init__.py +0 -0
  100. {sibi_dst-2025.9.10 → sibi_dst-2025.9.11}/sibi_dst/v2/utils/log_utils.py +0 -0
--- sibi_dst-2025.9.10/PKG-INFO
+++ sibi_dst-2025.9.11/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.10
+Version: 2025.9.11
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
--- sibi_dst-2025.9.10/pyproject.toml
+++ sibi_dst-2025.9.11/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.9.10"
+version = "2025.9.11"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
--- sibi_dst-2025.9.10/sibi_dst/df_helper/backends/parquet/_parquet_options.py
+++ sibi_dst-2025.9.11/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -231,7 +231,7 @@ class ParquetConfig(BaseModel):
         Builds a list of path patterns for dask.read_parquet.
         Respects partition_on + start/end date if given.
         """
-        print(f"_resolve_paths_for_read: {self.partition_on}")
+        self.logger.debug(f"_resolve_paths_for_read: {self.partition_on}")
         # Partitioned dataset by column
         if self.partition_on and self.parquet_start_date and self.parquet_end_date:
             if not isinstance(self.partition_on, (list, tuple)):
@@ -244,12 +244,10 @@ class ParquetConfig(BaseModel):
             days = pd.date_range(start=start, end=end, freq="D").date

             base = self.parquet_storage_path.rstrip("/")
-            print("base:",base)
             result= [
                 f"{base}/{parts[0]}={d.isoformat()}/*.parquet"
                 for d in days
             ]
-            print("result:",result)
             return result

         # Date-ranged folders (non-partitioned, using FilePathGenerator)
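Beyond swapping the stray print statements for logger.debug calls, the surviving read path builds one glob per day for a hive-style partition column and hands the list to dask.read_parquet. A minimal sketch of that construction (the storage path, column name, and date range below are made up for illustration):

    import pandas as pd

    base = "s3://warehouse/events"   # hypothetical storage path
    parts = ["date"]                 # hypothetical partition_on column
    days = pd.date_range("2025-09-01", "2025-09-03", freq="D").date

    # One glob per day, matching the hive layout <base>/<col>=<YYYY-MM-DD>/*.parquet
    patterns = [f"{base}/{parts[0]}={d.isoformat()}/*.parquet" for d in days]
    # ['s3://warehouse/events/date=2025-09-01/*.parquet', ..., '.../date=2025-09-03/*.parquet']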
--- sibi_dst-2025.9.10/sibi_dst/utils/business_days.py
+++ sibi_dst-2025.9.11/sibi_dst/utils/business_days.py
@@ -1,19 +1,22 @@
+from __future__ import annotations
+
 import datetime as dt
 from typing import Any, Dict, Iterable, Optional
-from sibi_dst.utils import Logger
+
+import dask.dataframe as dd
 import numpy as np
 import pandas as pd
-import dask.dataframe as dd
+
+from sibi_dst.utils import Logger


 # ---------------- Vectorized helpers (used by Dask map_partitions) ----------------

 def _to_np_days(series: pd.Series) -> np.ndarray:
     """Coerce to numpy datetime64[D] with NaT-safe conversion."""
-    # Use pandas for robust parsing, then cast to date-days
     s = pd.to_datetime(series, errors="coerce")
-    # Convert to numpy datetime64[D] (day precision)
-    return s.values.astype("datetime64[D]")
+    # Return day precision array directly
+    return s.dt.floor("D").to_numpy(dtype="datetime64[D]")


 def _vectorized_busday_count(
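Both the old and new `_to_np_days` bodies truncate to day precision with NaT-safe parsing; the rewrite floors in pandas before converting instead of relying on a raw `astype` cast. A small sketch with made-up sample data showing the two forms agree for ordinary timestamps:

    import numpy as np
    import pandas as pd

    s = pd.to_datetime(pd.Series(["2025-09-10 17:45", None, "2025-09-11"]), errors="coerce")

    old = s.values.astype("datetime64[D]")                 # cast-based truncation
    new = s.dt.floor("D").to_numpy(dtype="datetime64[D]")  # explicit floor, then convert

    assert (pd.isna(old) == pd.isna(new)).all()
    assert (old[~pd.isna(old)] == new[~pd.isna(new)]).all()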
@@ -24,8 +27,8 @@ def _vectorized_busday_count(
     weekmask: Optional[str],
     inclusive: bool,
 ) -> pd.Series:
-    start = _to_np_days(part[begin_col])  # numpy datetime64[D]
-    end = _to_np_days(part[end_col])  # numpy datetime64[D]
+    start = _to_np_days(part[begin_col])
+    end = _to_np_days(part[end_col])

     kwargs: Dict[str, Any] = {}
     if holidays:
@@ -38,7 +41,7 @@ def _vectorized_busday_count(
     with np.errstate(invalid="ignore"):
         end_adj = end + np.timedelta64(1, "D")

-    valid = (~pd.isna(start)) & (~pd.isna(end))  # numpy bool mask
+    valid = (~pd.isna(start)) & (~pd.isna(end))
     result = np.full(part.shape[0], np.nan, dtype="float64")
     if valid.any():
         counts = np.busday_count(
@@ -59,8 +62,8 @@ def _vectorized_busday_offset(
     weekmask: Optional[str],
     roll: str,
 ) -> pd.Series:
-    start = _to_np_days(part[start_col])  # numpy datetime64[D]
-    n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy()  # numpy float -> cast later
+    start = _to_np_days(part[start_col])
+    n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy()

     kwargs: Dict[str, Any] = {"roll": roll}
     if holidays:
@@ -68,7 +71,7 @@ def _vectorized_busday_offset(
     if weekmask:
         kwargs["weekmask"] = weekmask

-    valid = (~pd.isna(start)) & (~pd.isna(n_days))  # numpy bool mask
+    valid = (~pd.isna(start)) & (~pd.isna(n_days))
     out = np.full(part.shape[0], np.datetime64("NaT", "ns"), dtype="datetime64[ns]")
     if valid.any():
         offs = np.busday_offset(
@@ -86,26 +89,6 @@ def _vectorized_busday_offset(
 class BusinessDays:
     """
     Business day calculations with custom holidays and optional weekmask.
-
-    Features
-    - Scalar helpers:
-      - get_business_days_count(begin, end, inclusive=False) -> int
-      - add_business_days(start_date, n_days, roll='forward') -> np.datetime64
-    - Dask DataFrame helpers (vectorized via map_partitions):
-      - calc_business_days_from_df(df, begin_col, end_col, result_col='business_days', inclusive=False)
-      - calc_sla_end_date(df, start_date_col, n_days_col, result_col='sla_end_date', roll='forward')
-
-    Parameters
-    ----------
-    holiday_list : dict[str, list[str]] | Iterable[str]
-        Either a mapping of year -> [YYYY-MM-DD, ...] or a flat iterable of YYYY-MM-DD strings.
-    logger : Any
-        Logger with .debug/.info/.warning/.error.
-    weekmask : str | None
-        A numpy business day weekmask like '1111100' (Mon–Fri). None means default Mon–Fri.
-        Examples:
-          '1111100' -> Mon-Fri
-          '1111110' -> Mon-Sat
     """

     def __init__(
@@ -119,12 +102,11 @@ class BusinessDays:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.weekmask = weekmask

-        # Normalize holidays to a flat, sorted tuple of 'YYYY-MM-DD'
         if isinstance(holiday_list, dict):
             flat = [d for _, days in sorted(holiday_list.items()) for d in days]
         else:
             flat = list(holiday_list)
-        # Deduplicate while preserving order
+
         seen = set()
         flat_unique = []
         for d in flat:
@@ -142,7 +124,6 @@ class BusinessDays:
         *,
         inclusive: bool = False,
     ) -> int:
-        """Business days between two dates. If inclusive=True, include the end date."""
         b = pd.to_datetime(begin_date).date()
         e = pd.to_datetime(end_date).date()

@@ -153,11 +134,11 @@ class BusinessDays:
             kwargs["weekmask"] = self.weekmask

         if inclusive:
-            e_np = np.datetime64(e) + np.timedelta64(1, "D")
+            e_np = np.datetime64(e, "D") + np.timedelta64(1, "D")
         else:
-            e_np = np.datetime64(e)
+            e_np = np.datetime64(e, "D")

-        val = int(np.busday_count(np.datetime64(b), e_np, **kwargs))
+        val = int(np.busday_count(np.datetime64(b, "D"), e_np, **kwargs))
         return val

     def add_business_days(
  def add_business_days(
@@ -167,11 +148,6 @@ class BusinessDays:
167
148
  *,
168
149
  roll: str = "forward",
169
150
  ) -> np.datetime64:
170
- """
171
- Add (or subtract) business days to a date. Returns numpy datetime64[D].
172
- roll: {'forward','backward','following','preceding','modifiedfollowing',
173
- 'modifiedpreceding','nat'}
174
- """
175
151
  s = pd.to_datetime(start_date).date()
176
152
  kwargs: Dict[str, Any] = {"roll": roll}
177
153
  if self.holidays:
@@ -179,7 +155,7 @@ class BusinessDays:
         if self.weekmask:
             kwargs["weekmask"] = self.weekmask

-        return np.busday_offset(np.datetime64(s), int(n_days), **kwargs)
+        return np.busday_offset(np.datetime64(s, "D"), int(n_days), **kwargs)

     # -------- Dask API --------

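As a usage sketch of the two scalar helpers after the unit fix (the holiday date is invented for illustration):

    bd = BusinessDays(holiday_list=["2025-09-15"], logger=None)

    bd.get_business_days_count("2025-09-12", "2025-09-17")  # 2: counts Fri 9/12 and Tue 9/16, skips weekend + holiday
    bd.add_business_days("2025-09-12", 1)                   # numpy.datetime64('2025-09-16'): 9/15 is a holiday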
@@ -192,10 +168,6 @@ class BusinessDays:
         *,
         inclusive: bool = False,
     ) -> dd.DataFrame:
-        """
-        Vectorized business-day difference between two date columns.
-        Produces float64 (NaN where either side is missing).
-        """
         missing = {begin_date_col, end_date_col} - set(df.columns)
         if missing:
             self.logger.error(f"Missing columns: {missing}")
@@ -224,10 +196,6 @@ class BusinessDays:
         *,
         roll: str = "forward",
     ) -> dd.DataFrame:
-        """
-        Vectorized business-day offset for SLA end date.
-        Produces datetime64[ns] with NaT where invalid.
-        """
         missing = {start_date_col, n_days_col} - set(df.columns)
         if missing:
             self.logger.error(f"Missing columns: {missing}")
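The two Dask helpers attach their results via map_partitions, per the docstrings removed above (default result columns 'business_days' and 'sla_end_date'). A hedged toy sketch reusing the `bd` instance from the earlier snippet (column names and data are invented):

    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({
        "opened": ["2025-09-08", "2025-09-10"],
        "closed": ["2025-09-12", None],
        "sla_days": [3, 5],
    })
    ddf = dd.from_pandas(pdf, npartitions=1)

    ddf = bd.calc_business_days_from_df(ddf, "opened", "closed")  # float64 'business_days', NaN where closed is missing
    ddf = bd.calc_sla_end_date(ddf, "opened", "sla_days")         # datetime64[ns] 'sla_end_date'
    print(ddf.compute())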
--- sibi_dst-2025.9.10/sibi_dst/utils/filepath_generator.py
+++ sibi_dst-2025.9.11/sibi_dst/utils/filepath_generator.py
@@ -160,160 +160,7 @@ class FilePathGenerator:
         # For local file, return absolute-like path without scheme or keep 'file://'? Keep scheme for consistency.
         return f"{self._protocol}://{path}"

-# import datetime
-# import re
-#
-# import fsspec
-#
-# from .log_utils import Logger
-#
-#
-# class FilePathGenerator:
-#     """
-#     Dynamically generates file paths by scanning directories starting from the base path
-#     and determining the innermost directory structure.
-#
-#     Now supports generating appropriate paths for both pandas and Dask.
-#     """
-#
-#     def __init__(self, base_path='', fs=None, logger=None, **kwargs):
-#         """
-#         Initialize the FilePathGenerator.
-#
-#         Parameters:
-#             base_path (str): Base directory path where data files are stored.
-#             fs (fsspec.AbstractFileSystem, optional): Filesystem object to use for file operations.
-#             logger (Logger, optional): Logger instance for logging information.
-#             **kwargs: Additional keyword arguments.
-#                 - debug (bool): If True, enables debug logging.
-#                 - storage_options (dict): Options for the filesystem (e.g., credentials, tokens).
-#                 - exclude_patterns (list): List of regex patterns to exclude from file paths.
-#                 - file_extension (str): File extension to look for (default: 'parquet').
-#         """
-#         self.base_path = base_path.rstrip('/')
-#         self.fs = fs  # Filesystem object
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#         self.debug = kwargs.get('debug', False)
-#         self.storage_options = kwargs.get('storage_options', {})
-#         self.exclude_patterns = kwargs.get('exclude_patterns', [])
-#         self.file_extension = kwargs.get('file_extension', 'parquet').lstrip('.')
-#
-#         # If fs is not provided, initialize it based on base_path and storage_options
-#         if self.fs is None:
-#             self.fs, _ = fsspec.core.url_to_fs(self.base_path, **self.storage_options)
-#
-#     def generate_file_paths(self, start_date, end_date, engine='dask'):
-#         """
-#         Generate paths dynamically for files within the date range by scanning directories.
-#         Returns a list of file paths compatible with the specified engine.
-#
-#         Parameters:
-#             start_date (str or datetime): Start date in 'YYYY-MM-DD' format or datetime object.
-#             end_date (str or datetime): End date in 'YYYY-MM-DD' format or datetime object.
-#             engine (str): 'pandas' or 'dask' to specify which library the paths are intended for.
-#
-#         Returns:
-#             list: List of file paths.
-#         """
-#         start_date = self._convert_to_datetime(start_date)
-#         end_date = self._convert_to_datetime(end_date)
-#
-#         paths = []
-#         curr_date = start_date
-#
-#         while curr_date <= end_date:
-#             year, month, day = curr_date.year, curr_date.month, curr_date.day
-#             day_paths = self._collect_paths(year, month, day, engine)
-#             if day_paths:
-#                 paths.extend(day_paths)
-#             curr_date += datetime.timedelta(days=1)
-#
-#         return paths
-#
-#     def _collect_paths(self, year, month, day, engine):
-#         """
-#         Collect appropriate paths for a given date, depending on the engine.
-#
-#         Parameters:
-#             year (int): Year component of the date.
-#             month (int): Month component of the date.
-#             day (int): Day component of the date.
-#             engine (str): 'pandas' or 'dask'.
-#
-#         Returns:
-#             list: List of file or directory paths.
-#         """
-#         base_dir = f"{self.base_path}/{year}/{str(month).zfill(2)}/{str(day).zfill(2)}"
-#
-#         if not self.fs.exists(base_dir):
-#             if self.debug:
-#                 self.logger.debug(f"Directory does not exist: {base_dir}")
-#             return []
-#
-#         if engine == 'dask':
-#             # Collect individual file paths
-#             file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-#             all_paths = self.fs.glob(file_pattern)
-#
-#             if not all_paths and self.debug:
-#                 self.logger.debug(f"No files found with pattern: {file_pattern}")
-#
-#             # Exclude unwanted files and directories
-#             filtered_paths = self._exclude_unwanted_paths(all_paths)
-#
-#             # Filter out directories
-#             file_paths = [path for path in filtered_paths if not self.fs.isdir(path)]
-#
-#         elif engine == 'pandas':
-#             # Collect dataset directories
-#             # Assume that the base_dir is a Parquet dataset
-#             if self.fs.isdir(base_dir):
-#                 file_paths = [base_dir]
-#             else:
-#                 file_paths = []
-#
-#         else:
-#             raise ValueError("Engine must be 'pandas' or 'dask'.")
-#
-#         protocol = self.fs.protocol if isinstance(self.fs.protocol, str) else self.fs.protocol[0]
-#
-#         # Ensure the protocol is included in the paths
-#         file_paths = [
-#             f"{protocol}://{path}" if not path.startswith(f"{protocol}://") else path
-#             for path in file_paths
-#         ]
-#
-#         if self.debug:
-#             self.logger.debug(f"Collected {len(file_paths)} paths from {base_dir} for engine '{engine}'")
-#
-#         return file_paths
-#
-#     def _exclude_unwanted_paths(self, paths):
-#         """
-#         Exclude paths that match any of the exclusion patterns.
-#         """
-#         # Combine default patterns with user-provided patterns
-#         exclude_patterns = self.exclude_patterns
-#
-#         # Compile regex patterns for efficiency
-#         compiled_patterns = [re.compile(pattern) for pattern in exclude_patterns]
-#
-#         # Filter out paths matching any of the exclude patterns
-#         filtered_paths = [
-#             path for path in paths
-#             if not any(pattern.match(path) for pattern in compiled_patterns)
-#         ]
-#
-#         return filtered_paths
-#
-#     @staticmethod
-#     def _convert_to_datetime(date):
-#         """Convert a date string or datetime object into a datetime object."""
-#         if isinstance(date, str):
-#             return datetime.datetime.strptime(date, '%Y-%m-%d')
-#         return date
-#
-#
+
 # """
 # Usage:
 #     # Initialize the generator