sibi-dst 2025.9.9__py3-none-any.whl → 2025.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_async.py +191 -137
- sibi_dst/df_helper/_parquet_artifact.py +6 -326
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +24 -2
- sibi_dst/utils/boilerplate/__init__.py +5 -3
- sibi_dst/utils/boilerplate/base_pipeline.py +14 -29
- sibi_dst/utils/business_days.py +19 -51
- sibi_dst/utils/clickhouse_writer.py +1 -1
- sibi_dst/utils/data_wrapper.py +46 -312
- sibi_dst/utils/filepath_generator.py +1 -154
- sibi_dst/utils/parquet_saver.py +29 -16
- sibi_dst/utils/progress/sse_runner.py +39 -11
- sibi_dst/utils/update_planner.py +161 -805
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/METADATA +2 -1
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/RECORD +16 -16
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/WHEEL +0 -0
sibi_dst/utils/filepath_generator.py
CHANGED
@@ -160,160 +160,7 @@ class FilePathGenerator:
         # For local file, return absolute-like path without scheme or keep 'file://'? Keep scheme for consistency.
         return f"{self._protocol}://{path}"
 
-
-# import re
-#
-# import fsspec
-#
-# from .log_utils import Logger
-#
-#
-# class FilePathGenerator:
-#     """
-#     Dynamically generates file paths by scanning directories starting from the base path
-#     and determining the innermost directory structure.
-#
-#     Now supports generating appropriate paths for both pandas and Dask.
-#     """
-#
-#     def __init__(self, base_path='', fs=None, logger=None, **kwargs):
-#         """
-#         Initialize the FilePathGenerator.
-#
-#         Parameters:
-#             base_path (str): Base directory path where data files are stored.
-#             fs (fsspec.AbstractFileSystem, optional): Filesystem object to use for file operations.
-#             logger (Logger, optional): Logger instance for logging information.
-#             **kwargs: Additional keyword arguments.
-#                 - debug (bool): If True, enables debug logging.
-#                 - storage_options (dict): Options for the filesystem (e.g., credentials, tokens).
-#                 - exclude_patterns (list): List of regex patterns to exclude from file paths.
-#                 - file_extension (str): File extension to look for (default: 'parquet').
-#         """
-#         self.base_path = base_path.rstrip('/')
-#         self.fs = fs  # Filesystem object
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#         self.debug = kwargs.get('debug', False)
-#         self.storage_options = kwargs.get('storage_options', {})
-#         self.exclude_patterns = kwargs.get('exclude_patterns', [])
-#         self.file_extension = kwargs.get('file_extension', 'parquet').lstrip('.')
-#
-#         # If fs is not provided, initialize it based on base_path and storage_options
-#         if self.fs is None:
-#             self.fs, _ = fsspec.core.url_to_fs(self.base_path, **self.storage_options)
-#
-#     def generate_file_paths(self, start_date, end_date, engine='dask'):
-#         """
-#         Generate paths dynamically for files within the date range by scanning directories.
-#         Returns a list of file paths compatible with the specified engine.
-#
-#         Parameters:
-#             start_date (str or datetime): Start date in 'YYYY-MM-DD' format or datetime object.
-#             end_date (str or datetime): End date in 'YYYY-MM-DD' format or datetime object.
-#             engine (str): 'pandas' or 'dask' to specify which library the paths are intended for.
-#
-#         Returns:
-#             list: List of file paths.
-#         """
-#         start_date = self._convert_to_datetime(start_date)
-#         end_date = self._convert_to_datetime(end_date)
-#
-#         paths = []
-#         curr_date = start_date
-#
-#         while curr_date <= end_date:
-#             year, month, day = curr_date.year, curr_date.month, curr_date.day
-#             day_paths = self._collect_paths(year, month, day, engine)
-#             if day_paths:
-#                 paths.extend(day_paths)
-#             curr_date += datetime.timedelta(days=1)
-#
-#         return paths
-#
-#     def _collect_paths(self, year, month, day, engine):
-#         """
-#         Collect appropriate paths for a given date, depending on the engine.
-#
-#         Parameters:
-#             year (int): Year component of the date.
-#             month (int): Month component of the date.
-#             day (int): Day component of the date.
-#             engine (str): 'pandas' or 'dask'.
-#
-#         Returns:
-#             list: List of file or directory paths.
-#         """
-#         base_dir = f"{self.base_path}/{year}/{str(month).zfill(2)}/{str(day).zfill(2)}"
-#
-#         if not self.fs.exists(base_dir):
-#             if self.debug:
-#                 self.logger.debug(f"Directory does not exist: {base_dir}")
-#             return []
-#
-#         if engine == 'dask':
-#             # Collect individual file paths
-#             file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-#             all_paths = self.fs.glob(file_pattern)
-#
-#             if not all_paths and self.debug:
-#                 self.logger.debug(f"No files found with pattern: {file_pattern}")
-#
-#             # Exclude unwanted files and directories
-#             filtered_paths = self._exclude_unwanted_paths(all_paths)
-#
-#             # Filter out directories
-#             file_paths = [path for path in filtered_paths if not self.fs.isdir(path)]
-#
-#         elif engine == 'pandas':
-#             # Collect dataset directories
-#             # Assume that the base_dir is a Parquet dataset
-#             if self.fs.isdir(base_dir):
-#                 file_paths = [base_dir]
-#             else:
-#                 file_paths = []
-#
-#         else:
-#             raise ValueError("Engine must be 'pandas' or 'dask'.")
-#
-#         protocol = self.fs.protocol if isinstance(self.fs.protocol, str) else self.fs.protocol[0]
-#
-#         # Ensure the protocol is included in the paths
-#         file_paths = [
-#             f"{protocol}://{path}" if not path.startswith(f"{protocol}://") else path
-#             for path in file_paths
-#         ]
-#
-#         if self.debug:
-#             self.logger.debug(f"Collected {len(file_paths)} paths from {base_dir} for engine '{engine}'")
-#
-#         return file_paths
-#
-#     def _exclude_unwanted_paths(self, paths):
-#         """
-#         Exclude paths that match any of the exclusion patterns.
-#         """
-#         # Combine default patterns with user-provided patterns
-#         exclude_patterns = self.exclude_patterns
-#
-#         # Compile regex patterns for efficiency
-#         compiled_patterns = [re.compile(pattern) for pattern in exclude_patterns]
-#
-#         # Filter out paths matching any of the exclude patterns
-#         filtered_paths = [
-#             path for path in paths
-#             if not any(pattern.match(path) for pattern in compiled_patterns)
-#         ]
-#
-#         return filtered_paths
-#
-#     @staticmethod
-#     def _convert_to_datetime(date):
-#         """Convert a date string or datetime object into a datetime object."""
-#         if isinstance(date, str):
-#             return datetime.datetime.strptime(date, '%Y-%m-%d')
-#         return date
-#
-#
+
 # """
 # Usage:
 #     # Initialize the generator
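The surviving lines above keep the filesystem scheme on generated paths. As a minimal illustration of that convention with plain fsspec (the example path and part-file name are invented, not taken from the package):

import fsspec

# url_to_fs resolves the filesystem and returns the path with its scheme stripped...
fs, root = fsspec.core.url_to_fs("file:///tmp/data")
protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]

# ...so the scheme is re-attached before handing paths to pandas/Dask readers,
# which keeps local and remote filesystems interchangeable.
full_path = f"{protocol}://{root}/2025/09/11/part.0.parquet"
print(full_path)  # file:///tmp/data/2025/09/11/part.0.parquet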
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -71,6 +71,7 @@ class ParquetSaver(ManagedResource):
         max_delete_workers: int = 8,
         write_gate_max: int = 2,
         write_gate_key: Optional[str] = None,
+        partition_on: Optional[list[str]] = None,
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
@@ -93,6 +94,7 @@ class ParquetSaver(ManagedResource):
         self.max_delete_workers = max(1, int(max_delete_workers))
         self.write_gate_max = max(1, int(write_gate_max))
         self.write_gate_key = (write_gate_key or self.parquet_storage_path).rstrip("/")
+        self.partition_on = partition_on
 
         # Fix: Remove deprecated coerce_timestamps parameter
         self.pyarrow_args.setdefault("compression", "zstd")
@@ -103,7 +105,18 @@ class ParquetSaver(ManagedResource):
 
     # ---------- public API ----------
     def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True) -> str:
-
+        """
+        Save the Dask DataFrame to Parquet. If partition_on is provided, write as a
+        partitioned dataset without overwriting earlier partitions.
+        """
+        # Always treat as a directory target
+        if self.partition_on:
+            overwrite = False
+            # we override the output_directory_name and overwrite setting to avoid confusion since dask will (re) create subdirs
+            # Partitioned dataset → write directly to root directory
+            target_path = self.parquet_storage_path.rstrip("/")
+        else:
+            target_path = f"{self.parquet_storage_path}/{output_directory_name}".rstrip("/")
 
         sem = get_write_sem(self.write_gate_key, self.write_gate_max)
         with sem:
@@ -111,7 +124,7 @@ class ParquetSaver(ManagedResource):
                 self._clear_directory_safely(target_path)
             self.fs.mkdirs(target_path, exist_ok=True)
 
-            #
+            # Enforce schema before write
             schema = self._define_schema()
             ddf = self._coerce_ddf_to_schema(self.df_result, schema)
 
@@ -128,25 +141,25 @@ class ParquetSaver(ManagedResource):
                 pa.set_cpu_count(self.arrow_cpu)
 
             try:
+                params = {
+                    "path": target_path,
+                    "engine": "pyarrow",
+                    "filesystem": self.fs,
+                    "write_index": self.write_index,
+                    "write_metadata_file": self.write_metadata_file,
+                    **self.pyarrow_args,
+                }
+                self.partition_on = self.partition_on if isinstance(self.partition_on, list) else None
+                if self.partition_on:
+                    params["partition_on"] = self.partition_on
+
                 with self._local_dask_pool():
-                    ddf.to_parquet(
-                        path=target_path,
-                        engine="pyarrow",
-                        schema=schema,
-                        overwrite=False,
-                        filesystem=self.fs,
-                        write_index=self.write_index,
-                        write_metadata_file=self.write_metadata_file,
-                        **self.pyarrow_args,
-                    )
+                    ddf.to_parquet(**params)
             finally:
                 if old_arrow_cpu is not None:
                     pa.set_cpu_count(old_arrow_cpu)
 
-            self.logger.info(
-                f"Parquet dataset written: {target_path}",
-                extra=self.logger_extra,
-            )
+            self.logger.info(f"Parquet dataset written: {target_path}", extra=self.logger_extra)
             return target_path
 
     @contextmanager
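For context on the partitioned-write path added above, here is a minimal, self-contained sketch of how Dask lays out a dataset when partition_on is used; the frame, column names, and output directory are illustrative and not part of sibi_dst:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"date": ["2025-09-10", "2025-09-11"] * 2, "value": range(4)})
ddf = dd.from_pandas(pdf, npartitions=1)

# With partition_on, Dask writes <root>/date=<value>/part.*.parquet and, with
# overwrite=False, leaves previously written partition directories in place;
# this is why save_to_parquet() forces overwrite=False and targets the storage
# root directly instead of a per-run subdirectory.
ddf.to_parquet(
    "out/parquet_root",  # illustrative local path
    engine="pyarrow",
    write_index=False,
    partition_on=["date"],
    overwrite=False,
)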
sibi_dst/utils/progress/sse_runner.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import asyncio, contextlib, inspect, json
-from typing import Any, Awaitable, Callable, Dict, Optional, Union
+from typing import Any, Awaitable, Callable, Dict, Optional, Union, Mapping, MutableMapping
 from fastapi import Request
 from sse_starlette.sse import EventSourceResponse
 from sibi_dst.utils import Logger
@@ -13,6 +13,16 @@ TaskFn = Union[Task2, Task3]
 def _as_sse_msg(event: str, data: Any) -> dict:
     return {"event": event, "data": json.dumps(data) if not isinstance(data, (str, bytes)) else data}
 
+def _merge_ctx(*parts: Optional[Mapping[str, Any]]) -> Dict[str, Any]:
+    """Right-most precedence; shallow merge is sufficient for our keys."""
+    out: Dict[str, Any] = {}
+    for p in parts:
+        if not p:
+            continue
+        for k, v in p.items():
+            out[k] = v
+    return out
+
 class SSERunner:
     def __init__(self, *, task: TaskFn, logger: Logger, ping: int = 15,
                  headers: Optional[dict] = None, auto_complete: bool = True) -> None:
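A small usage sketch of the merge precedence this helper gives endpoint(): later arguments win, so an explicit ctx overrides request.state.ctx, which in turn overrides request-derived values. The dictionaries below are invented for illustration and assume the _merge_ctx defined above is in scope:

base_ctx = {"query": {"page": "1"}, "tenant": "from-request"}
state_ctx = {"tenant": "from-state"}
explicit_ctx = {"tenant": "from-caller"}

merged = _merge_ctx(base_ctx, state_ctx, explicit_ctx)
assert merged == {"query": {"page": "1"}, "tenant": "from-caller"}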
@@ -31,39 +41,57 @@ class SSERunner:
     async def _worker(self, queue: asyncio.Queue, task_id: str, ctx: Dict[str, Any]) -> None:
         self.logger.info(f"SSE {task_id}: start")
         try:
-            await queue.put(_as_sse_msg("progress", {"message": "Task started"}))
+            await queue.put(_as_sse_msg("progress", {"message": "Task started", "task_id": task_id}))
             payload = await self._call_task(queue, task_id, ctx)
             if self.auto_complete:
                 final = payload if payload is not None else {"status": "complete"}
+                if isinstance(final, dict) and "task_id" not in final:
+                    final["task_id"] = task_id
                 await queue.put(_as_sse_msg("complete", final))
                 self.logger.info(f"SSE {task_id}: complete")
         except asyncio.CancelledError:
             raise
         except Exception as e:
             self.logger.error(f"SSE {task_id} failed: {e}", exc_info=True)
-            await queue.put(_as_sse_msg("error", {"detail": str(e)}))
+            await queue.put(_as_sse_msg("error", {"detail": str(e), "task_id": task_id}))
         finally:
             await queue.put(None)
 
-    def endpoint(self):
-
+    def endpoint(self, *, ctx: Optional[Dict[str, Any]] = None):
+        """
+        Create an SSE endpoint.
+        - ctx: optional explicit context dict provided by the caller.
+          This overrides request-derived context and request.state.ctx.
+        Precedence when merging: request-derived < request.state.ctx < ctx (explicit).
+        """
+        async def handler(request: Request):
             queue: asyncio.Queue = asyncio.Queue()
             task_id = str(asyncio.get_running_loop().time()).replace(".", "")
             self.logger.debug(
-                f"SSE {task_id}: new request client={request.client} path={request.url.path} q={dict(request.query_params)}"
+                f"SSE {task_id}: new request client={request.client} path={request.url.path} q={dict(request.query_params)}"
+            )
 
-            ctx
-
+            # Base ctx from the HTTP request
+            base_ctx: Dict[str, Any] = {
+                "path": dict(request.path_params),
                 "query": dict(request.query_params),
                 "method": request.method,
+                "headers": dict(request.headers) if hasattr(request, "headers") else None,
             }
             if request.headers.get("content-type", "").startswith("application/json"):
                 try:
-
+                    base_ctx["body"] = await request.json()
                 except Exception:
-
+                    base_ctx["body"] = None
+
+            # Pull any pre-attached ctx from request.state
+            state_ctx: Optional[Dict[str, Any]] = getattr(request.state, "ctx", None)
+
+            # Merge with precedence: base_ctx < state_ctx < explicit ctx
+            merged_ctx = _merge_ctx(base_ctx, state_ctx, ctx)
 
-
+            # Run worker
+            worker = asyncio.create_task(self._worker(queue, task_id, merged_ctx))
 
             async def gen():
                 try: