snakemake-logger-plugin-prometheus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,379 @@
1
+ from dataclasses import dataclass, field
2
+ import logging
3
+ import threading
4
+ import sys
5
+ import uuid
6
+ from typing import Any
7
+
8
+ from snakemake_interface_logger_plugins.base import LogHandlerBase
9
+ from snakemake_interface_logger_plugins.settings import LogHandlerSettingsBase
10
+ from snakemake_interface_logger_plugins.common import LogEvent
11
+ from prometheus_client import (
12
+ start_http_server,
13
+ Gauge,
14
+ Counter,
15
+ push_to_gateway,
16
+ REGISTRY,
17
+ )
18
+
19
+
20
def _optional_setting(help_text: str, value_type: type) -> dict[str, object]:
    """Build the ``metadata`` mapping used by every optional plugin setting.

    All settings of this plugin share the same flags (not read from an
    environment variable, not required); only the help text and the value
    type differ.
    """
    return {
        "help": help_text,
        "env_var": False,
        "required": False,
        "type": value_type,
    }


@dataclass
class LogHandlerSettings(LogHandlerSettingsBase):
    """User-configurable options of the Prometheus logger plugin.

    Every field is optional and carries a default, so the plugin works
    without any extra configuration.
    """

    # TCP port handed to ``start_http_server`` for the scrape endpoint.
    port: int = field(
        default=8000,
        metadata=_optional_setting("Port to expose Prometheus metrics on", int),
    )
    # When set, a background thread pushes the registry to this Pushgateway.
    push_gateway: str | None = field(
        default=None,
        metadata=_optional_setting(
            "URL of the Prometheus Pushgateway (e.g. http://localhost:9091)", str
        ),
    )
    # ``job`` name used when pushing to the Pushgateway.
    push_job_name: str = field(
        default="snakemake",
        metadata=_optional_setting("Job name for the Pushgateway grouping", str),
    )
    # Value of the ``run_id`` label; a UUID is generated when left empty.
    run_id: str | None = field(
        default=None,
        metadata=_optional_setting(
            "Unique identifier for this workflow run (label). Defaults to UUID.", str
        ),
    )
58
+
59
+
60
+ @dataclass
61
+ class JobMetadata:
62
+ """
63
+ Structured container for job information extracted from JOB_INFO logs.
64
+ """
65
+
66
+ job_id: int
67
+ rule_name: str
68
+ resources: dict[str, int | float]
69
+ threads: int
70
+
71
+
72
+ class LogHandler(LogHandlerBase):
73
def __post_init__(self) -> None:
    """Create the bookkeeping state and metrics, then start the exporters.

    Nothing is set up on a dry run: the method returns before any attribute
    is created, so no metric, HTTP server or push thread exists afterwards.
    """
    if self.common_settings.dryrun:
        return

    # Bookkeeping used to keep the gauges consistent across events.
    self.job_registry: dict[int, JobMetadata] = {}
    self.running_jobs: set[int] = set()
    # Jobs whose start was reported before their JOB_INFO was seen.
    self.deferred_starts: set[int] = set()

    configured_run_id = getattr(self.settings, "run_id", None)
    if configured_run_id:
        self.run_id = configured_run_id
    else:
        self.run_id = str(uuid.uuid4())
        sys.stderr.write(f"[PrometheusPlugin] Auto-generated run_id: {self.run_id}\n")

    per_rule_labels = ["run_id", "rule"]
    per_resource_labels = ["run_id", "resource"]

    self.metric_submitted = Gauge(
        "snakemake_jobs_submitted",
        "Number of jobs known to the scheduler (Queued + Running).",
        per_rule_labels,
    )
    self.metric_running = Gauge(
        "snakemake_jobs_running",
        "Number of jobs actively executing.",
        per_rule_labels,
    )
    self.metric_finished = Counter(
        "snakemake_jobs_finished_total",
        "Total number of jobs finished/failed.",
        ["run_id", "status", "rule"],
    )
    self.metric_resource_usage = Gauge(
        "snakemake_resource_usage_total",
        "Total resources currently consumed by RUNNING jobs.",
        per_resource_labels,
    )
    self.metric_resource_limits = Gauge(
        "snakemake_resource_limits",
        "Maximum resources available to the workflow execution.",
        per_resource_labels,
    )
    self.metric_progress = Gauge(
        "snakemake_workflow_progress",
        "High-level workflow progress (jobs done vs total).",
        ["run_id", "type"],
    )
    self.metric_planned = Gauge(
        "snakemake_jobs_planned_total",
        "Jobs planned per rule. Updates on DAG re-evaluation.",
        per_rule_labels,
    )
    self.metric_errors = Counter(
        "snakemake_errors_total",
        "Total number of workflow errors.",
        ["run_id"],
    )

    self._setup_server()
    self._setup_push_gateway()
129
+
130
def _setup_server(self) -> None:
    """Start the Prometheus scrape endpoint on the configured port.

    An ``OSError`` while starting (or while reporting the start) is written
    to stderr and otherwise ignored, so the workflow keeps running.
    """
    listen_port = getattr(self.settings, "port", 8000)
    try:
        start_http_server(listen_port)
        sys.stderr.write(f"[PrometheusPlugin] Metrics server started on port {listen_port}\n")
    except OSError as err:
        sys.stderr.write(f"[PrometheusPlugin] ⚠️ Could not start server on port {listen_port}: {err}\n")
137
+
138
def _setup_push_gateway(self) -> None:
    """Create the stop event and, if a Pushgateway is configured, the push thread.

    The stop event is always created. The daemon thread running
    ``_push_loop`` is only started when the ``push_gateway`` setting is
    non-empty.
    """
    self._stop_push_event = threading.Event()

    gateway_url = getattr(self.settings, "push_gateway", None)
    if not gateway_url:
        return

    job_name = getattr(self.settings, "push_job_name", "snakemake")
    sys.stderr.write(f"[PrometheusPlugin] Pushing metrics to {gateway_url} (job: {job_name})\n")

    pusher = threading.Thread(target=self._push_loop, daemon=True)
    self._push_thread = pusher
    pusher.start()
147
+
148
def _push_loop(self) -> None:
    """Push the default registry to the Pushgateway until asked to stop.

    Runs as the target of the background push thread. Metrics are pushed
    immediately and then every 15 seconds; a failing push is reported on
    stderr and retried on the next round. The loop ends as soon as
    ``_stop_push_event`` is set.
    """
    push_gateway = getattr(self.settings, "push_gateway", None)
    if not push_gateway:
        # Explicit guard instead of ``assert``: assertions are stripped
        # under ``python -O`` and must not carry validation.
        return
    push_job_name = getattr(self.settings, "push_job_name", "snakemake")

    while not self._stop_push_event.is_set():
        try:
            push_to_gateway(
                push_gateway,
                job=push_job_name,
                registry=REGISTRY,
                grouping_key={"run_id": self.run_id},
            )
        except Exception as e:
            # Best effort: a temporarily unreachable gateway must not
            # terminate the push thread.
            sys.stderr.write(f"[PrometheusPlugin] Push failed: {e}\n")

        # Interruptible sleep: returns early when the stop event is set,
        # after which the loop condition ends the loop.
        self._stop_push_event.wait(15)
164
+
165
def close(self) -> None:
    """Stop the push thread, flush the final metrics and release the handler.

    Steps, in order:

    1. Signal the push thread to stop and wait up to one second for it.
    2. If a push thread was started, push the registry one last time so
       the Pushgateway sees the final state instead of a snapshot that may
       be up to 15 seconds old.
    3. Unregister the metrics, but only if ``__post_init__`` created them
       (it returns early on dry runs).
    4. Always call the base class ``close()``, even if a step above fails.
    """
    try:
        stop_event = getattr(self, "_stop_push_event", None)
        if stop_event is not None:
            stop_event.set()

        push_thread = getattr(self, "_push_thread", None)
        if push_thread is not None:
            if push_thread.is_alive():
                push_thread.join(timeout=1.0)

            push_gateway = getattr(self.settings, "push_gateway", None)
            if push_gateway:
                try:
                    # Short timeout so a dead gateway cannot stall shutdown.
                    push_to_gateway(
                        push_gateway,
                        job=getattr(self.settings, "push_job_name", "snakemake"),
                        registry=REGISTRY,
                        grouping_key={"run_id": self.run_id},
                        timeout=5,
                    )
                except Exception as e:
                    sys.stderr.write(f"[PrometheusPlugin] Final push failed: {e}\n")

        # On dry runs no metric attribute exists, so there is nothing to
        # unregister and touching the attributes would raise.
        if hasattr(self, "metric_submitted"):
            self._cleanup_registry()
    finally:
        super().close()
173
+
174
def _cleanup_registry(self) -> None:
    """Unregister metrics to prevent state bleeding in test suites.

    Metrics that were never created (``__post_init__`` returns early on dry
    runs) are skipped, so this method is safe to call on any instance and
    more than once.
    """
    metric_attributes = (
        "metric_submitted",
        "metric_running",
        "metric_finished",
        "metric_resource_usage",
        "metric_resource_limits",
        "metric_progress",
        "metric_planned",
        "metric_errors",
    )
    for attribute in metric_attributes:
        metric = getattr(self, attribute, None)
        if metric is None:
            continue
        try:
            REGISTRY.unregister(metric)
        except (KeyError, AttributeError, TypeError):
            # Deliberately best effort: the metric may already have been
            # unregistered by an earlier close().
            pass
191
+
192
@property
def writes_to_stream(self) -> bool:
    """Always ``False``: ``emit`` only updates metrics, it writes no log text."""
    return False
195
+
196
@property
def writes_to_file(self) -> bool:
    """Always ``False``: this handler never writes log records to a file."""
    return False
199
+
200
@property
def has_filter(self) -> bool:
    """Always ``True``.

    NOTE(review): no filter is installed in this module; presumably this
    tells Snakemake not to attach its default filter so that all events
    reach ``emit`` - confirm against the LogHandlerBase documentation.
    """
    return True
203
+
204
@property
def has_formatter(self) -> bool:
    """Always ``True``.

    NOTE(review): no formatter is installed in this module; presumably
    this tells Snakemake not to attach its default formatter - confirm
    against the LogHandlerBase documentation.
    """
    return True
207
+
208
@property
def needs_rulegraph(self) -> bool:
    """Always ``False``: none of the metrics is derived from the rule graph."""
    return False
211
+
212
def emit(self, record: logging.LogRecord) -> None:
    """Translate one Snakemake log record into Prometheus metric updates.

    Records without an ``event`` attribute are ignored, and so is every
    record on a dry run, because ``__post_init__`` creates no metrics then.
    Any exception raised while handling a record is passed to
    ``handleError`` instead of propagating into the logging call.
    """
    try:
        event = getattr(record, "event", None)
        if event is None:
            return

        # Mirror the early return in __post_init__: on dry runs neither
        # run_id nor any metric exists, so handling the event would only
        # produce one handleError() report per record.
        if self.common_settings.dryrun:
            return

        if event == LogEvent.JOB_INFO:
            self._handle_job_info(record)
        elif event == LogEvent.JOB_STARTED:
            self._handle_job_started(record)
        elif event == LogEvent.JOB_FINISHED:
            self._handle_job_finished(record)
        elif event == LogEvent.JOB_ERROR:
            self._handle_job_error(record)
        elif event == LogEvent.ERROR:
            self.metric_errors.labels(run_id=self.run_id).inc()
        elif event == LogEvent.RUN_INFO:
            self._handle_run_info(record)
        elif event == LogEvent.PROGRESS:
            self._handle_progress(record)
        elif event == LogEvent.RESOURCES_INFO:
            self._handle_resources_info(record)

    except Exception:
        self.handleError(record)
238
+
239
+ def _parse_resources(self, record: logging.LogRecord) -> dict[str, int | float]:
240
+ """
241
+ Robustly extracts resources from a LogRecord.
242
+ Handles: dicts, argparse.Namespace, and Snakemake's internal Resource objects.
243
+ """
244
+ raw_res = getattr(record, "resources", None)
245
+ resources: dict[str, Any] = {}
246
+
247
+ if raw_res:
248
+ if hasattr(raw_res, "_names"):
249
+ for name in raw_res._names:
250
+ resources[name] = getattr(raw_res, name)
251
+ elif isinstance(raw_res, dict):
252
+ resources = raw_res.copy()
253
+ elif hasattr(raw_res, "__dict__"):
254
+ resources = vars(raw_res).copy()
255
+ elif hasattr(raw_res, "keys") and callable(raw_res.keys):
256
+ for k in raw_res.keys():
257
+ resources[k] = raw_res[k]
258
+
259
+ filtered: dict[str, int | float] = {
260
+ k: v for k, v in resources.items() if k not in {"_cores", "_nodes"} and isinstance(v, (int, float))
261
+ }
262
+
263
+ if "threads" not in filtered:
264
+ threads = getattr(record, "threads", 1)
265
+ if isinstance(threads, (int, float)):
266
+ filtered["threads"] = threads
267
+
268
+ return filtered
269
+
270
def _handle_run_info(self, record: logging.LogRecord) -> None:
    """Publish the planned job count of every rule.

    Reads ``record.per_rule_job_counts`` and sets the planned-jobs gauge
    per rule; a missing or empty mapping leaves the gauge untouched. The
    gauge is overwritten on every call, so repeated run-info events simply
    refresh the numbers.
    """
    per_rule = getattr(record, "per_rule_job_counts", {})
    if not per_rule:
        return

    for rule_name, planned in per_rule.items():
        gauge = self.metric_planned.labels(run_id=self.run_id, rule=rule_name)
        gauge.set(planned)
276
+
277
def _handle_progress(self, record: logging.LogRecord) -> None:
    """Mirror the record's ``done``/``total`` counters into the progress gauge.

    Both values default to 0 when the record lacks the attribute. They are
    stored under the ``type`` labels ``"done"`` and ``"total"``.
    """
    snapshot = (
        ("done", getattr(record, "done", 0)),
        ("total", getattr(record, "total", 0)),
    )
    for kind, value in snapshot:
        self.metric_progress.labels(run_id=self.run_id, type=kind).set(value)
283
+
284
def _handle_resources_info(self, record: logging.LogRecord) -> None:
    """Publish the resource limits announced by a resources-info record.

    ``record.cores`` (when not ``None``) is stored under both the
    ``threads`` and the ``cores`` resource label. Every numeric entry of
    ``record.provided_resources`` is stored under its own name; non-numeric
    entries are skipped.
    """
    core_limit = getattr(record, "cores", None)
    if core_limit is not None:
        for resource_name in ("threads", "cores"):
            self.metric_resource_limits.labels(run_id=self.run_id, resource=resource_name).set(core_limit)

    provided_limits = getattr(record, "provided_resources", {})
    if not provided_limits:
        return

    for resource_name, limit in provided_limits.items():
        if not isinstance(limit, (int, float)):
            continue
        self.metric_resource_limits.labels(run_id=self.run_id, resource=resource_name).set(limit)
296
+
297
def _handle_job_info(self, record: logging.LogRecord) -> None:
    """Register a job announced by a JOB_INFO record.

    Records without a ``jobid`` are ignored. Otherwise the job's metadata
    is stored in ``job_registry`` and the submitted gauge of its rule is
    incremented. If the job's start was reported before this record
    arrived, the deferred start is applied now.
    """
    job_id = getattr(record, "jobid", None)
    if job_id is None:
        return

    rule_name = getattr(record, "rule_name", "unknown")
    resources = self._parse_resources(record)
    metadata = JobMetadata(
        job_id=job_id,
        rule_name=rule_name,
        resources=resources,
        threads=int(resources.get("threads", 1)),
    )
    self.job_registry[job_id] = metadata

    self.metric_submitted.labels(run_id=self.run_id, rule=rule_name).inc()

    # Catch up on a start event that overtook this JOB_INFO record.
    try:
        self.deferred_starts.remove(job_id)
    except KeyError:
        return
    self._record_job_start(metadata)
315
+
316
def _handle_job_started(self, record: logging.LogRecord) -> None:
    """Mark the jobs named by a JOB_STARTED record as running.

    The job ids are taken from the first of ``record.job_ids``,
    ``record.jobs`` and ``record.jobid`` that holds a value. A bare integer
    is accepted as a single id; this includes job id ``0``, which must not
    be mistaken for "no value". Entries that are not integers are skipped.

    Jobs already known from JOB_INFO are recorded as started right away;
    unknown ones are remembered in ``deferred_starts`` until their JOB_INFO
    arrives.
    """
    job_ids: list = []
    for attribute in ("job_ids", "jobs", "jobid"):
        value = getattr(record, attribute, None)
        if value is None:
            continue
        # Check for int before any truthiness test so that id 0 survives.
        if isinstance(value, int):
            job_ids = [value]
            break
        if value:
            job_ids = value
            break

    for job_id in job_ids:
        if not isinstance(job_id, int):
            continue

        if job_id in self.job_registry:
            self._record_job_start(self.job_registry[job_id])
        else:
            self.deferred_starts.add(job_id)
337
+
338
def _record_job_start(self, metadata: JobMetadata) -> None:
    """Account for a job that has started executing.

    Increments the running gauge of the job's rule and adds each of the
    job's resources to the usage gauge. A job that is already listed in
    ``running_jobs`` is ignored, so repeated start events do not count
    twice.
    """
    started_id = metadata.job_id
    if started_id in self.running_jobs:
        return

    self.metric_running.labels(run_id=self.run_id, rule=metadata.rule_name).inc()

    for resource_name, amount in metadata.resources.items():
        usage = self.metric_resource_usage.labels(run_id=self.run_id, resource=resource_name)
        usage.inc(amount)

    self.running_jobs.add(started_id)
349
+
350
def _handle_job_finished(self, record: logging.LogRecord) -> None:
    """Finalize a successfully finished job.

    The id is read from ``record.job_id``, falling back to ``record.jobid``
    when that attribute is absent; records without an id are ignored.
    """
    fallback_id = getattr(record, "jobid", None)
    finished_id = getattr(record, "job_id", fallback_id)
    if finished_id is None:
        return
    self._finalize_job(finished_id, "finished")
354
+
355
def _handle_job_error(self, record: logging.LogRecord) -> None:
    """Finalize a failed job identified by ``record.jobid``.

    Records without a ``jobid`` (or with ``None``) are ignored.
    """
    failed_id = getattr(record, "jobid", None)
    if failed_id is None:
        return
    self._finalize_job(failed_id, "failed")
359
+
360
def _finalize_job(self, job_id: int, final_status: str) -> None:
    """Remove a job from the bookkeeping and settle its metrics.

    The job is dropped from ``job_registry`` and ``deferred_starts``. If it
    was registered, the finished counter (labelled with ``final_status``)
    is incremented and the submitted gauge decremented. If it was also
    running, the running gauge and the resource-usage gauges are reduced by
    what the job had contributed.
    """
    job_info = self.job_registry.pop(job_id, None)
    self.deferred_starts.discard(job_id)

    if not job_info:
        # Never announced via JOB_INFO: no metric was touched for it.
        return

    rule = job_info.rule_name
    self.metric_finished.labels(run_id=self.run_id, status=final_status, rule=rule).inc()
    self.metric_submitted.labels(run_id=self.run_id, rule=rule).dec()

    if job_id not in self.running_jobs:
        return

    self.metric_running.labels(run_id=self.run_id, rule=rule).dec()
    for resource_name, amount in job_info.resources.items():
        usage = self.metric_resource_usage.labels(run_id=self.run_id, resource=resource_name)
        usage.dec(amount)
    self.running_jobs.remove(job_id)
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: snakemake-logger-plugin-prometheus
3
+ Version: 0.1.1
4
+ Summary: Snakemake logger plugin that exposes job info to prometheus.
5
+ Project-URL: repository, https://github.com/tedil/snakemake-logger-plugin-prometheus
6
+ Author-email: Till Hartmann <till.hartmann@bih-charite.de>
7
+ License-File: LICENSE
8
+ Requires-Python: <4.0,>=3.11
9
+ Requires-Dist: prometheus-client<0.24,>=0.23.1
10
+ Requires-Dist: snakemake-interface-logger-plugins<3,>=2.0
11
+ Description-Content-Type: text/markdown
12
+
13
+ # Snakemake Prometheus Logger Plugin
14
+
15
+ A Snakemake logger plugin that exposes workflow metrics via a Prometheus-compatible HTTP endpoint.
16
+
17
+ ## Installation
18
+
19
+ You can install this plugin via pip:
20
+
21
+ ```sh
22
+ pip install snakemake-logger-plugin-prometheus
23
+ ```
24
+
25
+ Or via Pixi/Conda if available in your channels.
26
+
27
+ ## Usage
28
+
29
+ To use this logger, run Snakemake with the `--logger prometheus` flag.
30
+
31
+ ```sh
32
+ snakemake --logger prometheus ...
33
+ ```
34
+
35
+ ### Pull Mode (Default)
36
+
37
+ By default, the metrics server runs on port `8000`. Prometheus scrapes this address.
38
+ You can change the port using the `--logger-prometheus-port` flag:
39
+
40
+ ```sh
41
+ snakemake --logger prometheus --logger-prometheus-port 9090 ...
42
+ ```
43
+
44
+ ### Push Mode (Pushgateway)
45
+
46
+ If you cannot expose a port or prefer pushing metrics, specify a Pushgateway URL.
47
+ The plugin will push metrics every 15 seconds.
48
+
49
+ ```sh
50
+ snakemake --logger prometheus --logger-prometheus-push-gateway http://localhost:9091
51
+ ```
52
+
53
+ You can optionally set the job name used in the Pushgateway (default: `"snakemake"`):
54
+
55
+ ```sh
56
+ snakemake --logger prometheus --logger-prometheus-push-gateway http://localhost:9091 --logger-prometheus-push-job-name my-workflow
57
+ ```
58
+
59
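+ ## Available Metrics
+
+ Every metric additionally carries a `run_id` label that identifies the workflow run (auto-generated as a UUID unless a run ID is configured). The tables below list only the remaining labels.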
60
+
61
+ ### Job Lifecycle
62
+
63
+ | Metric Name | Type | Labels | Description |
64
+ |:--------------------------------|:--------|:-----------------------------------|:----------------------------------------------------------------------------------------------------------|
65
+ | `snakemake_jobs_submitted` | Gauge | `rule` | Jobs known to the scheduler (Queued + Running). Increments when a job is ready, decrements when finished. |
66
+ | `snakemake_jobs_running` | Gauge | `rule` | Jobs actively executing. Increments when execution starts, decrements when finished. |
67
+ | `snakemake_jobs_finished_total` | Counter | `status` (finished/failed), `rule` | Cumulative count of completed jobs. Useful for throughput rates. |
68
+ | `snakemake_jobs_planned_total` | Gauge | `rule` | Total jobs planned for the workflow. Updates dynamically if checkpoints trigger DAG re-evaluation. |
69
+
70
+ ### Resources
71
+
72
+ | Metric Name | Type | Labels | Description |
73
+ |:---------------------------------|:------|:-----------|:------------------------------------------------------------------------------------|
74
+ | `snakemake_resource_usage_total` | Gauge | `resource` | Total resources currently consumed by **Running** jobs (e.g., `threads`, `mem_mb`). |
75
+ | `snakemake_resource_limits` | Gauge | `resource` | Maximum resources available to the workflow (CLI limits or cluster capacity). |
76
+
77
+ ### Workflow Status
78
+
79
+ | Metric Name | Type | Labels | Description |
80
+ |:------------------------------|:--------|:--------------------|:---------------------------------------|
81
+ | `snakemake_workflow_progress` | Gauge | `type` (done/total) | High-level progress counters. |
82
+ | `snakemake_errors_total` | Counter | None | Total number of workflow-level errors. |
83
+
84
+ ## Grafana Query Examples
85
+
86
+ Here are common queries to visualize this data:
87
+
88
+ | Description | Query |
89
+ |:-----------------------------|:---------------------------------------------------------------------------------------------------------------------|
90
+ | **Queued Jobs** | `sum(snakemake_jobs_submitted) - sum(snakemake_jobs_running)` |
91
+ | **Running Jobs** | `sum(snakemake_jobs_running)` |
92
+ | **Throughput (Jobs/Minute)** | `sum(rate(snakemake_jobs_finished_total{status="finished"}[5m])) * 60` |
93
+ | **Resource Utilization %** | `sum(snakemake_resource_usage_total{resource="threads"}) / sum(snakemake_resource_limits{resource="threads"}) * 100` |
94
+ | **Failure Rate** | `sum(rate(snakemake_jobs_finished_total{status="failed"}[5m]))` |
95
+
96
+ ## Development
97
+
98
+ This project uses [pixi](https://pixi.sh) for dependency management and [uv](https://docs.astral.sh/uv/) for
99
+ building/publishing.
100
+
101
+ - Install dependencies: `pixi install`
102
+ - Run tests: `pixi run test`
103
+ - Run code formatting and linting: `pixi run qc`
104
+ - Build package: `pixi run build`
105
+ - Publish package: `pixi run publish`
@@ -0,0 +1,5 @@
1
+ snakemake_logger_plugin_prometheus/__init__.py,sha256=IasH6hqoYmhxeVN_iAgtkUtAk79VXFZYJxOBq949dyA,13387
2
+ snakemake_logger_plugin_prometheus-0.1.1.dist-info/METADATA,sha256=eGCHpBS4woIP5Iserw5o94Qha9KvBhTf3AuMR6UuUSg,5360
3
+ snakemake_logger_plugin_prometheus-0.1.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
4
+ snakemake_logger_plugin_prometheus-0.1.1.dist-info/licenses/LICENSE,sha256=y8AftXce0Ne8jKczrAZvysqhg9zE5lFB5g7gLZL9FdU,1070
5
+ snakemake_logger_plugin_prometheus-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Till Hartmann
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.