snakemake-logger-plugin-prometheus 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snakemake_logger_plugin_prometheus/__init__.py +379 -0
- snakemake_logger_plugin_prometheus-0.1.1.dist-info/METADATA +105 -0
- snakemake_logger_plugin_prometheus-0.1.1.dist-info/RECORD +5 -0
- snakemake_logger_plugin_prometheus-0.1.1.dist-info/WHEEL +4 -0
- snakemake_logger_plugin_prometheus-0.1.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
import logging
|
|
3
|
+
import threading
|
|
4
|
+
import sys
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from snakemake_interface_logger_plugins.base import LogHandlerBase
|
|
9
|
+
from snakemake_interface_logger_plugins.settings import LogHandlerSettingsBase
|
|
10
|
+
from snakemake_interface_logger_plugins.common import LogEvent
|
|
11
|
+
from prometheus_client import (
|
|
12
|
+
start_http_server,
|
|
13
|
+
Gauge,
|
|
14
|
+
Counter,
|
|
15
|
+
push_to_gateway,
|
|
16
|
+
REGISTRY,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _optional_setting(default: Any, help_text: str, value_type: type) -> Any:
    """Build the dataclass field for an optional setting that is not read from the environment."""
    return field(
        default=default,
        metadata={
            "help": help_text,
            "env_var": False,
            "required": False,
            "type": value_type,
        },
    )


@dataclass
class LogHandlerSettings(LogHandlerSettingsBase):
    """User-configurable settings of the Prometheus logger plugin.

    All settings are optional; each field's ``help`` metadata describes its purpose.
    """

    # TCP port of the metrics HTTP endpoint.
    port: int = _optional_setting(8000, "Port to expose Prometheus metrics on", int)
    # Pushgateway address; push mode stays off while this is unset.
    push_gateway: str | None = _optional_setting(
        None, "URL of the Prometheus Pushgateway (e.g. http://localhost:9091)", str
    )
    # Job name under which pushed metrics are grouped.
    push_job_name: str = _optional_setting("snakemake", "Job name for the Pushgateway grouping", str)
    # Value of the ``run_id`` label; a UUID is generated when left unset.
    run_id: str | None = _optional_setting(
        None, "Unique identifier for this workflow run (label). Defaults to UUID.", str
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
|
|
61
|
+
class JobMetadata:
|
|
62
|
+
"""
|
|
63
|
+
Structured container for job information extracted from JOB_INFO logs.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
job_id: int
|
|
67
|
+
rule_name: str
|
|
68
|
+
resources: dict[str, int | float]
|
|
69
|
+
threads: int
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class LogHandler(LogHandlerBase):
|
|
73
|
+
def __post_init__(self) -> None:
    """Create the job-tracking state and all metrics, then start the exporters.

    On a dry run this returns immediately: no state, metric, HTTP server or push
    thread is created.
    """
    if self.common_settings.dryrun:
        return

    # Jobs seen via JOB_INFO that have not finished yet.
    self.job_registry: dict[int, JobMetadata] = {}
    # Jobs currently counted in the "running" metrics.
    self.running_jobs: set[int] = set()
    # Jobs whose start was reported before their JOB_INFO record was handled.
    self.deferred_starts: set[int] = set()

    configured_run_id = getattr(self.settings, "run_id", None)
    if configured_run_id:
        self.run_id = configured_run_id
    else:
        self.run_id = str(uuid.uuid4())
        sys.stderr.write(f"[PrometheusPlugin] Auto-generated run_id: {self.run_id}\n")

    # --- job lifecycle ---
    self.metric_submitted = Gauge(
        name="snakemake_jobs_submitted",
        documentation="Number of jobs known to the scheduler (Queued + Running).",
        labelnames=["run_id", "rule"],
    )
    self.metric_running = Gauge(
        name="snakemake_jobs_running",
        documentation="Number of jobs actively executing.",
        labelnames=["run_id", "rule"],
    )
    self.metric_finished = Counter(
        name="snakemake_jobs_finished_total",
        documentation="Total number of jobs finished/failed.",
        labelnames=["run_id", "status", "rule"],
    )

    # --- resources ---
    self.metric_resource_usage = Gauge(
        name="snakemake_resource_usage_total",
        documentation="Total resources currently consumed by RUNNING jobs.",
        labelnames=["run_id", "resource"],
    )
    self.metric_resource_limits = Gauge(
        name="snakemake_resource_limits",
        documentation="Maximum resources available to the workflow execution.",
        labelnames=["run_id", "resource"],
    )

    # --- workflow status ---
    self.metric_progress = Gauge(
        name="snakemake_workflow_progress",
        documentation="High-level workflow progress (jobs done vs total).",
        labelnames=["run_id", "type"],
    )
    self.metric_planned = Gauge(
        name="snakemake_jobs_planned_total",
        documentation="Jobs planned per rule. Updates on DAG re-evaluation.",
        labelnames=["run_id", "rule"],
    )
    self.metric_errors = Counter(
        name="snakemake_errors_total",
        documentation="Total number of workflow errors.",
        labelnames=["run_id"],
    )

    self._setup_server()
    self._setup_push_gateway()
|
|
129
|
+
|
|
130
|
+
def _setup_server(self) -> None:
    """Start the metrics HTTP endpoint on the configured port.

    The port is read from the ``port`` setting, falling back to 8000. The outcome
    is reported on stderr; an ``OSError`` while starting is reported, not re-raised.
    """
    port = getattr(self.settings, "port", 8000)

    try:
        start_http_server(port)
        sys.stderr.write(f"[PrometheusPlugin] Metrics server started on port {port}\n")
    except OSError as e:
        # Keep the workflow running even if the endpoint cannot be opened.
        sys.stderr.write(f"[PrometheusPlugin] ⚠️ Could not start server on port {port}: {e}\n")
|
|
137
|
+
|
|
138
|
+
def _setup_push_gateway(self) -> None:
|
|
139
|
+
self._stop_push_event = threading.Event()
|
|
140
|
+
push_gateway = getattr(self.settings, "push_gateway", None)
|
|
141
|
+
|
|
142
|
+
if push_gateway:
|
|
143
|
+
push_job_name = getattr(self.settings, "push_job_name", "snakemake")
|
|
144
|
+
sys.stderr.write(f"[PrometheusPlugin] Pushing metrics to {push_gateway} (job: {push_job_name})\n")
|
|
145
|
+
self._push_thread = threading.Thread(target=self._push_loop, daemon=True)
|
|
146
|
+
self._push_thread.start()
|
|
147
|
+
|
|
148
|
+
def _push_loop(self) -> None:
|
|
149
|
+
"""Periodically pushes metrics to the Pushgateway."""
|
|
150
|
+
push_gateway = getattr(self.settings, "push_gateway", None)
|
|
151
|
+
assert push_gateway is not None
|
|
152
|
+
push_job_name = getattr(self.settings, "push_job_name", "snakemake")
|
|
153
|
+
|
|
154
|
+
while not self._stop_push_event.is_set():
|
|
155
|
+
try:
|
|
156
|
+
push_to_gateway(
|
|
157
|
+
push_gateway, job=push_job_name, registry=REGISTRY, grouping_key={"run_id": self.run_id}
|
|
158
|
+
)
|
|
159
|
+
except Exception as e:
|
|
160
|
+
sys.stderr.write(f"[PrometheusPlugin] Push failed: {e}\n")
|
|
161
|
+
|
|
162
|
+
if self._stop_push_event.wait(15):
|
|
163
|
+
break
|
|
164
|
+
|
|
165
|
+
def close(self) -> None:
    """Stop the push thread, unregister the metrics and close the handler.

    Safe to call on a handler whose ``__post_init__`` returned early (dry run):
    the push thread and metrics are only touched if they exist. The base class
    ``close`` always runs, even if the cleanup raises.
    """
    try:
        stop_event = getattr(self, "_stop_push_event", None)
        if stop_event is not None:
            stop_event.set()

        push_thread = getattr(self, "_push_thread", None)
        if push_thread is not None and push_thread.is_alive():
            # Bounded wait so a hanging push cannot block shutdown.
            push_thread.join(timeout=1.0)

        # No metrics exist when __post_init__ returned early on a dry run.
        if hasattr(self, "metric_submitted"):
            self._cleanup_registry()
    finally:
        super().close()
|
|
173
|
+
|
|
174
|
+
def _cleanup_registry(self) -> None:
|
|
175
|
+
"""Unregister metrics to prevent state bleeding in test suites."""
|
|
176
|
+
metrics = [
|
|
177
|
+
self.metric_submitted,
|
|
178
|
+
self.metric_running,
|
|
179
|
+
self.metric_finished,
|
|
180
|
+
self.metric_resource_usage,
|
|
181
|
+
self.metric_resource_limits,
|
|
182
|
+
self.metric_progress,
|
|
183
|
+
self.metric_planned,
|
|
184
|
+
self.metric_errors,
|
|
185
|
+
]
|
|
186
|
+
for m in metrics:
|
|
187
|
+
try:
|
|
188
|
+
REGISTRY.unregister(m)
|
|
189
|
+
except (KeyError, AttributeError, TypeError):
|
|
190
|
+
pass
|
|
191
|
+
|
|
192
|
+
@property
def writes_to_stream(self) -> bool:
    """Always ``False``: this handler reports that it does not write to a stream."""
    return False
|
|
195
|
+
|
|
196
|
+
@property
def writes_to_file(self) -> bool:
    """Always ``False``: this handler reports that it does not write to a file."""
    return False
|
|
199
|
+
|
|
200
|
+
@property
def has_filter(self) -> bool:
    """Always ``True``.

    NOTE(review): presumably tells Snakemake not to attach its default filter, since
    ``emit`` selects the events it handles itself — confirm against the plugin interface.
    """
    return True
|
|
203
|
+
|
|
204
|
+
@property
def has_formatter(self) -> bool:
    """Always ``True``.

    NOTE(review): presumably tells Snakemake not to attach its default formatter; this
    handler never formats records — confirm against the plugin interface.
    """
    return True
|
|
207
|
+
|
|
208
|
+
@property
def needs_rulegraph(self) -> bool:
    """Always ``False``: this handler reports that it does not need the rule graph."""
    return False
|
|
211
|
+
|
|
212
|
+
def emit(self, record: logging.LogRecord) -> None:
    """Update the metrics according to the event carried by ``record``.

    Records without an ``event`` attribute and events that are not tracked are
    ignored. Nothing is done while the handler holds no state, which is the case
    after ``__post_init__`` returned early on a dry run. Any exception raised
    while handling an event is passed to ``handleError``.
    """
    try:
        if not hasattr(record, "event"):
            return

        # Without this guard, every tracked event on a dry run would fail on a
        # missing attribute and be reported through handleError.
        if not hasattr(self, "job_registry"):
            return

        event = record.event

        if event == LogEvent.JOB_INFO:
            self._handle_job_info(record)
        elif event == LogEvent.JOB_STARTED:
            self._handle_job_started(record)
        elif event == LogEvent.JOB_FINISHED:
            self._handle_job_finished(record)
        elif event == LogEvent.JOB_ERROR:
            self._handle_job_error(record)
        elif event == LogEvent.ERROR:
            self.metric_errors.labels(run_id=self.run_id).inc()
        elif event == LogEvent.RUN_INFO:
            self._handle_run_info(record)
        elif event == LogEvent.PROGRESS:
            self._handle_progress(record)
        elif event == LogEvent.RESOURCES_INFO:
            self._handle_resources_info(record)

    except Exception:
        self.handleError(record)
|
|
238
|
+
|
|
239
|
+
def _parse_resources(self, record: logging.LogRecord) -> dict[str, int | float]:
|
|
240
|
+
"""
|
|
241
|
+
Robustly extracts resources from a LogRecord.
|
|
242
|
+
Handles: dicts, argparse.Namespace, and Snakemake's internal Resource objects.
|
|
243
|
+
"""
|
|
244
|
+
raw_res = getattr(record, "resources", None)
|
|
245
|
+
resources: dict[str, Any] = {}
|
|
246
|
+
|
|
247
|
+
if raw_res:
|
|
248
|
+
if hasattr(raw_res, "_names"):
|
|
249
|
+
for name in raw_res._names:
|
|
250
|
+
resources[name] = getattr(raw_res, name)
|
|
251
|
+
elif isinstance(raw_res, dict):
|
|
252
|
+
resources = raw_res.copy()
|
|
253
|
+
elif hasattr(raw_res, "__dict__"):
|
|
254
|
+
resources = vars(raw_res).copy()
|
|
255
|
+
elif hasattr(raw_res, "keys") and callable(raw_res.keys):
|
|
256
|
+
for k in raw_res.keys():
|
|
257
|
+
resources[k] = raw_res[k]
|
|
258
|
+
|
|
259
|
+
filtered: dict[str, int | float] = {
|
|
260
|
+
k: v for k, v in resources.items() if k not in {"_cores", "_nodes"} and isinstance(v, (int, float))
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
if "threads" not in filtered:
|
|
264
|
+
threads = getattr(record, "threads", 1)
|
|
265
|
+
if isinstance(threads, (int, float)):
|
|
266
|
+
filtered["threads"] = threads
|
|
267
|
+
|
|
268
|
+
return filtered
|
|
269
|
+
|
|
270
|
+
def _handle_run_info(self, record: logging.LogRecord) -> None:
|
|
271
|
+
"""Handles run info (DAG stats). Can be called multiple times if DAG is re-evaluated."""
|
|
272
|
+
counts = getattr(record, "per_rule_job_counts", {})
|
|
273
|
+
if counts:
|
|
274
|
+
for rule, count in counts.items():
|
|
275
|
+
self.metric_planned.labels(run_id=self.run_id, rule=rule).set(count)
|
|
276
|
+
|
|
277
|
+
def _handle_progress(self, record: logging.LogRecord) -> None:
|
|
278
|
+
"""Handles progress updates. 'total' can change dynamically via checkpoints."""
|
|
279
|
+
done = getattr(record, "done", 0)
|
|
280
|
+
total = getattr(record, "total", 0)
|
|
281
|
+
self.metric_progress.labels(run_id=self.run_id, type="done").set(done)
|
|
282
|
+
self.metric_progress.labels(run_id=self.run_id, type="total").set(total)
|
|
283
|
+
|
|
284
|
+
def _handle_resources_info(self, record: logging.LogRecord) -> None:
|
|
285
|
+
"""Handles the global resources info to set usage limits."""
|
|
286
|
+
cores = getattr(record, "cores", None)
|
|
287
|
+
if cores is not None:
|
|
288
|
+
self.metric_resource_limits.labels(run_id=self.run_id, resource="threads").set(cores)
|
|
289
|
+
self.metric_resource_limits.labels(run_id=self.run_id, resource="cores").set(cores)
|
|
290
|
+
|
|
291
|
+
provided = getattr(record, "provided_resources", {})
|
|
292
|
+
if provided:
|
|
293
|
+
for res, value in provided.items():
|
|
294
|
+
if isinstance(value, (int, float)):
|
|
295
|
+
self.metric_resource_limits.labels(run_id=self.run_id, resource=res).set(value)
|
|
296
|
+
|
|
297
|
+
def _handle_job_info(self, record: logging.LogRecord) -> None:
    """Register the job described by ``record`` and count it as submitted.

    Records without a ``jobid`` are ignored. If a start of this job was reported
    before this record was handled, the deferred start is recorded now.
    """
    job_id = getattr(record, "jobid", None)
    if job_id is None:
        return

    resources = self._parse_resources(record)
    metadata = JobMetadata(
        job_id=job_id,
        rule_name=getattr(record, "rule_name", "unknown"),
        resources=resources,
        threads=int(resources.get("threads", 1)),
    )
    self.job_registry[job_id] = metadata

    self.metric_submitted.labels(run_id=self.run_id, rule=metadata.rule_name).inc()

    # A JOB_STARTED for this job may have been handled first.
    if job_id not in self.deferred_starts:
        return
    self.deferred_starts.remove(job_id)
    self._record_job_start(metadata)
|
|
315
|
+
|
|
316
|
+
def _handle_job_started(self, record: logging.LogRecord) -> None:
|
|
317
|
+
job_ids = getattr(record, "job_ids", [])
|
|
318
|
+
|
|
319
|
+
if not job_ids:
|
|
320
|
+
job_ids = getattr(record, "jobs", [])
|
|
321
|
+
|
|
322
|
+
if isinstance(job_ids, int):
|
|
323
|
+
job_ids = [job_ids]
|
|
324
|
+
elif not job_ids and hasattr(record, "jobid"):
|
|
325
|
+
jid = getattr(record, "jobid")
|
|
326
|
+
if jid is not None:
|
|
327
|
+
job_ids = [jid]
|
|
328
|
+
|
|
329
|
+
for job_id in job_ids:
|
|
330
|
+
if not isinstance(job_id, int):
|
|
331
|
+
continue
|
|
332
|
+
|
|
333
|
+
if job_id in self.job_registry:
|
|
334
|
+
self._record_job_start(self.job_registry[job_id])
|
|
335
|
+
else:
|
|
336
|
+
self.deferred_starts.add(job_id)
|
|
337
|
+
|
|
338
|
+
def _record_job_start(self, metadata: JobMetadata) -> None:
    """Count the job as running and add its resources to the usage gauges.

    Does nothing for a job that is already in ``running_jobs``, so repeated
    start notifications are not counted twice.
    """
    job_id = metadata.job_id
    if job_id in self.running_jobs:
        return

    self.metric_running.labels(run_id=self.run_id, rule=metadata.rule_name).inc()

    for resource, amount in metadata.resources.items():
        usage = self.metric_resource_usage.labels(run_id=self.run_id, resource=resource)
        usage.inc(amount)

    self.running_jobs.add(job_id)
|
|
349
|
+
|
|
350
|
+
def _handle_job_finished(self, record: logging.LogRecord) -> None:
|
|
351
|
+
job_id = getattr(record, "job_id", getattr(record, "jobid", None))
|
|
352
|
+
if job_id is not None:
|
|
353
|
+
self._finalize_job(job_id, "finished")
|
|
354
|
+
|
|
355
|
+
def _handle_job_error(self, record: logging.LogRecord) -> None:
|
|
356
|
+
job_id = getattr(record, "jobid", None)
|
|
357
|
+
if job_id is not None:
|
|
358
|
+
self._finalize_job(job_id, "failed")
|
|
359
|
+
|
|
360
|
+
def _finalize_job(self, job_id: int, final_status: str) -> None:
|
|
361
|
+
job_info = self.job_registry.pop(job_id, None)
|
|
362
|
+
|
|
363
|
+
if job_id in self.deferred_starts:
|
|
364
|
+
self.deferred_starts.remove(job_id)
|
|
365
|
+
|
|
366
|
+
if job_info:
|
|
367
|
+
rule = job_info.rule_name
|
|
368
|
+
|
|
369
|
+
self.metric_finished.labels(run_id=self.run_id, status=final_status, rule=rule).inc()
|
|
370
|
+
|
|
371
|
+
self.metric_submitted.labels(run_id=self.run_id, rule=rule).dec()
|
|
372
|
+
|
|
373
|
+
if job_id in self.running_jobs:
|
|
374
|
+
self.metric_running.labels(run_id=self.run_id, rule=rule).dec()
|
|
375
|
+
|
|
376
|
+
for res_name, res_value in job_info.resources.items():
|
|
377
|
+
self.metric_resource_usage.labels(run_id=self.run_id, resource=res_name).dec(res_value)
|
|
378
|
+
|
|
379
|
+
self.running_jobs.remove(job_id)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: snakemake-logger-plugin-prometheus
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Snakemake logger plugin that exposes job info to prometheus.
|
|
5
|
+
Project-URL: repository, https://github.com/tedil/snakemake-logger-plugin-prometheus
|
|
6
|
+
Author-email: Till Hartmann <till.hartmann@bih-charite.de>
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: <4.0,>=3.11
|
|
9
|
+
Requires-Dist: prometheus-client<0.24,>=0.23.1
|
|
10
|
+
Requires-Dist: snakemake-interface-logger-plugins<3,>=2.0
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# Snakemake Prometheus Logger Plugin
|
|
14
|
+
|
|
15
|
+
A Snakemake logger plugin that exposes workflow metrics via a Prometheus-compatible HTTP endpoint.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
You can install this plugin via pip:
|
|
20
|
+
|
|
21
|
+
```sh
|
|
22
|
+
pip install snakemake-logger-plugin-prometheus
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or via Pixi/Conda if available in your channels.
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
To use this logger, run Snakemake with the `--logger prometheus` flag.
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
snakemake --logger prometheus ...
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Pull Mode (Default)
|
|
36
|
+
|
|
37
|
+
By default, the metrics server runs on port `8000`. Prometheus scrapes this address.
|
|
38
|
+
You can change the port using the `--logger-prometheus-port` flag:
|
|
39
|
+
|
|
40
|
+
```sh
|
|
41
|
+
snakemake --logger prometheus --logger-prometheus-port 9090 ...
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Push Mode (Pushgateway)
|
|
45
|
+
|
|
46
|
+
If you cannot expose a port or prefer pushing metrics, specify a Pushgateway URL.
|
|
47
|
+
The plugin will push metrics every 15 seconds.
|
|
48
|
+
|
|
49
|
+
```sh
|
|
50
|
+
snakemake --logger prometheus --logger-prometheus-push-gateway http://localhost:9091
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
You can optionally set the job name used in the Pushgateway (default: `"snakemake"`):
|
|
54
|
+
|
|
55
|
+
```sh
|
|
56
|
+
snakemake --logger prometheus --logger-prometheus-push-gateway http://localhost:9091 --logger-prometheus-push-job-name my-workflow
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
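## Available Metrics

In addition to the labels listed below, every metric carries a `run_id` label that identifies the workflow run (taken from the `run_id` setting, or an auto-generated UUID when unset).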
|
|
60
|
+
|
|
61
|
+
### Job Lifecycle
|
|
62
|
+
|
|
63
|
+
| Metric Name | Type | Labels | Description |
|
|
64
|
+
|:--------------------------------|:--------|:-----------------------------------|:----------------------------------------------------------------------------------------------------------|
|
|
65
|
+
| `snakemake_jobs_submitted` | Gauge | `rule` | Jobs known to the scheduler (Queued + Running). Increments when a job is ready, decrements when finished. |
|
|
66
|
+
| `snakemake_jobs_running` | Gauge | `rule` | Jobs actively executing. Increments when execution starts, decrements when finished. |
|
|
67
|
+
| `snakemake_jobs_finished_total` | Counter | `status` (finished/failed), `rule` | Cumulative count of completed jobs. Useful for throughput rates. |
|
|
68
|
+
| `snakemake_jobs_planned_total` | Gauge | `rule` | Total jobs planned for the workflow. Updates dynamically if checkpoints trigger DAG re-evaluation. |
|
|
69
|
+
|
|
70
|
+
### Resources
|
|
71
|
+
|
|
72
|
+
| Metric Name | Type | Labels | Description |
|
|
73
|
+
|:---------------------------------|:------|:-----------|:------------------------------------------------------------------------------------|
|
|
74
|
+
| `snakemake_resource_usage_total` | Gauge | `resource` | Total resources currently consumed by **Running** jobs (e.g., `threads`, `mem_mb`). |
|
|
75
|
+
| `snakemake_resource_limits` | Gauge | `resource` | Maximum resources available to the workflow (CLI limits or cluster capacity). |
|
|
76
|
+
|
|
77
|
+
### Workflow Status
|
|
78
|
+
|
|
79
|
+
| Metric Name | Type | Labels | Description |
|
|
80
|
+
|:------------------------------|:--------|:--------------------|:---------------------------------------|
|
|
81
|
+
| `snakemake_workflow_progress` | Gauge | `type` (done/total) | High-level progress counters. |
|
|
82
|
+
| `snakemake_errors_total` | Counter | None | Total number of workflow-level errors. |
|
|
83
|
+
|
|
84
|
+
## Grafana Query Examples
|
|
85
|
+
|
|
86
|
+
Here are common queries to visualize this data:
|
|
87
|
+
|
|
88
|
+
| Description | Query |
|
|
89
|
+
|:-----------------------------|:---------------------------------------------------------------------------------------------------------------------|
|
|
90
|
+
| **Queued Jobs** | `sum(snakemake_jobs_submitted) - sum(snakemake_jobs_running)` |
|
|
91
|
+
| **Running Jobs** | `sum(snakemake_jobs_running)` |
|
|
92
|
+
| **Throughput (Jobs/Minute)** | `sum(rate(snakemake_jobs_finished_total{status="finished"}[5m])) * 60` |
|
|
93
|
+
| **Resource Utilization %** | `sum(snakemake_resource_usage_total{resource="threads"}) / sum(snakemake_resource_limits{resource="threads"}) * 100` |
|
|
94
|
+
| **Failure Rate** | `sum(rate(snakemake_jobs_finished_total{status="failed"}[5m]))` |
|
|
95
|
+
|
|
96
|
+
## Development
|
|
97
|
+
|
|
98
|
+
This project uses [pixi](https://pixi.sh) for dependency management and [uv](https://docs.astral.sh/uv/) for
|
|
99
|
+
building/publishing.
|
|
100
|
+
|
|
101
|
+
- Install dependencies: `pixi install`
|
|
102
|
+
- Run tests: `pixi run test`
|
|
103
|
+
- Run code formatting and linting: `pixi run qc`
|
|
104
|
+
- Build package: `pixi run build`
|
|
105
|
+
- Publish package: `pixi run publish`
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
snakemake_logger_plugin_prometheus/__init__.py,sha256=IasH6hqoYmhxeVN_iAgtkUtAk79VXFZYJxOBq949dyA,13387
|
|
2
|
+
snakemake_logger_plugin_prometheus-0.1.1.dist-info/METADATA,sha256=eGCHpBS4woIP5Iserw5o94Qha9KvBhTf3AuMR6UuUSg,5360
|
|
3
|
+
snakemake_logger_plugin_prometheus-0.1.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
4
|
+
snakemake_logger_plugin_prometheus-0.1.1.dist-info/licenses/LICENSE,sha256=y8AftXce0Ne8jKczrAZvysqhg9zE5lFB5g7gLZL9FdU,1070
|
|
5
|
+
snakemake_logger_plugin_prometheus-0.1.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Till Hartmann
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|