detectkit 0.5.3__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.5.3/detectkit.egg-info → detectkit-0.6.0}/PKG-INFO +5 -2
- {detectkit-0.5.3 → detectkit-0.6.0}/README.md +4 -1
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/init.py +3 -0
- detectkit-0.6.0/detectkit/cli/commands/unlock.py +105 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/main.py +34 -0
- detectkit-0.6.0/detectkit/database/internal_tables/_tasks.py +151 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/manager.py +33 -21
- {detectkit-0.5.3 → detectkit-0.6.0/detectkit.egg-info}/PKG-INFO +5 -2
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/SOURCES.txt +1 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/pyproject.toml +1 -1
- detectkit-0.5.3/detectkit/database/internal_tables/_tasks.py +0 -87
- {detectkit-0.5.3 → detectkit-0.6.0}/LICENSE +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/MANIFEST.in +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/base.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/email.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/factory.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/telegram.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/webhook.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_base.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_decision.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_dispatch.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_recovery.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_types.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/run.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/test_alert.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/metric_config.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/profile.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/project_config.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/validator.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/core/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/core/interval.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/core/models.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/clickhouse_manager.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_alert_states.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_base.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_detections.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_metrics.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_schema.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/manager.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/manager.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/tables.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/base.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/seasonality.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/iqr.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/mad.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/zscore.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/loaders/metric_loader.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/error_dispatch.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_alert_step.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_base.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_types.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/__init__.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/datetime_utils.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/env_interpolation.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/json_utils.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/stats.py +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/requires.txt +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/requirements.txt +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/setup.cfg +0 -0
- {detectkit-0.5.3 → detectkit-0.6.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: detectkit
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Metric monitoring with automatic anomaly detection
|
|
5
5
|
Author: detectkit team
|
|
6
6
|
License: MIT
|
|
@@ -83,7 +83,7 @@ Dynamic: license-file
|
|
|
83
83
|
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
84
84
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
85
85
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
86
|
-
- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
|
|
86
|
+
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
|
|
87
87
|
|
|
88
88
|
## Installation
|
|
89
89
|
|
|
@@ -112,6 +112,9 @@ dtk run --select cpu_usage
|
|
|
112
112
|
dtk run --select tag:critical
|
|
113
113
|
dtk run --select cpu_usage --steps load,detect
|
|
114
114
|
dtk run --select cpu_usage --from 2024-01-01
|
|
115
|
+
|
|
116
|
+
# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
|
|
117
|
+
dtk unlock --select cpu_usage
|
|
115
118
|
```
|
|
116
119
|
|
|
117
120
|
### Metric Configuration
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
18
18
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
19
19
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
20
|
-
- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
|
|
20
|
+
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
|
|
21
21
|
|
|
22
22
|
## Installation
|
|
23
23
|
|
|
@@ -46,6 +46,9 @@ dtk run --select cpu_usage
|
|
|
46
46
|
dtk run --select tag:critical
|
|
47
47
|
dtk run --select cpu_usage --steps load,detect
|
|
48
48
|
dtk run --select cpu_usage --from 2024-01-01
|
|
49
|
+
|
|
50
|
+
# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
|
|
51
|
+
dtk unlock --select cpu_usage
|
|
49
52
|
```
|
|
50
53
|
|
|
51
54
|
### Metric Configuration
|
|
@@ -260,6 +260,9 @@ dtk run --select cpu_usage --from 2024-01-01
|
|
|
260
260
|
|
|
261
261
|
# Full refresh
|
|
262
262
|
dtk run --select cpu_usage --full-refresh
|
|
263
|
+
|
|
264
|
+
# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
|
|
265
|
+
dtk unlock --select cpu_usage
|
|
263
266
|
```
|
|
264
267
|
|
|
265
268
|
## Documentation
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Implementation of 'dtk unlock' command.
|
|
3
|
+
|
|
4
|
+
Force-releases stale pipeline locks left behind by a run that died without
|
|
5
|
+
releasing them (e.g. the database restarted mid-run). Normally the lock
|
|
6
|
+
auto-expires after its timeout, but this command clears it immediately.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
from detectkit.cli.commands.run import find_project_root, select_metrics
|
|
12
|
+
from detectkit.config.profile import ProfilesConfig
|
|
13
|
+
from detectkit.database.internal_tables import InternalTablesManager
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_unlock(select: str, profile: str | None):
|
|
17
|
+
"""
|
|
18
|
+
Clear pipeline locks for the selected metric(s).
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
select: Metric selector (name, path, or tag) — same semantics as `dtk run`
|
|
22
|
+
profile: Profile name to use (defaults to project's default_profile)
|
|
23
|
+
"""
|
|
24
|
+
# Find project root
|
|
25
|
+
project_root = find_project_root()
|
|
26
|
+
if not project_root:
|
|
27
|
+
click.echo(
|
|
28
|
+
click.style(
|
|
29
|
+
"Error: Not in a detectkit project directory!",
|
|
30
|
+
fg="red",
|
|
31
|
+
bold=True,
|
|
32
|
+
)
|
|
33
|
+
)
|
|
34
|
+
click.echo("Run 'dtk init <project_name>' to create a new project.")
|
|
35
|
+
return
|
|
36
|
+
|
|
37
|
+
click.echo(f"Project root: {project_root}")
|
|
38
|
+
|
|
39
|
+
# Select metrics based on selector
|
|
40
|
+
try:
|
|
41
|
+
metrics = select_metrics(select, project_root)
|
|
42
|
+
except ValueError as e:
|
|
43
|
+
click.echo(click.style(f"Error: {e}", fg="red", bold=True))
|
|
44
|
+
return
|
|
45
|
+
|
|
46
|
+
if not metrics:
|
|
47
|
+
click.echo(
|
|
48
|
+
click.style(
|
|
49
|
+
f"No metrics found matching selector: {select}",
|
|
50
|
+
fg="yellow",
|
|
51
|
+
)
|
|
52
|
+
)
|
|
53
|
+
return
|
|
54
|
+
|
|
55
|
+
# Load profiles.yml
|
|
56
|
+
profiles_path = project_root / "profiles.yml"
|
|
57
|
+
if not profiles_path.exists():
|
|
58
|
+
click.echo(click.style("Error: profiles.yml not found!", fg="red", bold=True))
|
|
59
|
+
click.echo(f"Expected at: {profiles_path}")
|
|
60
|
+
return
|
|
61
|
+
|
|
62
|
+
try:
|
|
63
|
+
profiles_config = ProfilesConfig.from_yaml(profiles_path)
|
|
64
|
+
except Exception as e:
|
|
65
|
+
click.echo(click.style(f"Error loading profiles.yml: {e}", fg="red", bold=True))
|
|
66
|
+
return
|
|
67
|
+
|
|
68
|
+
# Create database / internal tables manager
|
|
69
|
+
try:
|
|
70
|
+
db_manager = profiles_config.create_manager(profile)
|
|
71
|
+
except Exception as e:
|
|
72
|
+
click.echo(click.style(f"Error creating database manager: {e}", fg="red", bold=True))
|
|
73
|
+
return
|
|
74
|
+
|
|
75
|
+
internal_manager = InternalTablesManager(db_manager)
|
|
76
|
+
|
|
77
|
+
click.echo(f"Found {len(metrics)} metric(s) to unlock")
|
|
78
|
+
click.echo()
|
|
79
|
+
|
|
80
|
+
cleared = 0
|
|
81
|
+
for _, config in metrics:
|
|
82
|
+
metric_name = config.name
|
|
83
|
+
try:
|
|
84
|
+
was_locked = internal_manager.clear_lock(metric_name)
|
|
85
|
+
except Exception as e:
|
|
86
|
+
click.echo(
|
|
87
|
+
click.style(f" ✗ {metric_name}: error clearing lock: {e}", fg="red"),
|
|
88
|
+
err=True,
|
|
89
|
+
)
|
|
90
|
+
continue
|
|
91
|
+
|
|
92
|
+
if was_locked:
|
|
93
|
+
cleared += 1
|
|
94
|
+
click.echo(click.style(f" ✓ {metric_name}: lock cleared", fg="green"))
|
|
95
|
+
else:
|
|
96
|
+
click.echo(f" • {metric_name}: no active lock")
|
|
97
|
+
|
|
98
|
+
click.echo()
|
|
99
|
+
click.echo(
|
|
100
|
+
click.style(
|
|
101
|
+
f"Done. Cleared {cleared} lock(s) of {len(metrics)} metric(s).",
|
|
102
|
+
fg="cyan",
|
|
103
|
+
bold=True,
|
|
104
|
+
)
|
|
105
|
+
)
|
|
@@ -182,5 +182,39 @@ def test_alert(metric_name: str, profile: str):
|
|
|
182
182
|
run_test_alert(metric_name=metric_name, profile=profile)
|
|
183
183
|
|
|
184
184
|
|
|
185
|
+
@cli.command()
|
|
186
|
+
@click.option(
|
|
187
|
+
"--select",
|
|
188
|
+
"-s",
|
|
189
|
+
help="Selector for metrics to unlock (metric name, path, or tag)",
|
|
190
|
+
required=True,
|
|
191
|
+
)
|
|
192
|
+
@click.option(
|
|
193
|
+
"--profile",
|
|
194
|
+
help="Profile to use (default: from project config)",
|
|
195
|
+
)
|
|
196
|
+
def unlock(select: str, profile: str):
|
|
197
|
+
"""
|
|
198
|
+
Clear stale pipeline locks for the selected metric(s).
|
|
199
|
+
|
|
200
|
+
Use this to recover from a run that died without releasing its lock
|
|
201
|
+
(e.g. the database restarted mid-run), which would otherwise make
|
|
202
|
+
subsequent runs fail with "Failed to acquire lock ... Use --force".
|
|
203
|
+
|
|
204
|
+
Locks also auto-expire after their timeout, so this is only needed to
|
|
205
|
+
clear a stuck lock immediately. Selector semantics match `dtk run`.
|
|
206
|
+
|
|
207
|
+
Examples:
|
|
208
|
+
# Unlock a single metric
|
|
209
|
+
dtk unlock --select cpu_usage
|
|
210
|
+
|
|
211
|
+
# Unlock everything matching a tag
|
|
212
|
+
dtk unlock --select "tag:critical"
|
|
213
|
+
"""
|
|
214
|
+
from detectkit.cli.commands.unlock import run_unlock
|
|
215
|
+
|
|
216
|
+
run_unlock(select=select, profile=profile)
|
|
217
|
+
|
|
218
|
+
|
|
185
219
|
if __name__ == "__main__":
|
|
186
220
|
cli()
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Task locking mixin: ``_dtk_tasks`` operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
from detectkit.database.internal_tables._base import _InternalTablesBase
|
|
8
|
+
from detectkit.database.tables import TABLE_TASKS
|
|
9
|
+
from detectkit.utils.datetime_utils import now_utc_naive, to_naive_utc
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class _TasksMixin(_InternalTablesBase):
|
|
13
|
+
def acquire_lock(
|
|
14
|
+
self,
|
|
15
|
+
metric_name: str,
|
|
16
|
+
detector_id: str,
|
|
17
|
+
process_type: str,
|
|
18
|
+
timeout_seconds: int = 3600,
|
|
19
|
+
force: bool = False,
|
|
20
|
+
) -> bool:
|
|
21
|
+
"""Try to acquire the task lock; return False if it's actively held.
|
|
22
|
+
|
|
23
|
+
A ``running`` row whose age exceeds its stored ``timeout_seconds`` is
|
|
24
|
+
treated as stale and overridden — the owning process likely died
|
|
25
|
+
without releasing the lock (e.g. the database restarted mid-run), and
|
|
26
|
+
a hung row must never block future runs (TECHNICAL_SPEC.md §13.1).
|
|
27
|
+
|
|
28
|
+
With ``force=True`` the running-status check is skipped entirely and
|
|
29
|
+
the lock is taken unconditionally; the row is still (re)written as
|
|
30
|
+
``running`` so the forced run owns the lock and releases it on exit.
|
|
31
|
+
"""
|
|
32
|
+
if not force and self.check_lock(metric_name, detector_id, process_type) is not None:
|
|
33
|
+
return False
|
|
34
|
+
|
|
35
|
+
self._manager.upsert_task_status(
|
|
36
|
+
metric_name=metric_name,
|
|
37
|
+
detector_id=detector_id,
|
|
38
|
+
process_type=process_type,
|
|
39
|
+
status="running",
|
|
40
|
+
timeout_seconds=timeout_seconds,
|
|
41
|
+
)
|
|
42
|
+
return True
|
|
43
|
+
|
|
44
|
+
def clear_lock(
|
|
45
|
+
self,
|
|
46
|
+
metric_name: str,
|
|
47
|
+
detector_id: str = "pipeline",
|
|
48
|
+
process_type: str = "pipeline",
|
|
49
|
+
) -> bool:
|
|
50
|
+
"""Force-release a (possibly stale) lock; return True if one was held.
|
|
51
|
+
|
|
52
|
+
Used by ``dtk unlock`` to recover from a hung run that left a
|
|
53
|
+
``running`` row behind. The age check is ignored so even a not-yet-
|
|
54
|
+
stale lock is cleared. Marks the task ``completed`` so future runs
|
|
55
|
+
proceed without ``--force``.
|
|
56
|
+
"""
|
|
57
|
+
existing = self.check_lock(metric_name, detector_id, process_type, ignore_timeout=True)
|
|
58
|
+
if existing is None:
|
|
59
|
+
return False
|
|
60
|
+
|
|
61
|
+
self.release_lock(
|
|
62
|
+
metric_name=metric_name,
|
|
63
|
+
detector_id=detector_id,
|
|
64
|
+
process_type=process_type,
|
|
65
|
+
status="completed",
|
|
66
|
+
)
|
|
67
|
+
return True
|
|
68
|
+
|
|
69
|
+
def release_lock(
|
|
70
|
+
self,
|
|
71
|
+
metric_name: str,
|
|
72
|
+
detector_id: str,
|
|
73
|
+
process_type: str,
|
|
74
|
+
status: str,
|
|
75
|
+
last_processed_timestamp: datetime | None = None,
|
|
76
|
+
error_message: str | None = None,
|
|
77
|
+
) -> None:
|
|
78
|
+
"""Mark the task as ``completed`` or ``failed``."""
|
|
79
|
+
self._manager.upsert_task_status(
|
|
80
|
+
metric_name=metric_name,
|
|
81
|
+
detector_id=detector_id,
|
|
82
|
+
process_type=process_type,
|
|
83
|
+
status=status,
|
|
84
|
+
last_processed_timestamp=last_processed_timestamp,
|
|
85
|
+
error_message=error_message,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
def check_lock(
|
|
89
|
+
self,
|
|
90
|
+
metric_name: str,
|
|
91
|
+
detector_id: str,
|
|
92
|
+
process_type: str,
|
|
93
|
+
ignore_timeout: bool = False,
|
|
94
|
+
) -> dict | None:
|
|
95
|
+
"""Return the active running-task row, or ``None`` if no lock is active.
|
|
96
|
+
|
|
97
|
+
A ``running`` row whose age (``now - started_at``) exceeds its stored
|
|
98
|
+
``timeout_seconds`` is considered stale and reported as released
|
|
99
|
+
(returns ``None``), so a hung process never blocks future runs. Pass
|
|
100
|
+
``ignore_timeout=True`` to get the raw running row regardless of age
|
|
101
|
+
(used by ``dtk unlock`` to detect and report even stale locks).
|
|
102
|
+
"""
|
|
103
|
+
full_table_name = self._manager.get_full_table_name(TABLE_TASKS, use_internal=True)
|
|
104
|
+
query = f"""
|
|
105
|
+
SELECT *
|
|
106
|
+
FROM {full_table_name}
|
|
107
|
+
WHERE metric_name = %(metric_name)s
|
|
108
|
+
AND detector_id = %(detector_id)s
|
|
109
|
+
AND process_type = %(process_type)s
|
|
110
|
+
AND status = 'running'
|
|
111
|
+
"""
|
|
112
|
+
results = self._manager.execute_query(
|
|
113
|
+
query,
|
|
114
|
+
{
|
|
115
|
+
"metric_name": metric_name,
|
|
116
|
+
"detector_id": detector_id,
|
|
117
|
+
"process_type": process_type,
|
|
118
|
+
},
|
|
119
|
+
)
|
|
120
|
+
if not results:
|
|
121
|
+
return None
|
|
122
|
+
|
|
123
|
+
row = results[0]
|
|
124
|
+
if ignore_timeout:
|
|
125
|
+
return row
|
|
126
|
+
|
|
127
|
+
started_at = to_naive_utc(row.get("started_at"))
|
|
128
|
+
timeout_seconds = row.get("timeout_seconds")
|
|
129
|
+
if started_at is not None and timeout_seconds is not None:
|
|
130
|
+
elapsed = (now_utc_naive() - started_at).total_seconds()
|
|
131
|
+
if elapsed > timeout_seconds:
|
|
132
|
+
# Stale lock: the owning process never released it. Treat as
|
|
133
|
+
# free so the caller can override it.
|
|
134
|
+
return None
|
|
135
|
+
return row
|
|
136
|
+
|
|
137
|
+
def update_task_progress(
|
|
138
|
+
self,
|
|
139
|
+
metric_name: str,
|
|
140
|
+
detector_id: str,
|
|
141
|
+
process_type: str,
|
|
142
|
+
last_processed_timestamp: datetime,
|
|
143
|
+
) -> None:
|
|
144
|
+
"""Update ``last_processed_timestamp`` for an in-flight task."""
|
|
145
|
+
self._manager.upsert_task_status(
|
|
146
|
+
metric_name=metric_name,
|
|
147
|
+
detector_id=detector_id,
|
|
148
|
+
process_type=process_type,
|
|
149
|
+
status="running",
|
|
150
|
+
last_processed_timestamp=last_processed_timestamp,
|
|
151
|
+
)
|
|
@@ -15,6 +15,12 @@ from detectkit.orchestration.task_manager._detect_step import _DetectStepMixin
|
|
|
15
15
|
from detectkit.orchestration.task_manager._load_step import _LoadStepMixin
|
|
16
16
|
from detectkit.orchestration.task_manager._types import PipelineStep, TaskStatus
|
|
17
17
|
|
|
18
|
+
# Age (seconds) after which a 'running' pipeline lock is considered stale and
|
|
19
|
+
# overridden — see acquire_lock / TECHNICAL_SPEC.md §13.1. A run whose
|
|
20
|
+
# 'running' row is older than this is assumed to have died without releasing
|
|
21
|
+
# the lock (e.g. the database restarted mid-run).
|
|
22
|
+
PIPELINE_LOCK_TIMEOUT_SECONDS = 3600
|
|
23
|
+
|
|
18
24
|
|
|
19
25
|
class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
|
|
20
26
|
"""Drives the load → detect → alert pipeline for a single metric.
|
|
@@ -63,19 +69,23 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
|
|
|
63
69
|
table_name_override=metrics_table_name,
|
|
64
70
|
)
|
|
65
71
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
72
|
+
# Acquire the pipeline lock. A stale 'running' row (older than the
|
|
73
|
+
# timeout) is auto-overridden inside acquire_lock; --force skips the
|
|
74
|
+
# held-lock check but still takes ownership so the lock is released
|
|
75
|
+
# on exit. Done outside the try/finally below so we never release a
|
|
76
|
+
# lock held by another (still-active) process.
|
|
77
|
+
lock_acquired = self.internal.acquire_lock(
|
|
78
|
+
metric_name=metric_name,
|
|
79
|
+
detector_id="pipeline",
|
|
80
|
+
process_type="pipeline",
|
|
81
|
+
timeout_seconds=PIPELINE_LOCK_TIMEOUT_SECONDS,
|
|
82
|
+
force=force,
|
|
83
|
+
)
|
|
84
|
+
if not lock_acquired:
|
|
85
|
+
raise RuntimeError(
|
|
86
|
+
f"Failed to acquire lock for metric '{metric_name}'. "
|
|
87
|
+
"Another task is running. Use --force to override."
|
|
73
88
|
)
|
|
74
|
-
if not lock_acquired:
|
|
75
|
-
raise RuntimeError(
|
|
76
|
-
f"Failed to acquire lock for metric '{metric_name}'. "
|
|
77
|
-
"Another task is running. Use --force to override."
|
|
78
|
-
)
|
|
79
89
|
|
|
80
90
|
try:
|
|
81
91
|
if PipelineStep.LOAD in steps:
|
|
@@ -98,15 +108,17 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
|
|
|
98
108
|
result["steps_completed"].append(PipelineStep.ALERT)
|
|
99
109
|
|
|
100
110
|
finally:
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
111
|
+
# Always release the lock we acquired — including forced runs,
|
|
112
|
+
# so a --force run heals a previously stuck 'running' row
|
|
113
|
+
# instead of leaving it behind.
|
|
114
|
+
status = "completed" if result["status"] == TaskStatus.SUCCESS else "failed"
|
|
115
|
+
self.internal.release_lock(
|
|
116
|
+
metric_name=metric_name,
|
|
117
|
+
detector_id="pipeline",
|
|
118
|
+
process_type="pipeline",
|
|
119
|
+
status=status,
|
|
120
|
+
error_message=result.get("error"),
|
|
121
|
+
)
|
|
110
122
|
|
|
111
123
|
except Exception as exc:
|
|
112
124
|
# Surface the failure with type + message so the CLI/log shows
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: detectkit
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Metric monitoring with automatic anomaly detection
|
|
5
5
|
Author: detectkit team
|
|
6
6
|
License: MIT
|
|
@@ -83,7 +83,7 @@ Dynamic: license-file
|
|
|
83
83
|
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
84
84
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
85
85
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
86
|
-
- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
|
|
86
|
+
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
|
|
87
87
|
|
|
88
88
|
## Installation
|
|
89
89
|
|
|
@@ -112,6 +112,9 @@ dtk run --select cpu_usage
|
|
|
112
112
|
dtk run --select tag:critical
|
|
113
113
|
dtk run --select cpu_usage --steps load,detect
|
|
114
114
|
dtk run --select cpu_usage --from 2024-01-01
|
|
115
|
+
|
|
116
|
+
# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
|
|
117
|
+
dtk unlock --select cpu_usage
|
|
115
118
|
```
|
|
116
119
|
|
|
117
120
|
### Metric Configuration
|
|
@@ -34,6 +34,7 @@ detectkit/cli/commands/__init__.py
|
|
|
34
34
|
detectkit/cli/commands/init.py
|
|
35
35
|
detectkit/cli/commands/run.py
|
|
36
36
|
detectkit/cli/commands/test_alert.py
|
|
37
|
+
detectkit/cli/commands/unlock.py
|
|
37
38
|
detectkit/config/__init__.py
|
|
38
39
|
detectkit/config/metric_config.py
|
|
39
40
|
detectkit/config/profile.py
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
"""Task locking mixin: ``_dtk_tasks`` operations."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from datetime import datetime
|
|
6
|
-
|
|
7
|
-
from detectkit.database.internal_tables._base import _InternalTablesBase
|
|
8
|
-
from detectkit.database.tables import TABLE_TASKS
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class _TasksMixin(_InternalTablesBase):
|
|
12
|
-
def acquire_lock(
|
|
13
|
-
self,
|
|
14
|
-
metric_name: str,
|
|
15
|
-
detector_id: str,
|
|
16
|
-
process_type: str,
|
|
17
|
-
timeout_seconds: int = 3600,
|
|
18
|
-
) -> bool:
|
|
19
|
-
"""Try to acquire the task lock; return False if it's already held."""
|
|
20
|
-
# TODO: respect *timeout_seconds* by treating stale 'running' rows as released.
|
|
21
|
-
if self.check_lock(metric_name, detector_id, process_type):
|
|
22
|
-
return False
|
|
23
|
-
|
|
24
|
-
self._manager.upsert_task_status(
|
|
25
|
-
metric_name=metric_name,
|
|
26
|
-
detector_id=detector_id,
|
|
27
|
-
process_type=process_type,
|
|
28
|
-
status="running",
|
|
29
|
-
timeout_seconds=timeout_seconds,
|
|
30
|
-
)
|
|
31
|
-
return True
|
|
32
|
-
|
|
33
|
-
def release_lock(
|
|
34
|
-
self,
|
|
35
|
-
metric_name: str,
|
|
36
|
-
detector_id: str,
|
|
37
|
-
process_type: str,
|
|
38
|
-
status: str,
|
|
39
|
-
last_processed_timestamp: datetime | None = None,
|
|
40
|
-
error_message: str | None = None,
|
|
41
|
-
) -> None:
|
|
42
|
-
"""Mark the task as ``completed`` or ``failed``."""
|
|
43
|
-
self._manager.upsert_task_status(
|
|
44
|
-
metric_name=metric_name,
|
|
45
|
-
detector_id=detector_id,
|
|
46
|
-
process_type=process_type,
|
|
47
|
-
status=status,
|
|
48
|
-
last_processed_timestamp=last_processed_timestamp,
|
|
49
|
-
error_message=error_message,
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
def check_lock(self, metric_name: str, detector_id: str, process_type: str) -> dict | None:
|
|
53
|
-
"""Return the running-task row, or ``None`` if no lock is active."""
|
|
54
|
-
full_table_name = self._manager.get_full_table_name(TABLE_TASKS, use_internal=True)
|
|
55
|
-
query = f"""
|
|
56
|
-
SELECT *
|
|
57
|
-
FROM {full_table_name}
|
|
58
|
-
WHERE metric_name = %(metric_name)s
|
|
59
|
-
AND detector_id = %(detector_id)s
|
|
60
|
-
AND process_type = %(process_type)s
|
|
61
|
-
AND status = 'running'
|
|
62
|
-
"""
|
|
63
|
-
results = self._manager.execute_query(
|
|
64
|
-
query,
|
|
65
|
-
{
|
|
66
|
-
"metric_name": metric_name,
|
|
67
|
-
"detector_id": detector_id,
|
|
68
|
-
"process_type": process_type,
|
|
69
|
-
},
|
|
70
|
-
)
|
|
71
|
-
return results[0] if results else None
|
|
72
|
-
|
|
73
|
-
def update_task_progress(
|
|
74
|
-
self,
|
|
75
|
-
metric_name: str,
|
|
76
|
-
detector_id: str,
|
|
77
|
-
process_type: str,
|
|
78
|
-
last_processed_timestamp: datetime,
|
|
79
|
-
) -> None:
|
|
80
|
-
"""Update ``last_processed_timestamp`` for an in-flight task."""
|
|
81
|
-
self._manager.upsert_task_status(
|
|
82
|
-
metric_name=metric_name,
|
|
83
|
-
detector_id=detector_id,
|
|
84
|
-
process_type=process_type,
|
|
85
|
-
status="running",
|
|
86
|
-
last_processed_timestamp=last_processed_timestamp,
|
|
87
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|