detectkit 0.5.3__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {detectkit-0.5.3/detectkit.egg-info → detectkit-0.6.0}/PKG-INFO +5 -2
  2. {detectkit-0.5.3 → detectkit-0.6.0}/README.md +4 -1
  3. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/init.py +3 -0
  4. detectkit-0.6.0/detectkit/cli/commands/unlock.py +105 -0
  5. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/main.py +34 -0
  6. detectkit-0.6.0/detectkit/database/internal_tables/_tasks.py +151 -0
  7. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/manager.py +33 -21
  8. {detectkit-0.5.3 → detectkit-0.6.0/detectkit.egg-info}/PKG-INFO +5 -2
  9. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/SOURCES.txt +1 -0
  10. {detectkit-0.5.3 → detectkit-0.6.0}/pyproject.toml +1 -1
  11. detectkit-0.5.3/detectkit/database/internal_tables/_tasks.py +0 -87
  12. {detectkit-0.5.3 → detectkit-0.6.0}/LICENSE +0 -0
  13. {detectkit-0.5.3 → detectkit-0.6.0}/MANIFEST.in +0 -0
  14. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/__init__.py +0 -0
  15. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/__init__.py +0 -0
  16. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/__init__.py +0 -0
  17. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/base.py +0 -0
  18. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/email.py +0 -0
  19. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/factory.py +0 -0
  20. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/mattermost.py +0 -0
  21. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/slack.py +0 -0
  22. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/telegram.py +0 -0
  23. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/channels/webhook.py +0 -0
  24. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/__init__.py +0 -0
  25. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_base.py +0 -0
  26. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
  27. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_decision.py +0 -0
  28. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_dispatch.py +0 -0
  29. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_recovery.py +0 -0
  30. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/_types.py +0 -0
  31. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
  32. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/__init__.py +0 -0
  33. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/__init__.py +0 -0
  34. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/run.py +0 -0
  35. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/test_alert.py +0 -0
  36. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/__init__.py +0 -0
  37. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/metric_config.py +0 -0
  38. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/profile.py +0 -0
  39. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/project_config.py +0 -0
  40. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/config/validator.py +0 -0
  41. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/core/__init__.py +0 -0
  42. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/core/interval.py +0 -0
  43. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/core/models.py +0 -0
  44. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/__init__.py +0 -0
  45. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/clickhouse_manager.py +0 -0
  46. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/__init__.py +0 -0
  47. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_alert_states.py +0 -0
  48. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_base.py +0 -0
  49. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
  50. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_detections.py +0 -0
  51. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_metrics.py +0 -0
  52. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/_schema.py +0 -0
  53. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/internal_tables/manager.py +0 -0
  54. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/manager.py +0 -0
  55. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/database/tables.py +0 -0
  56. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/__init__.py +0 -0
  57. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/base.py +0 -0
  58. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/factory.py +0 -0
  59. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/seasonality.py +0 -0
  60. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/__init__.py +0 -0
  61. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/iqr.py +0 -0
  62. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/mad.py +0 -0
  63. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
  64. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/detectors/statistical/zscore.py +0 -0
  65. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/loaders/__init__.py +0 -0
  66. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/loaders/metric_loader.py +0 -0
  67. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/loaders/query_template.py +0 -0
  68. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/__init__.py +0 -0
  69. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/error_dispatch.py +0 -0
  70. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
  71. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_alert_step.py +0 -0
  72. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_base.py +0 -0
  73. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
  74. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
  75. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/_types.py +0 -0
  76. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/__init__.py +0 -0
  77. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/datetime_utils.py +0 -0
  78. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/env_interpolation.py +0 -0
  79. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/json_utils.py +0 -0
  80. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit/utils/stats.py +0 -0
  81. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/dependency_links.txt +0 -0
  82. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/entry_points.txt +0 -0
  83. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/requires.txt +0 -0
  84. {detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/top_level.txt +0 -0
  85. {detectkit-0.5.3 → detectkit-0.6.0}/requirements.txt +0 -0
  86. {detectkit-0.5.3 → detectkit-0.6.0}/setup.cfg +0 -0
  87. {detectkit-0.5.3 → detectkit-0.6.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: detectkit
3
- Version: 0.5.3
3
+ Version: 0.6.0
4
4
  Summary: Metric monitoring with automatic anomaly detection
5
5
  Author: detectkit team
6
6
  License: MIT
@@ -83,7 +83,7 @@ Dynamic: license-file
83
83
  - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
84
84
  - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
85
85
  - **Idempotent** — resume from interruptions, no duplicate processing
86
- - **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
86
+ - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
87
87
 
88
88
  ## Installation
89
89
 
@@ -112,6 +112,9 @@ dtk run --select cpu_usage
112
112
  dtk run --select tag:critical
113
113
  dtk run --select cpu_usage --steps load,detect
114
114
  dtk run --select cpu_usage --from 2024-01-01
115
+
116
+ # Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
117
+ dtk unlock --select cpu_usage
115
118
  ```
116
119
 
117
120
  ### Metric Configuration
@@ -17,7 +17,7 @@
17
17
  - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
18
18
  - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
19
19
  - **Idempotent** — resume from interruptions, no duplicate processing
20
- - **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
20
+ - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
21
21
 
22
22
  ## Installation
23
23
 
@@ -46,6 +46,9 @@ dtk run --select cpu_usage
46
46
  dtk run --select tag:critical
47
47
  dtk run --select cpu_usage --steps load,detect
48
48
  dtk run --select cpu_usage --from 2024-01-01
49
+
50
+ # Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
51
+ dtk unlock --select cpu_usage
49
52
  ```
50
53
 
51
54
  ### Metric Configuration
@@ -260,6 +260,9 @@ dtk run --select cpu_usage --from 2024-01-01
260
260
 
261
261
  # Full refresh
262
262
  dtk run --select cpu_usage --full-refresh
263
+
264
+ # Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
265
+ dtk unlock --select cpu_usage
263
266
  ```
264
267
 
265
268
  ## Documentation
@@ -0,0 +1,105 @@
1
+ """
2
+ Implementation of 'dtk unlock' command.
3
+
4
+ Force-releases stale pipeline locks left behind by a run that died without
5
+ releasing them (e.g. the database restarted mid-run). Normally the lock
6
+ auto-expires after its timeout, but this command clears it immediately.
7
+ """
8
+
9
+ import click
10
+
11
+ from detectkit.cli.commands.run import find_project_root, select_metrics
12
+ from detectkit.config.profile import ProfilesConfig
13
+ from detectkit.database.internal_tables import InternalTablesManager
14
+
15
+
16
def run_unlock(select: str, profile: str | None):
    """
    Force-release pipeline locks for every metric matched by a selector.

    Each precondition (project root, selector match, ``profiles.yml``,
    database manager) is checked in turn; when one fails a message is
    printed and the function returns without touching any lock. Failures
    while clearing an individual metric's lock are reported on stderr and
    do not stop the remaining metrics from being processed.

    Args:
        select: Metric selector (name, path, or tag) — same semantics as `dtk run`
        profile: Profile name to use (defaults to project's default_profile)
    """

    def _error(message: str) -> None:
        # Shared look for the fatal, pre-flight error messages below.
        click.echo(click.style(message, fg="red", bold=True))

    # Locate the project this command was invoked from.
    root = find_project_root()
    if not root:
        _error("Error: Not in a detectkit project directory!")
        click.echo("Run 'dtk init <project_name>' to create a new project.")
        return

    click.echo(f"Project root: {root}")

    # Resolve the selector into (path, config) pairs.
    try:
        selected = select_metrics(select, root)
    except ValueError as exc:
        _error(f"Error: {exc}")
        return

    if not selected:
        click.echo(click.style(f"No metrics found matching selector: {select}", fg="yellow"))
        return

    # profiles.yml describes how to reach the database holding the locks.
    profiles_file = root / "profiles.yml"
    if not profiles_file.exists():
        _error("Error: profiles.yml not found!")
        click.echo(f"Expected at: {profiles_file}")
        return

    try:
        profiles = ProfilesConfig.from_yaml(profiles_file)
    except Exception as exc:
        _error(f"Error loading profiles.yml: {exc}")
        return

    try:
        database = profiles.create_manager(profile)
    except Exception as exc:
        _error(f"Error creating database manager: {exc}")
        return

    tables = InternalTablesManager(database)

    total = len(selected)
    click.echo(f"Found {total} metric(s) to unlock")
    click.echo()

    cleared = 0
    for _, metric_config in selected:
        name = metric_config.name
        try:
            had_lock = tables.clear_lock(name)
        except Exception as exc:
            # Keep going: one metric failing must not block the others.
            click.echo(
                click.style(f" ✗ {name}: error clearing lock: {exc}", fg="red"),
                err=True,
            )
            continue

        if not had_lock:
            click.echo(f" • {name}: no active lock")
            continue

        cleared += 1
        click.echo(click.style(f" ✓ {name}: lock cleared", fg="green"))

    click.echo()
    summary = f"Done. Cleared {cleared} lock(s) of {total} metric(s)."
    click.echo(click.style(summary, fg="cyan", bold=True))
@@ -182,5 +182,39 @@ def test_alert(metric_name: str, profile: str):
182
182
  run_test_alert(metric_name=metric_name, profile=profile)
183
183
 
184
184
 
185
@cli.command()
@click.option(
    "--select",
    "-s",
    help="Selector for metrics to unlock (metric name, path, or tag)",
    required=True,
)
@click.option(
    "--profile",
    help="Profile to use (default: from project config)",
)
def unlock(select: str, profile: str | None):
    """
    Clear stale pipeline locks for the selected metric(s).

    Use this to recover from a run that died without releasing its lock
    (e.g. the database restarted mid-run), which would otherwise make
    subsequent runs fail with "Failed to acquire lock ... Use --force".

    Locks also auto-expire after their timeout, so this is only needed to
    clear a stuck lock immediately. Selector semantics match `dtk run`.

    \b
    Examples:
        # Unlock a single metric
        dtk unlock --select cpu_usage

    \b
        # Unlock everything matching a tag
        dtk unlock --select "tag:critical"
    """
    # NOTE: the docstring above is the --help text. Each backspace-escape
    # line in it tells Click not to re-wrap the paragraph that follows, so
    # the example commands stay on their own lines instead of being reflowed.
    #
    # `profile` is None when --profile is omitted (the option has no default),
    # which is why it is annotated as optional; run_unlock accepts None.

    # Function-scope import: the command module is only loaded when
    # `dtk unlock` is actually invoked.
    from detectkit.cli.commands.unlock import run_unlock

    run_unlock(select=select, profile=profile)
217
+
218
+
185
219
  if __name__ == "__main__":
186
220
  cli()
@@ -0,0 +1,151 @@
1
+ """Task locking mixin: ``_dtk_tasks`` operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+
7
+ from detectkit.database.internal_tables._base import _InternalTablesBase
8
+ from detectkit.database.tables import TABLE_TASKS
9
+ from detectkit.utils.datetime_utils import now_utc_naive, to_naive_utc
10
+
11
+
12
+ class _TasksMixin(_InternalTablesBase):
13
+ def acquire_lock(
14
+ self,
15
+ metric_name: str,
16
+ detector_id: str,
17
+ process_type: str,
18
+ timeout_seconds: int = 3600,
19
+ force: bool = False,
20
+ ) -> bool:
21
+ """Try to acquire the task lock; return False if it's actively held.
22
+
23
+ A ``running`` row whose age exceeds its stored ``timeout_seconds`` is
24
+ treated as stale and overridden — the owning process likely died
25
+ without releasing the lock (e.g. the database restarted mid-run), and
26
+ a hung row must never block future runs (TECHNICAL_SPEC.md §13.1).
27
+
28
+ With ``force=True`` the running-status check is skipped entirely and
29
+ the lock is taken unconditionally; the row is still (re)written as
30
+ ``running`` so the forced run owns the lock and releases it on exit.
31
+ """
32
+ if not force and self.check_lock(metric_name, detector_id, process_type) is not None:
33
+ return False
34
+
35
+ self._manager.upsert_task_status(
36
+ metric_name=metric_name,
37
+ detector_id=detector_id,
38
+ process_type=process_type,
39
+ status="running",
40
+ timeout_seconds=timeout_seconds,
41
+ )
42
+ return True
43
+
44
+ def clear_lock(
45
+ self,
46
+ metric_name: str,
47
+ detector_id: str = "pipeline",
48
+ process_type: str = "pipeline",
49
+ ) -> bool:
50
+ """Force-release a (possibly stale) lock; return True if one was held.
51
+
52
+ Used by ``dtk unlock`` to recover from a hung run that left a
53
+ ``running`` row behind. The age check is ignored so even a not-yet-
54
+ stale lock is cleared. Marks the task ``completed`` so future runs
55
+ proceed without ``--force``.
56
+ """
57
+ existing = self.check_lock(metric_name, detector_id, process_type, ignore_timeout=True)
58
+ if existing is None:
59
+ return False
60
+
61
+ self.release_lock(
62
+ metric_name=metric_name,
63
+ detector_id=detector_id,
64
+ process_type=process_type,
65
+ status="completed",
66
+ )
67
+ return True
68
+
69
+ def release_lock(
70
+ self,
71
+ metric_name: str,
72
+ detector_id: str,
73
+ process_type: str,
74
+ status: str,
75
+ last_processed_timestamp: datetime | None = None,
76
+ error_message: str | None = None,
77
+ ) -> None:
78
+ """Mark the task as ``completed`` or ``failed``."""
79
+ self._manager.upsert_task_status(
80
+ metric_name=metric_name,
81
+ detector_id=detector_id,
82
+ process_type=process_type,
83
+ status=status,
84
+ last_processed_timestamp=last_processed_timestamp,
85
+ error_message=error_message,
86
+ )
87
+
88
+ def check_lock(
89
+ self,
90
+ metric_name: str,
91
+ detector_id: str,
92
+ process_type: str,
93
+ ignore_timeout: bool = False,
94
+ ) -> dict | None:
95
+ """Return the active running-task row, or ``None`` if no lock is active.
96
+
97
+ A ``running`` row whose age (``now - started_at``) exceeds its stored
98
+ ``timeout_seconds`` is considered stale and reported as released
99
+ (returns ``None``), so a hung process never blocks future runs. Pass
100
+ ``ignore_timeout=True`` to get the raw running row regardless of age
101
+ (used by ``dtk unlock`` to detect and report even stale locks).
102
+ """
103
+ full_table_name = self._manager.get_full_table_name(TABLE_TASKS, use_internal=True)
104
+ query = f"""
105
+ SELECT *
106
+ FROM {full_table_name}
107
+ WHERE metric_name = %(metric_name)s
108
+ AND detector_id = %(detector_id)s
109
+ AND process_type = %(process_type)s
110
+ AND status = 'running'
111
+ """
112
+ results = self._manager.execute_query(
113
+ query,
114
+ {
115
+ "metric_name": metric_name,
116
+ "detector_id": detector_id,
117
+ "process_type": process_type,
118
+ },
119
+ )
120
+ if not results:
121
+ return None
122
+
123
+ row = results[0]
124
+ if ignore_timeout:
125
+ return row
126
+
127
+ started_at = to_naive_utc(row.get("started_at"))
128
+ timeout_seconds = row.get("timeout_seconds")
129
+ if started_at is not None and timeout_seconds is not None:
130
+ elapsed = (now_utc_naive() - started_at).total_seconds()
131
+ if elapsed > timeout_seconds:
132
+ # Stale lock: the owning process never released it. Treat as
133
+ # free so the caller can override it.
134
+ return None
135
+ return row
136
+
137
+ def update_task_progress(
138
+ self,
139
+ metric_name: str,
140
+ detector_id: str,
141
+ process_type: str,
142
+ last_processed_timestamp: datetime,
143
+ ) -> None:
144
+ """Update ``last_processed_timestamp`` for an in-flight task."""
145
+ self._manager.upsert_task_status(
146
+ metric_name=metric_name,
147
+ detector_id=detector_id,
148
+ process_type=process_type,
149
+ status="running",
150
+ last_processed_timestamp=last_processed_timestamp,
151
+ )
@@ -15,6 +15,12 @@ from detectkit.orchestration.task_manager._detect_step import _DetectStepMixin
15
15
  from detectkit.orchestration.task_manager._load_step import _LoadStepMixin
16
16
  from detectkit.orchestration.task_manager._types import PipelineStep, TaskStatus
17
17
 
18
+ # Age (seconds) after which a 'running' pipeline lock is considered stale and
19
+ # overridden — see acquire_lock / TECHNICAL_SPEC.md §13.1. A run whose
20
+ # 'running' row is older than this is assumed to have died without releasing
21
+ # the lock (e.g. the database restarted mid-run).
22
+ PIPELINE_LOCK_TIMEOUT_SECONDS = 3600
23
+
18
24
 
19
25
  class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
20
26
  """Drives the load → detect → alert pipeline for a single metric.
@@ -63,19 +69,23 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
63
69
  table_name_override=metrics_table_name,
64
70
  )
65
71
 
66
- if not force:
67
- # TODO: surface the timeout via ProjectConfig.
68
- lock_acquired = self.internal.acquire_lock(
69
- metric_name=metric_name,
70
- detector_id="pipeline",
71
- process_type="pipeline",
72
- timeout_seconds=3600,
72
+ # Acquire the pipeline lock. A stale 'running' row (older than the
73
+ # timeout) is auto-overridden inside acquire_lock; --force skips the
74
+ # held-lock check but still takes ownership so the lock is released
75
+ # on exit. Done outside the try/finally below so we never release a
76
+ # lock held by another (still-active) process.
77
+ lock_acquired = self.internal.acquire_lock(
78
+ metric_name=metric_name,
79
+ detector_id="pipeline",
80
+ process_type="pipeline",
81
+ timeout_seconds=PIPELINE_LOCK_TIMEOUT_SECONDS,
82
+ force=force,
83
+ )
84
+ if not lock_acquired:
85
+ raise RuntimeError(
86
+ f"Failed to acquire lock for metric '{metric_name}'. "
87
+ "Another task is running. Use --force to override."
73
88
  )
74
- if not lock_acquired:
75
- raise RuntimeError(
76
- f"Failed to acquire lock for metric '{metric_name}'. "
77
- "Another task is running. Use --force to override."
78
- )
79
89
 
80
90
  try:
81
91
  if PipelineStep.LOAD in steps:
@@ -98,15 +108,17 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
98
108
  result["steps_completed"].append(PipelineStep.ALERT)
99
109
 
100
110
  finally:
101
- if not force:
102
- status = "completed" if result["status"] == TaskStatus.SUCCESS else "failed"
103
- self.internal.release_lock(
104
- metric_name=metric_name,
105
- detector_id="pipeline",
106
- process_type="pipeline",
107
- status=status,
108
- error_message=result.get("error"),
109
- )
111
+ # Always release the lock we acquired — including forced runs,
112
+ # so a --force run heals a previously stuck 'running' row
113
+ # instead of leaving it behind.
114
+ status = "completed" if result["status"] == TaskStatus.SUCCESS else "failed"
115
+ self.internal.release_lock(
116
+ metric_name=metric_name,
117
+ detector_id="pipeline",
118
+ process_type="pipeline",
119
+ status=status,
120
+ error_message=result.get("error"),
121
+ )
110
122
 
111
123
  except Exception as exc:
112
124
  # Surface the failure with type + message so the CLI/log shows
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: detectkit
3
- Version: 0.5.3
3
+ Version: 0.6.0
4
4
  Summary: Metric monitoring with automatic anomaly detection
5
5
  Author: detectkit team
6
6
  License: MIT
@@ -83,7 +83,7 @@ Dynamic: license-file
83
83
  - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
84
84
  - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
85
85
  - **Idempotent** — resume from interruptions, no duplicate processing
86
- - **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
86
+ - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
87
87
 
88
88
  ## Installation
89
89
 
@@ -112,6 +112,9 @@ dtk run --select cpu_usage
112
112
  dtk run --select tag:critical
113
113
  dtk run --select cpu_usage --steps load,detect
114
114
  dtk run --select cpu_usage --from 2024-01-01
115
+
116
+ # Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
117
+ dtk unlock --select cpu_usage
115
118
  ```
116
119
 
117
120
  ### Metric Configuration
@@ -34,6 +34,7 @@ detectkit/cli/commands/__init__.py
34
34
  detectkit/cli/commands/init.py
35
35
  detectkit/cli/commands/run.py
36
36
  detectkit/cli/commands/test_alert.py
37
+ detectkit/cli/commands/unlock.py
37
38
  detectkit/config/__init__.py
38
39
  detectkit/config/metric_config.py
39
40
  detectkit/config/profile.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "detectkit"
7
- version = "0.5.3"
7
+ version = "0.6.0"
8
8
  description = "Metric monitoring with automatic anomaly detection"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,87 +0,0 @@
1
- """Task locking mixin: ``_dtk_tasks`` operations."""
2
-
3
- from __future__ import annotations
4
-
5
- from datetime import datetime
6
-
7
- from detectkit.database.internal_tables._base import _InternalTablesBase
8
- from detectkit.database.tables import TABLE_TASKS
9
-
10
-
11
class _TasksMixin(_InternalTablesBase):
    """Lock bookkeeping on top of the ``_dtk_tasks`` internal table.

    A lock is a row with ``status = 'running'``, written and updated by
    upserting through ``self._manager``.
    """

    def acquire_lock(
        self,
        metric_name: str,
        detector_id: str,
        process_type: str,
        timeout_seconds: int = 3600,
    ) -> bool:
        """Take the task lock; return ``False`` when a running row already exists."""
        # TODO: respect *timeout_seconds* by treating stale 'running' rows as released.
        holder = self.check_lock(metric_name, detector_id, process_type)
        if holder:
            return False

        self._manager.upsert_task_status(
            metric_name=metric_name,
            detector_id=detector_id,
            process_type=process_type,
            status="running",
            timeout_seconds=timeout_seconds,
        )
        return True

    def release_lock(
        self,
        metric_name: str,
        detector_id: str,
        process_type: str,
        status: str,
        last_processed_timestamp: datetime | None = None,
        error_message: str | None = None,
    ) -> None:
        """Write the final task state (``completed`` or ``failed``) for the lock row."""
        self._manager.upsert_task_status(
            metric_name=metric_name,
            detector_id=detector_id,
            process_type=process_type,
            status=status,
            last_processed_timestamp=last_processed_timestamp,
            error_message=error_message,
        )

    def check_lock(self, metric_name: str, detector_id: str, process_type: str) -> dict | None:
        """Return the first ``running`` row for the task, or ``None`` if there is none."""
        tasks_table = self._manager.get_full_table_name(TABLE_TASKS, use_internal=True)
        params = {
            "metric_name": metric_name,
            "detector_id": detector_id,
            "process_type": process_type,
        }
        rows = self._manager.execute_query(
            f"""
            SELECT *
            FROM {tasks_table}
            WHERE metric_name = %(metric_name)s
              AND detector_id = %(detector_id)s
              AND process_type = %(process_type)s
              AND status = 'running'
            """,
            params,
        )
        if not rows:
            return None
        return rows[0]

    def update_task_progress(
        self,
        metric_name: str,
        detector_id: str,
        process_type: str,
        last_processed_timestamp: datetime,
    ) -> None:
        """Record progress for an in-flight task by re-upserting its ``running`` row."""
        self._manager.upsert_task_status(
            metric_name=metric_name,
            detector_id=detector_id,
            process_type=process_type,
            status="running",
            last_processed_timestamp=last_processed_timestamp,
        )
File without changes
File without changes
File without changes
File without changes
File without changes