PyPI - detectkit - Versions diffs - 0.5.3__tar.gz → 0.6.0__tar.gz - Mend

detectkit 0.5.3.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

{detectkit-0.5.3/detectkit.egg-info → detectkit-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: detectkit
-Version: 0.5.3
+Version: 0.6.0
 Summary: Metric monitoring with automatic anomaly detection
 Author: detectkit team
 License: MIT
@@ -83,7 +83,7 @@ Dynamic: license-file
 - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
 - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
 - **Idempotent** — resume from interruptions, no duplicate processing
-- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
+- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
 ## Installation
@@ -112,6 +112,9 @@ dtk run --select cpu_usage
 dtk run --select tag:critical
 dtk run --select cpu_usage --steps load,detect
 dtk run --select cpu_usage --from 2024-01-01
+# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
+dtk unlock --select cpu_usage
 ```
 ### Metric Configuration

{detectkit-0.5.3 → detectkit-0.6.0}/README.md RENAMED Viewed

@@ -17,7 +17,7 @@
 - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
 - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
 - **Idempotent** — resume from interruptions, no duplicate processing
-- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
+- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
 ## Installation
@@ -46,6 +46,9 @@ dtk run --select cpu_usage
 dtk run --select tag:critical
 dtk run --select cpu_usage --steps load,detect
 dtk run --select cpu_usage --from 2024-01-01
+# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
+dtk unlock --select cpu_usage
 ```
 ### Metric Configuration

{detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/commands/init.py RENAMED Viewed

@@ -260,6 +260,9 @@ dtk run --select cpu_usage --from 2024-01-01
 # Full refresh
 dtk run --select cpu_usage --full-refresh
+# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
+dtk unlock --select cpu_usage
 ```
 ## Documentation

detectkit-0.6.0/detectkit/cli/commands/unlock.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""
+Implementation of 'dtk unlock' command.
+Force-releases stale pipeline locks left behind by a run that died without
+releasing them (e.g. the database restarted mid-run). Normally the lock
+auto-expires after its timeout, but this command clears it immediately.
+"""
+import click
+from detectkit.cli.commands.run import find_project_root, select_metrics
+from detectkit.config.profile import ProfilesConfig
+from detectkit.database.internal_tables import InternalTablesManager
+def run_unlock(select: str, profile: str | None):
+    """
+    Clear pipeline locks for the selected metric(s).
+    Args:
+        select: Metric selector (name, path, or tag) — same semantics as `dtk run`
+        profile: Profile name to use (defaults to project's default_profile)
+    """
+    # Find project root
+    project_root = find_project_root()
+    if not project_root:
+        click.echo(
+            click.style(
+                "Error: Not in a detectkit project directory!",
+                fg="red",
+                bold=True,
+            )
+        )
+        click.echo("Run 'dtk init <project_name>' to create a new project.")
+        return
+    click.echo(f"Project root: {project_root}")
+    # Select metrics based on selector
+    try:
+        metrics = select_metrics(select, project_root)
+    except ValueError as e:
+        click.echo(click.style(f"Error: {e}", fg="red", bold=True))
+        return
+    if not metrics:
+        click.echo(
+            click.style(
+                f"No metrics found matching selector: {select}",
+                fg="yellow",
+            )
+        )
+        return
+    # Load profiles.yml
+    profiles_path = project_root / "profiles.yml"
+    if not profiles_path.exists():
+        click.echo(click.style("Error: profiles.yml not found!", fg="red", bold=True))
+        click.echo(f"Expected at: {profiles_path}")
+        return
+    try:
+        profiles_config = ProfilesConfig.from_yaml(profiles_path)
+    except Exception as e:
+        click.echo(click.style(f"Error loading profiles.yml: {e}", fg="red", bold=True))
+        return
+    # Create database / internal tables manager
+    try:
+        db_manager = profiles_config.create_manager(profile)
+    except Exception as e:
+        click.echo(click.style(f"Error creating database manager: {e}", fg="red", bold=True))
+        return
+    internal_manager = InternalTablesManager(db_manager)
+    click.echo(f"Found {len(metrics)} metric(s) to unlock")
+    click.echo()
+    cleared = 0
+    for _, config in metrics:
+        metric_name = config.name
+        try:
+            was_locked = internal_manager.clear_lock(metric_name)
+        except Exception as e:
+            click.echo(
+                click.style(f"  ✗ {metric_name}: error clearing lock: {e}", fg="red"),
+                err=True,
+            )
+            continue
+        if was_locked:
+            cleared += 1
+            click.echo(click.style(f"  ✓ {metric_name}: lock cleared", fg="green"))
+        else:
+            click.echo(f"  • {metric_name}: no active lock")
+    click.echo()
+    click.echo(
+        click.style(
+            f"Done. Cleared {cleared} lock(s) of {len(metrics)} metric(s).",
+            fg="cyan",
+            bold=True,
+        )
+    )

{detectkit-0.5.3 → detectkit-0.6.0}/detectkit/cli/main.py RENAMED Viewed

@@ -182,5 +182,39 @@ def test_alert(metric_name: str, profile: str):
     run_test_alert(metric_name=metric_name, profile=profile)
+@cli.command()
+@click.option(
+    "--select",
+    "-s",
+    help="Selector for metrics to unlock (metric name, path, or tag)",
+    required=True,
+)
+@click.option(
+    "--profile",
+    help="Profile to use (default: from project config)",
+)
+def unlock(select: str, profile: str):
+    """
+    Clear stale pipeline locks for the selected metric(s).
+    Use this to recover from a run that died without releasing its lock
+    (e.g. the database restarted mid-run), which would otherwise make
+    subsequent runs fail with "Failed to acquire lock ... Use --force".
+    Locks also auto-expire after their timeout, so this is only needed to
+    clear a stuck lock immediately. Selector semantics match `dtk run`.
+    Examples:
+        # Unlock a single metric
+        dtk unlock --select cpu_usage
+        # Unlock everything matching a tag
+        dtk unlock --select "tag:critical"
+    """
+    from detectkit.cli.commands.unlock import run_unlock
+    run_unlock(select=select, profile=profile)
 if __name__ == "__main__":
     cli()

detectkit-0.6.0/detectkit/database/internal_tables/_tasks.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""Task locking mixin: ``_dtk_tasks`` operations."""
+from __future__ import annotations
+from datetime import datetime
+from detectkit.database.internal_tables._base import _InternalTablesBase
+from detectkit.database.tables import TABLE_TASKS
+from detectkit.utils.datetime_utils import now_utc_naive, to_naive_utc
+class _TasksMixin(_InternalTablesBase):
+    def acquire_lock(
+        self,
+        metric_name: str,
+        detector_id: str,
+        process_type: str,
+        timeout_seconds: int = 3600,
+        force: bool = False,
+    ) -> bool:
+        """Try to acquire the task lock; return False if it's actively held.
+        A ``running`` row whose age exceeds its stored ``timeout_seconds`` is
+        treated as stale and overridden — the owning process likely died
+        without releasing the lock (e.g. the database restarted mid-run), and
+        a hung row must never block future runs (TECHNICAL_SPEC.md §13.1).
+        With ``force=True`` the running-status check is skipped entirely and
+        the lock is taken unconditionally; the row is still (re)written as
+        ``running`` so the forced run owns the lock and releases it on exit.
+        """
+        if not force and self.check_lock(metric_name, detector_id, process_type) is not None:
+            return False
+        self._manager.upsert_task_status(
+            metric_name=metric_name,
+            detector_id=detector_id,
+            process_type=process_type,
+            status="running",
+            timeout_seconds=timeout_seconds,
+        )
+        return True
+    def clear_lock(
+        self,
+        metric_name: str,
+        detector_id: str = "pipeline",
+        process_type: str = "pipeline",
+    ) -> bool:
+        """Force-release a (possibly stale) lock; return True if one was held.
+        Used by ``dtk unlock`` to recover from a hung run that left a
+        ``running`` row behind. The age check is ignored so even a not-yet-
+        stale lock is cleared. Marks the task ``completed`` so future runs
+        proceed without ``--force``.
+        """
+        existing = self.check_lock(metric_name, detector_id, process_type, ignore_timeout=True)
+        if existing is None:
+            return False
+        self.release_lock(
+            metric_name=metric_name,
+            detector_id=detector_id,
+            process_type=process_type,
+            status="completed",
+        )
+        return True
+    def release_lock(
+        self,
+        metric_name: str,
+        detector_id: str,
+        process_type: str,
+        status: str,
+        last_processed_timestamp: datetime | None = None,
+        error_message: str | None = None,
+    ) -> None:
+        """Mark the task as ``completed`` or ``failed``."""
+        self._manager.upsert_task_status(
+            metric_name=metric_name,
+            detector_id=detector_id,
+            process_type=process_type,
+            status=status,
+            last_processed_timestamp=last_processed_timestamp,
+            error_message=error_message,
+        )
+    def check_lock(
+        self,
+        metric_name: str,
+        detector_id: str,
+        process_type: str,
+        ignore_timeout: bool = False,
+    ) -> dict | None:
+        """Return the active running-task row, or ``None`` if no lock is active.
+        A ``running`` row whose age (``now - started_at``) exceeds its stored
+        ``timeout_seconds`` is considered stale and reported as released
+        (returns ``None``), so a hung process never blocks future runs. Pass
+        ``ignore_timeout=True`` to get the raw running row regardless of age
+        (used by ``dtk unlock`` to detect and report even stale locks).
+        """
+        full_table_name = self._manager.get_full_table_name(TABLE_TASKS, use_internal=True)
+        query = f"""
+        SELECT *
+        FROM {full_table_name}
+        WHERE metric_name = %(metric_name)s
+          AND detector_id = %(detector_id)s
+          AND process_type = %(process_type)s
+          AND status = 'running'
+        """
+        results = self._manager.execute_query(
+            query,
+            {
+                "metric_name": metric_name,
+                "detector_id": detector_id,
+                "process_type": process_type,
+            },
+        )
+        if not results:
+            return None
+        row = results[0]
+        if ignore_timeout:
+            return row
+        started_at = to_naive_utc(row.get("started_at"))
+        timeout_seconds = row.get("timeout_seconds")
+        if started_at is not None and timeout_seconds is not None:
+            elapsed = (now_utc_naive() - started_at).total_seconds()
+            if elapsed > timeout_seconds:
+                # Stale lock: the owning process never released it. Treat as
+                # free so the caller can override it.
+                return None
+        return row
+    def update_task_progress(
+        self,
+        metric_name: str,
+        detector_id: str,
+        process_type: str,
+        last_processed_timestamp: datetime,
+    ) -> None:
+        """Update ``last_processed_timestamp`` for an in-flight task."""
+        self._manager.upsert_task_status(
+            metric_name=metric_name,
+            detector_id=detector_id,
+            process_type=process_type,
+            status="running",
+            last_processed_timestamp=last_processed_timestamp,
+        )

{detectkit-0.5.3 → detectkit-0.6.0}/detectkit/orchestration/task_manager/manager.py RENAMED Viewed

@@ -15,6 +15,12 @@ from detectkit.orchestration.task_manager._detect_step import _DetectStepMixin
 from detectkit.orchestration.task_manager._load_step import _LoadStepMixin
 from detectkit.orchestration.task_manager._types import PipelineStep, TaskStatus
+# Age (seconds) after which a 'running' pipeline lock is considered stale and
+# overridden — see acquire_lock / TECHNICAL_SPEC.md §13.1. A run whose
+# 'running' row is older than this is assumed to have died without releasing
+# the lock (e.g. the database restarted mid-run).
+PIPELINE_LOCK_TIMEOUT_SECONDS = 3600
 class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
     """Drives the load → detect → alert pipeline for a single metric.
@@ -63,19 +69,23 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
                     table_name_override=metrics_table_name,
                 )
-            if not force:
-                # TODO: surface the timeout via ProjectConfig.
-                lock_acquired = self.internal.acquire_lock(
-                    metric_name=metric_name,
-                    detector_id="pipeline",
-                    process_type="pipeline",
-                    timeout_seconds=3600,
+            # Acquire the pipeline lock. A stale 'running' row (older than the
+            # timeout) is auto-overridden inside acquire_lock; --force skips the
+            # held-lock check but still takes ownership so the lock is released
+            # on exit. Done outside the try/finally below so we never release a
+            # lock held by another (still-active) process.
+            lock_acquired = self.internal.acquire_lock(
+                metric_name=metric_name,
+                detector_id="pipeline",
+                process_type="pipeline",
+                timeout_seconds=PIPELINE_LOCK_TIMEOUT_SECONDS,
+                force=force,
+            )
+            if not lock_acquired:
+                raise RuntimeError(
+                    f"Failed to acquire lock for metric '{metric_name}'. "
+                    "Another task is running. Use --force to override."
                 )
-                if not lock_acquired:
-                    raise RuntimeError(
-                        f"Failed to acquire lock for metric '{metric_name}'. "
-                        "Another task is running. Use --force to override."
-                    )
             try:
                 if PipelineStep.LOAD in steps:
@@ -98,15 +108,17 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
                     result["steps_completed"].append(PipelineStep.ALERT)
             finally:
-                if not force:
-                    status = "completed" if result["status"] == TaskStatus.SUCCESS else "failed"
-                    self.internal.release_lock(
-                        metric_name=metric_name,
-                        detector_id="pipeline",
-                        process_type="pipeline",
-                        status=status,
-                        error_message=result.get("error"),
-                    )
+                # Always release the lock we acquired — including forced runs,
+                # so a --force run heals a previously stuck 'running' row
+                # instead of leaving it behind.
+                status = "completed" if result["status"] == TaskStatus.SUCCESS else "failed"
+                self.internal.release_lock(
+                    metric_name=metric_name,
+                    detector_id="pipeline",
+                    process_type="pipeline",
+                    status=status,
+                    error_message=result.get("error"),
+                )
         except Exception as exc:
             # Surface the failure with type + message so the CLI/log shows

{detectkit-0.5.3 → detectkit-0.6.0/detectkit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: detectkit
-Version: 0.5.3
+Version: 0.6.0
 Summary: Metric monitoring with automatic anomaly detection
 Author: detectkit team
 License: MIT
@@ -83,7 +83,7 @@ Dynamic: license-file
 - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
 - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
 - **Idempotent** — resume from interruptions, no duplicate processing
-- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
+- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
 ## Installation
@@ -112,6 +112,9 @@ dtk run --select cpu_usage
 dtk run --select tag:critical
 dtk run --select cpu_usage --steps load,detect
 dtk run --select cpu_usage --from 2024-01-01
+# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
+dtk unlock --select cpu_usage
 ```
 ### Metric Configuration

{detectkit-0.5.3 → detectkit-0.6.0}/detectkit.egg-info/SOURCES.txt RENAMED Viewed

@@ -34,6 +34,7 @@ detectkit/cli/commands/__init__.py
 detectkit/cli/commands/init.py
 detectkit/cli/commands/run.py
 detectkit/cli/commands/test_alert.py
+detectkit/cli/commands/unlock.py
 detectkit/config/__init__.py
 detectkit/config/metric_config.py
 detectkit/config/profile.py

{detectkit-0.5.3 → detectkit-0.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "detectkit"
-version = "0.5.3"
+version = "0.6.0"
 description = "Metric monitoring with automatic anomaly detection"
 readme = "README.md"
 requires-python = ">=3.10"

detectkit-0.5.3/detectkit/database/internal_tables/_tasks.py DELETED Viewed

@@ -1,87 +0,0 @@
-"""Task locking mixin: ``_dtk_tasks`` operations."""
-from __future__ import annotations
-from datetime import datetime
-from detectkit.database.internal_tables._base import _InternalTablesBase
-from detectkit.database.tables import TABLE_TASKS
-class _TasksMixin(_InternalTablesBase):
-    def acquire_lock(
-        self,
-        metric_name: str,
-        detector_id: str,
-        process_type: str,
-        timeout_seconds: int = 3600,
-    ) -> bool:
-        """Try to acquire the task lock; return False if it's already held."""
-        # TODO: respect *timeout_seconds* by treating stale 'running' rows as released.
-        if self.check_lock(metric_name, detector_id, process_type):
-            return False
-        self._manager.upsert_task_status(
-            metric_name=metric_name,
-            detector_id=detector_id,
-            process_type=process_type,
-            status="running",
-            timeout_seconds=timeout_seconds,
-        )
-        return True
-    def release_lock(
-        self,
-        metric_name: str,
-        detector_id: str,
-        process_type: str,
-        status: str,
-        last_processed_timestamp: datetime | None = None,
-        error_message: str | None = None,
-    ) -> None:
-        """Mark the task as ``completed`` or ``failed``."""
-        self._manager.upsert_task_status(
-            metric_name=metric_name,
-            detector_id=detector_id,
-            process_type=process_type,
-            status=status,
-            last_processed_timestamp=last_processed_timestamp,
-            error_message=error_message,
-        )
-    def check_lock(self, metric_name: str, detector_id: str, process_type: str) -> dict | None:
-        """Return the running-task row, or ``None`` if no lock is active."""
-        full_table_name = self._manager.get_full_table_name(TABLE_TASKS, use_internal=True)
-        query = f"""
-        SELECT *
-        FROM {full_table_name}
-        WHERE metric_name = %(metric_name)s
-          AND detector_id = %(detector_id)s
-          AND process_type = %(process_type)s
-          AND status = 'running'
-        """
-        results = self._manager.execute_query(
-            query,
-            {
-                "metric_name": metric_name,
-                "detector_id": detector_id,
-                "process_type": process_type,
-            },
-        )
-        return results[0] if results else None
-    def update_task_progress(
-        self,
-        metric_name: str,
-        detector_id: str,
-        process_type: str,
-        last_processed_timestamp: datetime,
-    ) -> None:
-        """Update ``last_processed_timestamp`` for an in-flight task."""
-        self._manager.upsert_task_status(
-            metric_name=metric_name,
-            detector_id=detector_id,
-            process_type=process_type,
-            status="running",
-            last_processed_timestamp=last_processed_timestamp,
-        )