PyPI - dccd - Versions diffs - 3.5.0__tar.gz → 3.5.2__tar.gz - Mend

dccd 3.5.0__tar.gz → 3.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

{dccd-3.5.0 → dccd-3.5.2}/CHANGELOG.md RENAMED Viewed

@@ -16,6 +16,59 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed
+## [3.5.2] - 2026-06-12
+### Added
+- Boot-time runs.db retention (`settings.runs_retention_days`, default 90,
+  `0` disables): terminal non-failed runs (`succeeded`/`stale`/`cancelled`)
+  older than the window are deleted and the database VACUUMed at daemon
+  start, right after the orphan sweep; `failed` runs are kept as the
+  long-term error journal. Verified on a copy of the production runs.db:
+  1,770 old rows pruned, file size −67 %, `failed` rows untouched. (#154)
+### Fixed
+- Webhook alerts send a plain-text body with `X-Title: dccd` /
+  `X-Priority: high` headers for ntfy-style endpoints — the phone showed a
+  raw JSON blob before; Slack webhooks (`hooks.slack.com`) keep the JSON
+  `{"text": …}` payload. Verified live: one test message delivered to the
+  production ntfy topic (HTTP 200) rendered as plain text. (#155)
+- Manual backfill triggers (`POST /api/backfill`, `/api/jobs/run`,
+  `/api/jobs/run-all`) are idempotent: a spec that is already being
+  backfilled returns the existing `run_id` (`status: already-running`) —
+  run-all skips busy jobs and lists them under `already_running` — instead
+  of starting a duplicate concurrent run that wasted exchange requests and
+  confused runs/progress. (#153)
+- Off-box sync no longer mirrors deletions: `RemoteStorage` runs
+  `rclone copy` instead of `rclone sync`, so locally purged files survive
+  on the remote for read-through restore — enabling `min_free_gb` no longer
+  risks deleting the only copy of old data. The remote is now an archive
+  superset (never deleted automatically; remote cleanup is manual).
+  Verified live against a real rclone remote: purge → sync → file survives
+  → `restore()` returns byte-identical content. (#152)
+## [3.5.1] - 2026-06-12
+### Fixed
+- `dccd start` marked its own just-started stream runs `stale` at boot: the
+  orphan sweep (`mark_stale_running`) ran in the FastAPI lifespan *after*
+  `cmd_start` had already started the scheduler's stream workers, so their
+  fresh `running` rows were swept as "orphaned by daemon restart" and the
+  Dashboard "Active now" never showed streams. The sweep now runs in
+  `cmd_start` before the scheduler starts; the lifespan only sweeps in
+  standalone `dccd ui`. Verified live across two daemon launches: the live
+  run stays `running`; a restart stales only the previous one. (#145)
+- OKX OHLC pagination silently dropped the bar at every 100-bar page
+  boundary: OKX `before`/`after` cursors are exclusive, so passing
+  `before=start_ms` excluded the bar exactly at each window start (observed
+  in production as 431 one-minute gaps per OKX pair, spaced exactly
+  100 min). `fetch_ohlc_page` now sends `before=start_ms-1`; regression
+  test drives the paginator across a page boundary under faithful exclusive
+  semantics. Verified live: a 12 h OKX 1m backfill lands with 0 gaps and
+  all 7 boundary bars present. (#144)
 ## [3.5.0] - 2026-06-11
 ### Added

{dccd-3.5.0 → dccd-3.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dccd
-Version: 3.5.0
+Version: 3.5.2
 Summary: Download Crypto Currency Data — hexagonal architecture, async-first.
 Author-email: Arthur Bernard <arthur.bernard.92@gmail.com>
 License: MIT

{dccd-3.5.0 → dccd-3.5.2}/dccd/application/config.py RENAMED Viewed

@@ -46,6 +46,9 @@ class SettingsConfig(BaseModel):
     - ``ui_trusted_proxy`` — trust ``X-Forwarded-For`` as the rate-limit client key.
       Enable **only** behind a reverse proxy that overwrites the header, else a direct
       client can forge it and bypass the limit.
+    - ``runs_retention_days`` — delete terminal non-failed runs (``succeeded``,
+      ``stale``, ``cancelled``) older than this many days at daemon boot.  ``0``
+      disables the sweep (rows accumulate indefinitely).  Default ``90``.
     """
     data_path: str = "./data/crypto"
     timezone: str = "local"
@@ -56,6 +59,7 @@ class SettingsConfig(BaseModel):
     ui_readonly: bool = False
     ui_rate_limit: int = 0
     ui_trusted_proxy: bool = False
+    runs_retention_days: int = 90
     @field_validator("data_path")
     @classmethod
@@ -69,6 +73,13 @@ class SettingsConfig(BaseModel):
             raise ValueError("ui_rate_limit must be >= 0")
         return v
+    @field_validator("runs_retention_days")
+    @classmethod
+    def _non_negative_retention(cls, v: int) -> int:
+        if v < 0:
+            raise ValueError("runs_retention_days must be >= 0")
+        return v
     @field_validator("timezone")
     @classmethod
     def _validate_tz(cls, v: str) -> str:

{dccd-3.5.0 → dccd-3.5.2}/dccd/application/monitor.py RENAMED Viewed

@@ -26,6 +26,17 @@ class HealthMonitor:
     job does not flood a webhook.  The count (and cooldown) reset on the first
     success.
+    Webhook format
+    --------------
+    - **Slack** (hostname is or ends with ``hooks.slack.com``): JSON body
+      ``{"text": msg}`` with ``Content-Type: application/json`` — the current
+      behaviour.
+    - **All other endpoints** (e.g. ntfy): plain-text body with headers
+      ``Content-Type: text/plain``, ``X-Title: dccd``, and
+      ``X-Priority: high``.  ntfy renders the raw body as the notification
+      message; the old JSON blob caused the phone to show ``{"text": "…"}``
+      instead of the actual alert.
     Parameters
     ----------
     runs_store : RunsStore
@@ -77,25 +88,43 @@ class HealthMonitor:
             self._last_alert_ts.pop(key, None)
             self._last_webhook_err_ts.pop(key, None)
+    def _post_webhook(self, msg: str, run_id: str) -> None:
+        """Send *msg* to ``self._webhook``, choosing the right payload format.
+        Slack endpoints (hostname ``hooks.slack.com`` or subdomains) receive
+        ``{"text": msg}`` JSON.  All other endpoints (ntfy, custom) receive a
+        plain-text body so the notification shows the message directly.
+        """
+        import json
+        import urllib.parse
+        import urllib.request
+        assert self._webhook is not None  # caller (_alert) guards this
+        hostname: str = urllib.parse.urlsplit(self._webhook).hostname or ""
+        if hostname == "hooks.slack.com" or hostname.endswith(".hooks.slack.com"):
+            data = json.dumps({"text": msg}).encode()
+            headers: dict[str, str] = {"Content-Type": "application/json"}
+        else:
+            data = msg.encode()
+            headers = {
+                "Content-Type": "text/plain",
+                "X-Title": "dccd",
+                "X-Priority": "high",
+            }
+        req = urllib.request.Request(self._webhook, data=data, headers=headers)
+        try:
+            with urllib.request.urlopen(req, timeout=5):
+                pass
+        except Exception as exc:
+            # Log webhook-send failures at most once per cooldown window.
+            last_err = self._last_webhook_err_ts.get(run_id, 0.0)
+            if time.monotonic() - last_err >= _ALERT_COOLDOWN_S:
+                logger.warning("Webhook alert failed: %s", exc)
+                self._last_webhook_err_ts[run_id] = time.monotonic()
     def _alert(self, run_id: str, count: int) -> None:
         msg = f"dccd alert: {run_id} failed {count} times consecutively."
         logger.error(msg)
         self._last_alert_ts[run_id] = time.monotonic()
         if self._webhook:
-            try:
-                import json
-                import urllib.request
-                data = json.dumps({"text": msg}).encode()
-                req = urllib.request.Request(
-                    self._webhook,
-                    data=data,
-                    headers={"Content-Type": "application/json"},
-                )
-                with urllib.request.urlopen(req, timeout=5):
-                    pass
-            except Exception as exc:
-                # Log webhook-send failures at most once per cooldown window.
-                last_err = self._last_webhook_err_ts.get(run_id, 0.0)
-                if time.monotonic() - last_err >= _ALERT_COOLDOWN_S:
-                    logger.warning("Webhook alert failed: %s", exc)
-                    self._last_webhook_err_ts[run_id] = time.monotonic()
+            self._post_webhook(msg, run_id)

{dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/api/app.py RENAMED Viewed

@@ -191,12 +191,20 @@ def create_app(
         app.state.config_path = config_path
         app.state.store = build_store(cfg.settings.data_path)
         app.state.runs_store = build_runs_store(cfg.settings.data_path)
-        # Mark any runs left in 'running' state as 'stale' — they were
-        # orphaned by the previous daemon crash/SIGKILL and would otherwise
-        # pollute active_runs() and the Dashboard forever.
-        _stale_count = app.state.runs_store.mark_stale_running()
-        if _stale_count:
-            logger.warning("marked %d orphaned run(s) stale (daemon restarted)", _stale_count)
+        # Mark any runs left in 'running' state as 'stale'.  For standalone
+        # `dccd ui` (scheduler is None) this is the right call site — no
+        # scheduler has started any workers yet.  For `dccd start` the sweep
+        # happens in cmd_start *before* the scheduler starts, so we must skip
+        # it here: sweeping at this point would stale-out the stream-worker
+        # rows the scheduler just created (the exact misuse warned about in
+        # RunsStore.mark_stale_running's docstring).
+        if scheduler is None:
+            _stale_count = app.state.runs_store.mark_stale_running()
+            if _stale_count:
+                logger.warning("marked %d orphaned run(s) stale (daemon restarted)", _stale_count)
+            _pruned_count = app.state.runs_store.prune_old_runs(cfg.settings.runs_retention_days)
+            if _pruned_count:
+                logger.info("pruned %d old terminal run(s) from runs.db (retention: %dd)", _pruned_count, cfg.settings.runs_retention_days)
         app.state.coverage_store = build_coverage_store(cfg.settings.data_path)
         app.state.event_bus = EventBus()
         app.state.registry = build_registry()
@@ -443,6 +451,14 @@ def create_app(
         await request.app.state.scheduler.sync_streams(request.app.state.all_specs)
         await request.app.state.scheduler.sync_intervals(request.app.state.all_specs)
+    async def _active_run_for(request: Request, spec_id: str) -> str | None:
+        """Return the run_id of the first active run matching *spec_id*, or None."""
+        active = await asyncio.to_thread(_runs(request).active_runs)
+        for row in active:
+            if row.get("spec_id") == spec_id:
+                return str(row["run_id"])
+        return None
     def _run_backfill_tracked(request: Request, spec: JobSpec, run_id: str) -> None:
         """Spawn a backfill with a cancellable stop event registered by run_id."""
         reg = _reg(request)
@@ -580,6 +596,11 @@ def create_app(
             origin="runtime",
         )
+        # Idempotency guard: if the spec is already running, return the existing run_id.
+        existing = await _active_run_for(request, spec.id)
+        if existing is not None:
+            return {"run_id": existing, "status": "already-running"}
         # Generate a URL-safe run_id and pass it into backfill() so both the
         # API polling endpoint and the RunsStore use the same identifier.
         # We use a short UUID (no slashes) instead of embedding spec.id which
@@ -683,6 +704,11 @@ def create_app(
         if spec.operation != "backfill":
             raise HTTPException(400, "Only backfill jobs can be triggered manually; use /api/streams/start for stream jobs")
+        # Idempotency guard: if the spec is already running, return the existing run_id.
+        existing = await _active_run_for(request, spec.id)
+        if existing is not None:
+            return {"run_id": existing, "status": "already-running", "job_id": job_id}
         run_id = str(_uuid.uuid4())
         _run_backfill_tracked(request, spec, run_id)
@@ -696,12 +722,17 @@ def create_app(
         backfill_specs = [s for s in specs if s.operation == "backfill" and s.enabled]
         run_ids = []
+        already_running = []
         for spec in backfill_specs:
+            existing = await _active_run_for(request, spec.id)
+            if existing is not None:
+                already_running.append({"run_id": existing, "job_id": spec.id})
+                continue
             run_id = str(_uuid.uuid4())
             run_ids.append({"run_id": run_id, "job_id": spec.id})
             _run_backfill_tracked(request, spec, run_id)
-        return {"started": len(run_ids), "runs": run_ids}
+        return {"started": len(run_ids), "runs": run_ids, "already_running": already_running}
     # -----------------------------------------------------------------------
     # Config

{dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/cli/main.py RENAMED Viewed

@@ -160,6 +160,16 @@ def cmd_start(
     cfg, cfg_path = _load_cfg(config)
     store = build_store(cfg.settings.data_path)
     runs_store = build_runs_store(cfg.settings.data_path)
+    # Sweep orphaned runs *before* the scheduler starts any stream workers so
+    # the lifespan (which opens the same DB) does not stale-out legitimate rows
+    # that the scheduler just created.  This is the correct boot-path call site
+    # warned about in RunsStore.mark_stale_running's docstring.
+    _stale = runs_store.mark_stale_running()
+    if _stale > 0:
+        typer.echo(f"Marked {_stale} orphaned run(s) stale (daemon restarted)")
+    _pruned = runs_store.prune_old_runs(cfg.settings.runs_retention_days)
+    if _pruned > 0:
+        typer.echo(f"Pruned {_pruned} old terminal run(s) from runs.db (retention: {cfg.settings.runs_retention_days}d)")
     coverage_store = build_coverage_store(cfg.settings.data_path)
     registry = build_registry()
     bus = EventBus()

{dccd-3.5.0 → dccd-3.5.2}/dccd/sources/okx.py RENAMED Viewed

@@ -102,10 +102,12 @@ class OKXSource(
             return []
         pair = self.render_symbol(symbol)
+        # OKX `before`/`after` are exclusive bounds. Without `- 1` the bar
+        # exactly at each page-window start would be silently dropped.
         params: dict[str, Any] = {
             "instId": pair,
             "bar": bar,
-            "before": str(start_ns // 1_000_000),
+            "before": str(start_ns // 1_000_000 - 1),
             "after": str(end_ns // 1_000_000),
             "limit": min(limit, 100),
         }

{dccd-3.5.0 → dccd-3.5.2}/dccd/storage/purge.py RENAMED Viewed

@@ -2,9 +2,11 @@
 pressure.
 **Safety contract**: this deletes local data that is recoverable only from the
-remote, so it must be called **only when the remote mirror is up to date** — in
+remote, so it must be called **only when the remote archive is up to date** — in
 practice, right after a successful :func:`~dccd.application.operations.sync_remote`
-cycle. The coverage manifest (``CoverageStore``) preserves the resume cursor, so a
+cycle.  Because :class:`~dccd.storage.remote.RemoteStorage` uses ``rclone copy``
+(never ``rclone sync``), purged files remain on the remote and can be pulled
+back by :meth:`~dccd.storage.remote.RemoteStorage.restore`. The coverage manifest (``CoverageStore``) preserves the resume cursor, so a
 later ``backfill(start="last")`` still continues from where collection left off;
 reads of purged ranges return what's local until read-through restore pulls them
 back.
@@ -41,7 +43,7 @@ def purge_to_free_space(
     """Delete oldest Parquet files until free space reaches ``min_free_gb``.
     Files are removed oldest-first (by mtime), so recent data stays local while
-    old data — already mirrored off-box — is dropped. The ``.dccd`` directory is
+    old data — already copied off-box — is dropped. The ``.dccd`` directory is
     excluded. No-op when ``min_free_gb <= 0`` or free space is already above the
     threshold.

{dccd-3.5.0 → dccd-3.5.2}/dccd/storage/remote.py RENAMED Viewed

@@ -1,4 +1,17 @@
-"""Remote storage sync via rclone."""
+"""Remote storage copy via rclone.
+The remote is an *archive superset* of the local store: files are copied
+off-box but never deleted remotely.  Local = hot tier (fast access, space
+pressure managed by the purge subsystem); remote = complete history archive.
+This means:
+- A local file that is purged to free disk space still exists on the remote
+  and can be pulled back by :meth:`RemoteStorage.restore` (read-through
+  restore).
+- Remote cleanup (removing datasets you no longer want) is a **manual**
+  operation — use ``rclone delete`` or the provider console.  dccd will
+  never delete from the remote automatically.
+"""
 from __future__ import annotations
@@ -13,12 +26,18 @@ logger = logging.getLogger(__name__)
 class RemoteStorage:
-    """Sync local data to one or more rclone remotes.
+    """Copy local data to one or more rclone remotes (non-destructive).
+    Each sync cycle runs ``rclone copy`` (not ``rclone sync``) so that files
+    present on the remote but absent locally — e.g. files purged from the hot
+    tier to reclaim disk — are **never deleted**.  The remote grows
+    monotonically and acts as a complete off-box archive; :meth:`restore` pulls
+    individual dataset directories back on demand.
     Parameters
     ----------
     local_path : str or Path
-        Local data directory to sync.
+        Local data directory to copy from.
     remotes : list of dicts
         Each dict has ``provider`` and ``remote`` keys.
     """
@@ -32,24 +51,30 @@ class RemoteStorage:
         self._remotes = remotes or []
     def sync_one(self, remote: str) -> bool:
-        """Sync to a single rclone remote. Returns True on success."""
+        """Copy local store to a single rclone remote.  Returns True on success.
+        Uses ``rclone copy`` (not ``rclone sync``) so files that exist on the
+        remote but are absent locally are preserved.  This guarantees that
+        files purged from the local hot tier remain available on the remote for
+        read-through :meth:`restore`.
+        """
         try:
             result = subprocess.run(
-                ["rclone", "sync", str(self._local), remote, "--quiet"],
+                ["rclone", "copy", str(self._local), remote, "--quiet"],
                 capture_output=True,
                 text=True,
                 timeout=300,
             )
             if result.returncode != 0:
-                logger.error("rclone sync to %s failed: %s", remote, result.stderr)
+                logger.error("rclone copy to %s failed: %s", remote, result.stderr)
                 return False
-            logger.info("Synced to %s", remote)
+            logger.info("Copied to %s", remote)
             return True
         except FileNotFoundError:
             logger.error("rclone not found in PATH")
             return False
         except subprocess.TimeoutExpired:
-            logger.error("rclone sync to %s timed out", remote)
+            logger.error("rclone copy to %s timed out", remote)
             return False
     def restore(self, rel_path: str) -> bool:

{dccd-3.5.0 → dccd-3.5.2}/dccd/storage/runs_sqlite.py RENAMED Viewed

@@ -178,6 +178,59 @@ class RunsStore:
         """Runs currently ``running`` or ``reconnecting``."""
         return self.list_runs(state="running") + self.list_runs(state="reconnecting")
+    def prune_old_runs(self, retention_days: int) -> int:
+        """Delete terminal non-failed runs older than *retention_days* days.
+        Runs in states ``succeeded``, ``stale``, and ``cancelled`` that started
+        more than *retention_days* days ago are removed.  ``failed`` rows are
+        intentionally kept as the long-term error journal.  The database is
+        ``VACUUM``-ed after any deletion to reclaim disk space.
+        Parameters
+        ----------
+        retention_days : int
+            Number of days to retain terminal non-failed runs.  Pass ``0`` (or
+            any value ``<= 0``) to disable pruning; the method returns ``0``
+            immediately without touching the database.
+        Returns
+        -------
+        int
+            Number of rows deleted (0 when pruning is disabled or when no rows
+            match the cutoff).
+        Notes
+        -----
+        ``VACUUM`` cannot run inside a transaction.  This method opens a
+        separate connection (outside the :meth:`_conn` context manager) for the
+        ``VACUUM`` statement, which is executed only when at least one row was
+        deleted.
+        This method must be called from the daemon boot path *after*
+        :meth:`mark_stale_running` so that freshly-staled orphans age normally
+        rather than being immediately pruned on the next boot.
+        """
+        if retention_days <= 0:
+            return 0
+        import time
+        cutoff_ns = int(time.time() * 1_000_000_000) - int(retention_days * 86400 * 1_000_000_000)
+        with self._conn() as conn:
+            cursor = conn.execute(
+                """DELETE FROM runs
+                   WHERE state IN ('succeeded', 'stale', 'cancelled')
+                   AND started_at < ?""",
+                (cutoff_ns,),
+            )
+            deleted = cursor.rowcount
+        if deleted > 0:
+            # VACUUM cannot run inside a transaction — open a plain connection.
+            conn2 = sqlite3.connect(str(self._path))
+            try:
+                conn2.execute("VACUUM")
+            finally:
+                conn2.close()
+        return deleted
     def mark_stale_running(self) -> int:
         """Transition all ``running`` rows to ``stale`` at daemon boot.
@@ -201,9 +254,12 @@ class RunsStore:
         clearly attributes the state change to a restart rather than a normal
         completion or a user-visible error.
-        This method must only be called from the daemon boot path (FastAPI
-        lifespan startup).  Calling it while a daemon is live would incorrectly
-        stale-out its legitimate active runs.
+        This method must only be called from the daemon boot path, before any
+        new runs are started: ``cmd_start`` for ``dccd start`` (called before
+        the scheduler starts stream workers); the FastAPI lifespan for
+        standalone ``dccd ui`` (called before the standalone scheduler is
+        created).  Calling it while workers are already running would
+        incorrectly stale-out their legitimate active runs.
         """
         import time
         now = int(time.time() * 1_000_000_000)

dccd 3.5.0__tar.gz → 3.5.2__tar.gz

dccd 3.5.0__tar.gz → 3.5.2__tar.gz