dccd 3.0.0__tar.gz → 3.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. {dccd-3.0.0 → dccd-3.2.0}/CHANGELOG.md +95 -0
  2. {dccd-3.0.0 → dccd-3.2.0}/CLAUDE.md +23 -7
  3. {dccd-3.0.0 → dccd-3.2.0}/PKG-INFO +1 -1
  4. {dccd-3.0.0 → dccd-3.2.0}/dccd/__init__.py +14 -3
  5. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/config.py +3 -1
  6. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/monitor.py +9 -4
  7. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/operations.py +109 -2
  8. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/scheduler.py +87 -1
  9. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/service_factory.py +57 -1
  10. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/api/app.py +80 -1
  11. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/cli/main.py +29 -3
  12. dccd-3.2.0/dccd/interfaces/ui/templates/storage.html +108 -0
  13. dccd-3.2.0/dccd/storage/coverage_sqlite.py +134 -0
  14. {dccd-3.0.0 → dccd-3.2.0}/dccd/storage/parquet.py +5 -0
  15. dccd-3.2.0/dccd/storage/purge.py +105 -0
  16. {dccd-3.0.0 → dccd-3.2.0}/dccd/storage/remote.py +40 -0
  17. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_api.py +49 -1
  18. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_application.py +62 -0
  19. dccd-3.2.0/dccd/tests/v3/test_coverage.py +140 -0
  20. dccd-3.2.0/dccd/tests/v3/test_purge.py +104 -0
  21. dccd-3.2.0/dccd/tests/v3/test_remote_sync.py +172 -0
  22. dccd-3.2.0/dccd/tests/v3/test_restart.py +77 -0
  23. dccd-3.2.0/dccd/tests/v3/test_restore.py +79 -0
  24. {dccd-3.0.0 → dccd-3.2.0}/dccd.egg-info/PKG-INFO +1 -1
  25. {dccd-3.0.0 → dccd-3.2.0}/dccd.egg-info/SOURCES.txt +7 -0
  26. {dccd-3.0.0 → dccd-3.2.0}/pyproject.toml +1 -1
  27. dccd-3.0.0/dccd/interfaces/ui/templates/storage.html +0 -53
  28. {dccd-3.0.0 → dccd-3.2.0}/CONTRIBUTING.md +0 -0
  29. {dccd-3.0.0 → dccd-3.2.0}/LICENSE.txt +0 -0
  30. {dccd-3.0.0 → dccd-3.2.0}/MANIFEST.in +0 -0
  31. {dccd-3.0.0 → dccd-3.2.0}/README.md +0 -0
  32. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/__init__.py +0 -0
  33. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/events.py +0 -0
  34. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/jobs.py +0 -0
  35. {dccd-3.0.0 → dccd-3.2.0}/dccd/application/registry.py +0 -0
  36. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/__init__.py +0 -0
  37. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/capability.py +0 -0
  38. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/dataset.py +0 -0
  39. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/errors.py +0 -0
  40. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/records.py +0 -0
  41. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/symbol.py +0 -0
  42. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/timeutils.py +0 -0
  43. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/transforms.py +0 -0
  44. {dccd-3.0.0 → dccd-3.2.0}/dccd/domain/types.py +0 -0
  45. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/__init__.py +0 -0
  46. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/api/__init__.py +0 -0
  47. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/cli/__init__.py +0 -0
  48. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/__init__.py +0 -0
  49. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/static/favicon.svg +0 -0
  50. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/static/logo.svg +0 -0
  51. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/base.html +0 -0
  52. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/config.html +0 -0
  53. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/dashboard.html +0 -0
  54. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/data.html +0 -0
  55. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/historical.html +0 -0
  56. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/live.html +0 -0
  57. {dccd-3.0.0 → dccd-3.2.0}/dccd/interfaces/ui/templates/logs.html +0 -0
  58. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/__init__.py +0 -0
  59. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/base.py +0 -0
  60. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/binance.py +0 -0
  61. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/bitfinex.py +0 -0
  62. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/bitmex.py +0 -0
  63. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/bybit.py +0 -0
  64. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/coinbase.py +0 -0
  65. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/kraken.py +0 -0
  66. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/okx.py +0 -0
  67. {dccd-3.0.0 → dccd-3.2.0}/dccd/sources/registry.py +0 -0
  68. {dccd-3.0.0 → dccd-3.2.0}/dccd/storage/__init__.py +0 -0
  69. {dccd-3.0.0 → dccd-3.2.0}/dccd/storage/runs_sqlite.py +0 -0
  70. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/__init__.py +0 -0
  71. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/__init__.py +0 -0
  72. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_backfill_lookback.py +0 -0
  73. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_client.py +0 -0
  74. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_domain.py +0 -0
  75. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_domain_extended.py +0 -0
  76. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_network.py +0 -0
  77. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_sources.py +0 -0
  78. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_storage.py +0 -0
  79. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_storage_extended.py +0 -0
  80. {dccd-3.0.0 → dccd-3.2.0}/dccd/tests/v3/test_transport.py +0 -0
  81. {dccd-3.0.0 → dccd-3.2.0}/dccd/transport/__init__.py +0 -0
  82. {dccd-3.0.0 → dccd-3.2.0}/dccd/transport/http.py +0 -0
  83. {dccd-3.0.0 → dccd-3.2.0}/dccd/transport/paginate.py +0 -0
  84. {dccd-3.0.0 → dccd-3.2.0}/dccd/transport/ratelimit.py +0 -0
  85. {dccd-3.0.0 → dccd-3.2.0}/dccd/transport/ws.py +0 -0
  86. {dccd-3.0.0 → dccd-3.2.0}/dccd.egg-info/dependency_links.txt +0 -0
  87. {dccd-3.0.0 → dccd-3.2.0}/dccd.egg-info/entry_points.txt +0 -0
  88. {dccd-3.0.0 → dccd-3.2.0}/dccd.egg-info/requires.txt +0 -0
  89. {dccd-3.0.0 → dccd-3.2.0}/dccd.egg-info/top_level.txt +0 -0
  90. {dccd-3.0.0 → dccd-3.2.0}/setup.cfg +0 -0
@@ -16,6 +16,101 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
16
16
 
17
17
  ### Removed
18
18
 
19
+ ## [3.2.0] - 2026-06-10
20
+
21
+ ### Added
22
+
23
+ - Dev workflow: hierarchical, file-based **plan trees** under `doc/dev/plans/`
24
+ (committed) with a `<plans_dir>` descriptor key. A roadmap item expands into a
25
+ global `00-plan.md` + precise leaf specs (adaptive depth); each leaf declares a
26
+ `complexity` that derives its execution model (`low→haiku`/`medium→sonnet`/
27
+ `high→opus`). New `/plan` (build the tree + open the plan PR first) and
28
+ `/execute-leaf` (spawn an agent per leaf, verify on real data) skills; `/pick-task`,
29
+ `/finish-task`, `/abandon-task`, `/release` and `CLAUDE.md` updated to chain
30
+ through it. Backward-compatible: no `plans_dir` ⇒ the old plan-mode loop. (#94)
31
+ - Restart/reboot safety verified on a real server `systemctl reboot`: the daemon
32
+ auto-starts, the trades stream reconnects, the interval backfill re-arms, the
33
+ `RunsStore` (SQLite WAL) survives and appends, and the coverage manifest keeps the
34
+ resume cursor (no gap). New `test_restart.py` guards RunsStore persistence across a
35
+ reopen and scheduler interval re-arm from config. (#99)
36
+ - Ops for unattended deploy: `HealthMonitor` is now wired into the daemon (CLI
37
+ `dccd start` and the standalone API) — it was implemented but never instantiated,
38
+ so webhook alerts never fired. Docker `HEALTHCHECK` on `/health`, commented
39
+ systemd resource limits, and journald log-rotation guidance. Verified live on a
40
+ server: a failing job past the threshold delivered a real webhook POST, and the
41
+ container reports `healthy`. (#100)
42
+ - Docs: new `how-to/deploy` guide — a blessed, host-validated path to run dccd
43
+ unattended on a server (systemd + venv recommended, Docker alternative), covering
44
+ install, secret injection, `/health`, restart/reboot safety, logs, alerts and the
45
+ old-CPU caveat. Completes **Epic A** (run on a remote server). (#102)
46
+
47
+ ### Changed
48
+
49
+ - `Dockerfile`: pin the base image to a digest (reproducible builds) and add a
50
+ `POLARS_VARIANT` build arg — on CPUs without AVX2 (older servers) the default
51
+ `polars` wheel crashes with SIGILL, so
52
+ `docker build --build-arg POLARS_VARIANT=polars-lts-cpu` installs the LTS-CPU
53
+ build instead. Verified end-to-end on a real host (build, run, `/health`, Bearer
54
+ auth, a backfill writing correct OHLC to the `/data` volume). (#97)
55
+ - Docs: `how-to/protect-ui` now covers deploy-time secret injection — the token and
56
+ `rclone.conf` are mounted at run time, never baked into the image (verified on the
57
+ built image: `docker history`/filesystem show no config); the YAML loader does not
58
+ expand `${ENV}` placeholders, so the mounted-file pattern is the blessed one. (#101)
59
+
60
+ ### Fixed
61
+
62
+ - `deploy/dccd.service`: `ExecStart` pointed at `/usr/local/bin/dccd` and failed
63
+ `systemd-analyze verify`; it now uses a venv path (`/opt/dccd/venv/bin/dccd`) with
64
+ `StateDirectory=dccd` (systemd owns `/var/lib/dccd`). The install spec dropped the
65
+ non-existent `ui` extra (`.[daemon,ui]` → `.[daemon]`, also in the `Dockerfile`).
66
+ Verified a real system-wide install: `systemd-analyze verify` passes, the service
67
+ is active, auto-restarts after SIGKILL, and a backfill writes correct OHLC under
68
+ the hardened `/var/lib/dccd/data` (`ProtectSystem=strict`). (#98)
69
+ - `HealthMonitor` counted consecutive failures per `run_id`, but each backfill run
70
+ has a unique id (`{spec}@{ts}`), so repeated failures never accumulated (only
71
+ streams, with a stable `@stream` id, could alert). It now keys on the job
72
+ (spec id) so repeated backfill failures trip the alert. (#100)
73
+
74
+ ### Deprecated
75
+
76
+ ### Removed
77
+
78
+ ## [3.1.0] - 2026-06-09
79
+
80
+ ### Added
81
+
82
+ - `dccd start` now schedules rclone remote sync: when `storage.remotes` is set,
83
+ the daemon mirrors the store off-box every `storage.sync_interval` seconds with
84
+ exponential backoff, persisted run history (`sync` runs in `RunsStore`) and a
85
+ live `remote-sync` EventBus status. Previously `RemoteStorage` was implemented
86
+ but never driven — a server synced nothing. (#86)
87
+ - Storage page surfaces remote sync: last/next sync, status, configured remotes
88
+ and synced volume, plus a **Sync now** button — backed by
89
+ `GET`/`POST /api/storage/sync`. The shared `operations.sync_remote` primitive
90
+ records each cycle, so the manual button and the scheduled loop stay in sync. (#87)
91
+ - Coverage manifest (`CoverageStore`, SQLite under `.dccd/`): backfill records each
92
+ dataset's `[min_ts, max_ts]` extent, and `start="last"` falls back to the
93
+ manifest's `max_ts` when no local file exists — so local data can be dropped to
94
+ free disk without forcing a re-download on the next backfill. (#88)
95
+ - Free-space purge: `storage.min_free_gb` (default `0` = off). After each
96
+ successful sync the daemon drops the oldest already-synced Parquet files until
97
+ free space is back above the floor (the coverage manifest keeps the resume
98
+ cursor, `.dccd/` is never touched). (#89)
99
+ - Read-through restore: reading a dataset whose local Parquet was purged now pulls
100
+ it back from the remote (`rclone copy`) before loading, so a purge is
101
+ transparent to readers (`Client.read`, `POST /api/read`). (#90)
102
+ - Docs: the `how-to/sync-remote` guide now covers rclone provisioning, the
103
+ `min_free_gb` free-space purge, read-through restore, and restore/integrity
104
+ (`rclone copy`/`rclone check`) — completing Epic C (tiered storage). (#91)
105
+
106
+ ### Changed
107
+
108
+ ### Fixed
109
+
110
+ ### Deprecated
111
+
112
+ ### Removed
113
+
19
114
  ## [3.0.0] - 2026-06-07
20
115
 
21
116
  ### Added
@@ -79,22 +79,38 @@ away without losing unrelated good work is too big: split it. This is what makes
79
79
 
80
80
  ### Dev loop & docs of record
81
81
 
82
- The iterative loop is tooled by skills, with three tracked docs as the sources of
82
+ The iterative loop is tooled by skills, with four tracked docs as the sources of
83
83
  truth:
84
84
 
85
85
  | Doc | Holds | Updated by |
86
86
  |-----|-------|-----------|
87
- | `doc/dev/07-roadmap.md` | open work (single source) | `/pick-task` reads · `/finish-task`, `/abandon-task` update |
87
+ | `doc/dev/07-roadmap.md` | open work — single source *index* | `/pick-task` reads · `/finish-task`, `/abandon-task` update |
88
+ | `doc/dev/plans/<epic>/` | open work *detail* — durable hierarchical plan trees (global + leaf specs) | `/plan` writes · `/execute-leaf` reads · `/finish-task`/`/abandon-task` archive |
88
89
  | `doc/dev/03-decisions.md` | the *why* — ADR journal (+ settled rationale) | `/finish-task` (accepted), `/abandon-task` (rejected/tombstone) |
89
90
  | `doc/dev/06-status.md` | where things stand | `/finish-task`, `/groom-docs` |
90
91
 
91
92
  `CHANGELOG.md` + git log stay authoritative for *what* shipped. The loop:
92
- `/pick-task` (smallest slice → branch) → plan (split big plans into small PRs) →
93
- `/finish-task` (tests, ADR entry, status, PR) **or** `/abandon-task` (salvage the
94
- lesson + close the PR); `/groom-docs` periodically keeps `doc/dev/` lean and true.
95
93
 
96
- **Model per task** (advisory — you set it via `/model`, or a skill spawns a
97
- subagent with an explicit `model`; subagents otherwise *inherit* the parent):
94
+ `/pick-task` (smallest coherent slice; **no branch yet**) →
95
+ `/plan` (decompose into a `doc/dev/plans/<epic>/` tree — adaptive depth: a single
96
+ leaf for a trivial task, a global `00-plan.md` + leaves otherwise — and open the
97
+ **plan PR** that lands the tree on `develop` first) →
98
+ `/execute-leaf <epic> next` (cut the leaf branch, **spawn an agent at the model
99
+ derived from the leaf's `complexity`**, which implements + tests + **verifies on
100
+ real data**, then reports) →
101
+ `/finish-task` (tests, ADR, CHANGELOG, leaf PR, archive the leaf, tick the global
102
+ checklist) → … per leaf … → last leaf removes the roadmap line → `/release`.
103
+
104
+ `/abandon-task` salvages the lesson + closes a bad PR (tombstones the leaf);
105
+ `/groom-docs` periodically keeps `doc/dev/` lean and true. The full format lives in
106
+ [`doc/dev/plans/README.md`](doc/dev/plans/README.md). The workflow is
107
+ backward-compatible: a repo whose `.claude/workflow.json` has **no `plans_dir`**
108
+ falls back to the older `/pick-task → plan mode → /finish-task` loop.
109
+
110
+ **Model per task** (advisory — you set it via `/model`, a skill spawns a subagent
111
+ with an explicit `model`, or a plan **leaf's `complexity` derives it**:
112
+ `low→haiku`, `medium→sonnet`, `high→opus`; subagents otherwise *inherit* the
113
+ parent):
98
114
 
99
115
  | Model | For |
100
116
  |-------|-----|
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dccd
3
- Version: 3.0.0
3
+ Version: 3.2.0
4
4
  Summary: Download Crypto Currency Data — hexagonal architecture, async-first.
5
5
  Author-email: Arthur Bernard <arthur.bernard.92@gmail.com>
6
6
  License: MIT
@@ -70,6 +70,8 @@ class Client:
70
70
  self._config: AppConfig | None = None
71
71
  self._store: ParquetStore | None = None
72
72
  self._registry: SourceRegistry | None = None
73
+ self._coverage_store: Any = None
74
+ self._remote: Any = None
73
75
 
74
76
  def _require_ready(self) -> tuple["SourceRegistry", "ParquetStore"]:
75
77
  if self._registry is None or self._store is None:
@@ -78,7 +80,12 @@ class Client:
78
80
 
79
81
  async def __aenter__(self) -> "Client":
80
82
  from dccd.application.config import AppConfig, load_config, resolve_config_path
81
- from dccd.application.service_factory import build_registry, build_store
83
+ from dccd.application.service_factory import (
84
+ build_coverage_store,
85
+ build_registry,
86
+ build_remote,
87
+ build_store,
88
+ )
82
89
 
83
90
  try:
84
91
  path = resolve_config_path(self._config_path)
@@ -88,6 +95,8 @@ class Client:
88
95
 
89
96
  # Single source of truth for adapter wiring — same as CLI and API.
90
97
  self._store = build_store(self._config.settings.data_path)
98
+ self._coverage_store = build_coverage_store(self._config.settings.data_path)
99
+ self._remote = build_remote(self._config)
91
100
  self._registry = build_registry()
92
101
  return self
93
102
 
@@ -156,7 +165,8 @@ class Client:
156
165
  origin="runtime",
157
166
  )
158
167
  registry, store = self._require_ready()
159
- return await do_backfill(spec, registry=registry, store=store)
168
+ return await do_backfill(spec, registry=registry, store=store,
169
+ coverage_store=self._coverage_store)
160
170
 
161
171
  async def stream(self, exchange: str, symbol: str, data_type: str = "trades",
162
172
  span: int | None = None, depth: int | None = None,
@@ -249,7 +259,8 @@ class Client:
249
259
  _, store = self._require_ready()
250
260
  target = JobTarget(exchange=exchange, symbol=Symbol.parse(symbol),
251
261
  data_type=DataType(data_type), span=span)
252
- return cast("pl.DataFrame", do_read(target, store=store, start_ns=start_ns, end_ns=end_ns))
262
+ return cast("pl.DataFrame", do_read(target, store=store, start_ns=start_ns,
263
+ end_ns=end_ns, remote=self._remote))
253
264
 
254
265
  def inventory(self) -> list[dict[str, Any]]:
255
266
  """List every stored dataset with its coverage.
@@ -70,10 +70,12 @@ class RemoteConfig(BaseModel):
70
70
 
71
71
 
72
72
  class StorageConfig(BaseModel):
73
- """Storage settings: local path, rclone remotes, and sync interval."""
73
+ """Storage settings: local path, rclone remotes, sync interval, and the
74
+ free-space floor (GiB) that triggers purging already-synced local files."""
74
75
  local_path: str = ""
75
76
  remotes: list[RemoteConfig] = Field(default_factory=list)
76
77
  sync_interval: int = 3600
78
+ min_free_gb: float = 0.0
77
79
 
78
80
 
79
81
  class AlertConfig(BaseModel):
@@ -41,13 +41,18 @@ class HealthMonitor:
41
41
  def _on_event(self, event: Event) -> None:
42
42
  if not isinstance(event, StatusEvent):
43
43
  return
44
+ # Count failures per *job*, not per run: a run_id is `{spec_id}@{run}` and
45
+ # each backfill run is unique, so keying on run_id would never accumulate
46
+ # across runs (only streams reuse `{spec_id}@stream`). Key on the spec_id
47
+ # prefix so repeated failures of the same job trip the alert.
48
+ key = event.run_id.split("@", 1)[0]
44
49
  if event.state == "failed":
45
- self._consecutive[event.run_id] += 1
46
- count = self._consecutive[event.run_id]
50
+ self._consecutive[key] += 1
51
+ count = self._consecutive[key]
47
52
  if count >= self._max_errors:
48
- self._alert(event.run_id, count)
53
+ self._alert(key, count)
49
54
  elif event.state == "succeeded":
50
- self._consecutive[event.run_id] = 0
55
+ self._consecutive[key] = 0
51
56
 
52
57
  def _alert(self, run_id: str, count: int) -> None:
53
58
  msg = f"dccd alert: {run_id} failed {count} times consecutively."
@@ -27,10 +27,12 @@ from dccd.domain.timeutils import NS, ns_now, ns_to_dt
27
27
  from dccd.domain.types import DataType
28
28
  from dccd.sources.base import OHLCHistory, OrderBookSnapshotREST, TradesHistory
29
29
  from dccd.sources.registry import SourceRegistry
30
+ from dccd.storage.coverage_sqlite import CoverageStore
30
31
  from dccd.storage.parquet import ParquetStore
32
+ from dccd.storage.remote import RemoteStorage
31
33
  from dccd.storage.runs_sqlite import RunsStore
32
34
 
33
- __all__ = ["backfill", "stream", "read", "inventory"]
35
+ __all__ = ["backfill", "stream", "read", "inventory", "sync_remote"]
34
36
 
35
37
  logger = logging.getLogger(__name__)
36
38
 
@@ -112,6 +114,7 @@ async def backfill(
112
114
  registry: SourceRegistry,
113
115
  store: ParquetStore,
114
116
  runs_store: RunsStore | None = None,
117
+ coverage_store: CoverageStore | None = None,
115
118
  events: RunEvents | None = None,
116
119
  stop_event: asyncio.Event | None = None,
117
120
  run_id: str | None = None,
@@ -136,6 +139,10 @@ async def backfill(
136
139
  registry : SourceRegistry
137
140
  store : ParquetStore
138
141
  runs_store : RunsStore or None
142
+ coverage_store : CoverageStore or None
143
+ When set, ``start="last"`` falls back to the manifest's recorded
144
+ ``max_ts`` if no local file exists (so a dropped store doesn't trigger a
145
+ re-download), and the dataset's extent is recorded on success.
139
146
  events : RunEvents or None
140
147
  stop_event : asyncio.Event or None
141
148
  Set externally to cancel mid-run cleanly.
@@ -169,6 +176,11 @@ async def backfill(
169
176
 
170
177
  if params.start == "last":
171
178
  last = store.last_timestamp(ds)
179
+ if last is None and coverage_store is not None:
180
+ # Local files may have been dropped to free disk; the coverage
181
+ # manifest remembers how far we got, so we resume from there instead
182
+ # of re-downloading from the bounded default lookback.
183
+ last = coverage_store.get_max_ts(ds)
172
184
  if last is not None:
173
185
  start_ns: int = last + 1
174
186
  else:
@@ -200,6 +212,17 @@ async def backfill(
200
212
  # Counts every item received from the paginator, including unflushed ones.
201
213
  _collected: list[int] = [0]
202
214
 
215
+ # Min/max timestamp seen this run, fed to the coverage manifest on success so
216
+ # the dataset's extent survives a local-data drop (see CoverageStore).
217
+ _run_min: list[int | None] = [None]
218
+ _run_max: list[int | None] = [None]
219
+
220
+ def _track_ts(ts: int) -> None:
221
+ if _run_min[0] is None or ts < _run_min[0]:
222
+ _run_min[0] = ts
223
+ if _run_max[0] is None or ts > _run_max[0]:
224
+ _run_max[0] = ts
225
+
203
226
  # Progress is reported by *time covered* of the requested window, which gives
204
227
  # a real, smooth bar for both OHLC and cursor-paginated trades (the latter
205
228
  # have no page total). ``at`` is the timestamp reached. The window is read
@@ -257,6 +280,7 @@ async def backfill(
257
280
  break
258
281
  bars.append(bar)
259
282
  _collected[0] += 1
283
+ _track_ts(bar.ts)
260
284
  if _collected[0] % 200 == 0:
261
285
  _emit_time(bar.ts)
262
286
  if len(bars) >= _FLUSH_BATCH:
@@ -295,6 +319,7 @@ async def backfill(
295
319
  break
296
320
  batch.append(trade)
297
321
  _collected[0] += 1
322
+ _track_ts(trade.ts)
298
323
  if _collected[0] % 1000 == 0:
299
324
  _emit_time(trade.ts) # progress by time covered, not page count
300
325
  if len(batch) >= _FLUSH_BATCH:
@@ -309,6 +334,7 @@ async def backfill(
309
334
  raise NoCapability(target.exchange, "orderbook", "snapshot")
310
335
  depth = params.depth or 50
311
336
  snap = await adapter.fetch_orderbook(target.symbol, depth)
337
+ _track_ts(snap.ts)
312
338
  total_written += await _flush(store, ds, [snap], prov_src)
313
339
 
314
340
  except Exception as exc:
@@ -328,6 +354,12 @@ async def backfill(
328
354
  if runs_store:
329
355
  runs_store.finish_run(run_id, state, rows_written=total_written)
330
356
 
357
+ # Record coverage so this dataset's extent survives a later local-data drop.
358
+ if coverage_store is not None and _run_max[0] is not None:
359
+ coverage_store.record(
360
+ ds, min_ts=_run_min[0], max_ts=_run_max[0], rows_added=total_written
361
+ )
362
+
331
363
  return {"run_id": run_id, "rows_written": total_written, "start_ns": start_ns, "end_ns": end_ns}
332
364
 
333
365
 
@@ -461,12 +493,87 @@ def read(
461
493
  store: ParquetStore,
462
494
  start_ns: int | None = None,
463
495
  end_ns: int | None = None,
496
+ remote: RemoteStorage | None = None,
464
497
  ) -> Any:
465
- """Read stored data for *target* in the given nanosecond range."""
498
+ """Read stored data for *target* in the given nanosecond range.
499
+
500
+ Read-through restore: when *remote* is set and the dataset has no local
501
+ Parquet (e.g. it was purged to free disk), the dataset directory is pulled
502
+ back from the remote (``rclone copy``) before loading, so a purge is
503
+ transparent to readers.
504
+ """
466
505
  ds = _make_dataset_id(target)
506
+ if remote is not None:
507
+ directory = store.directory(ds)
508
+ if not any(directory.glob("*.parquet")):
509
+ rel = directory.relative_to(store.root)
510
+ remote.restore(str(rel))
467
511
  return store.load(ds, start_ns, end_ns)
468
512
 
469
513
 
470
514
  def inventory(*, store: ParquetStore) -> list[dict[str, Any]]:
471
515
  """Return a list of dataset descriptors for all stored data."""
472
516
  return store.inventory()
517
+
518
+
519
+ async def sync_remote(
520
+ remote: RemoteStorage,
521
+ *,
522
+ runs_store: RunsStore | None = None,
523
+ events: RunEvents | None = None,
524
+ run_id: str | None = None,
525
+ ) -> dict[str, Any]:
526
+ """Run one remote-sync cycle: mirror the local store to all rclone remotes.
527
+
528
+ Records the cycle as a ``sync`` run in *runs_store* (so the Storage UI can
529
+ show "last sync") and emits ``status``/``log`` on *events*. Shared by the
530
+ scheduler's periodic loop and the manual "Sync now" endpoint, so the
531
+ run-recording lives in exactly one place.
532
+
533
+ Parameters
534
+ ----------
535
+ remote : RemoteStorage
536
+ runs_store : RunsStore or None
537
+ events : RunEvents or None
538
+ run_id : str or None
539
+ Override the auto-generated run id.
540
+
541
+ Returns
542
+ -------
543
+ dict
544
+ ``{'run_id', 'results', 'ok'}`` — ``results`` maps remote → success;
545
+ ``ok`` is True only when every configured remote synced.
546
+ """
547
+ if run_id is None:
548
+ run_id = f"remote-sync@{time.time_ns()}"
549
+ if runs_store:
550
+ runs_store.create_run(run_id, "remote-sync", "sync", "-", "all", "-")
551
+ if events:
552
+ events.status("running")
553
+ try:
554
+ results = await remote.sync_all()
555
+ except Exception as exc:
556
+ msg = f"Remote sync error: {exc}"
557
+ if events:
558
+ events.log(msg, "error")
559
+ events.status("failed")
560
+ if runs_store:
561
+ runs_store.finish_run(run_id, "failed", error=str(exc))
562
+ return {"run_id": run_id, "results": {}, "ok": False}
563
+
564
+ failed = [r for r, ok in results.items() if not ok]
565
+ if failed:
566
+ msg = f"Remote sync failed for: {', '.join(failed)}"
567
+ if events:
568
+ events.log(msg, "error")
569
+ events.status("failed")
570
+ if runs_store:
571
+ runs_store.finish_run(run_id, "failed", error=msg)
572
+ return {"run_id": run_id, "results": results, "ok": False}
573
+
574
+ if events:
575
+ events.log(f"Synced {len(results)} remote(s)")
576
+ events.status("succeeded")
577
+ if runs_store:
578
+ runs_store.finish_run(run_id, "succeeded", rows_written=len(results))
579
+ return {"run_id": run_id, "results": results, "ok": True}
@@ -6,10 +6,12 @@ import asyncio
6
6
  import logging
7
7
  import time
8
8
 
9
- from dccd.application.events import EventBus
9
+ from dccd.application.events import EventBus, RunEvents
10
10
  from dccd.application.jobs import JobSpec
11
11
  from dccd.sources.registry import SourceRegistry
12
+ from dccd.storage.coverage_sqlite import CoverageStore
12
13
  from dccd.storage.parquet import ParquetStore
14
+ from dccd.storage.remote import RemoteStorage
13
15
  from dccd.storage.runs_sqlite import RunsStore
14
16
 
15
17
  __all__ = ["Scheduler"]
@@ -90,6 +92,11 @@ class Scheduler:
90
92
  store : ParquetStore
91
93
  runs_store : RunsStore or None
92
94
  events : EventBus
95
+ remote : RemoteStorage or None
96
+ When set (rclone remotes configured), :meth:`start` launches a periodic
97
+ loop that mirrors the local store off-box every ``sync_interval`` seconds.
98
+ sync_interval : int
99
+ Seconds between remote-sync cycles (default 3600).
93
100
  """
94
101
 
95
102
  def __init__(
@@ -98,11 +105,22 @@ class Scheduler:
98
105
  store: ParquetStore,
99
106
  runs_store: RunsStore | None = None,
100
107
  events: EventBus | None = None,
108
+ remote: RemoteStorage | None = None,
109
+ sync_interval: int = 3600,
110
+ coverage_store: CoverageStore | None = None,
111
+ data_path: str | None = None,
112
+ min_free_gb: float = 0.0,
101
113
  ) -> None:
102
114
  self._registry = registry
103
115
  self._store = store
104
116
  self._runs_store = runs_store
105
117
  self._events = events or EventBus()
118
+ self._remote = remote
119
+ self._sync_interval = sync_interval
120
+ self._coverage_store = coverage_store
121
+ self._data_path = data_path
122
+ self._min_free_gb = min_free_gb
123
+ self._sync_task: asyncio.Task[None] | None = None
106
124
  self._streams: dict[str, _StreamWorker] = {}
107
125
  self._interval_tasks: list[asyncio.Task[None]] = []
108
126
  # Per-spec recurring backfill loops, keyed by spec id, with the interval
@@ -186,6 +204,8 @@ class Scheduler:
186
204
  async def start(self, specs: list[JobSpec]) -> None:
187
205
  """Start all enabled specs (full daemon mode)."""
188
206
  self._running = True
207
+ if self._remote is not None and self._sync_task is None:
208
+ self._sync_task = asyncio.create_task(self._sync_loop())
189
209
  for spec in specs:
190
210
  if not spec.enabled:
191
211
  continue
@@ -207,6 +227,13 @@ class Scheduler:
207
227
  async def stop(self) -> None:
208
228
  """Stop all running jobs."""
209
229
  self._running = False
230
+ if self._sync_task is not None:
231
+ self._sync_task.cancel()
232
+ try:
233
+ await self._sync_task
234
+ except (asyncio.CancelledError, Exception):
235
+ pass
236
+ self._sync_task = None
210
237
  for task in self._interval_tasks:
211
238
  task.cancel()
212
239
  for task, _ in self._interval_loops.values():
@@ -216,6 +243,64 @@ class Scheduler:
216
243
  self._interval_tasks.clear()
217
244
  self._interval_loops.clear()
218
245
 
246
+ async def _sync_loop(self) -> None:
247
+ """Periodically mirror the local store to the configured rclone remotes.
248
+
249
+ Runs only when a :class:`~dccd.storage.remote.RemoteStorage` was wired in
250
+ (``storage.remotes`` non-empty). Each cycle is delegated to
251
+ :func:`dccd.application.operations.sync_remote` (which records a ``sync``
252
+ run in :class:`RunsStore` and emits live ``remote-sync`` EventBus status);
253
+ this loop only owns the cadence and the exponential backoff on failure
254
+ (30s → capped at ``sync_interval``) so a flapping remote doesn't hammer
255
+ rclone.
256
+ """
257
+ from dccd.application.operations import sync_remote
258
+ assert self._remote is not None
259
+ run_events = self._events.for_run("remote-sync")
260
+ backoff = 30.0
261
+ try:
262
+ while self._running:
263
+ result = await sync_remote(
264
+ self._remote, runs_store=self._runs_store, events=run_events
265
+ )
266
+ if result["ok"]:
267
+ # Remote is now up to date → safe to free disk by dropping the
268
+ # oldest already-synced files (the coverage manifest keeps the
269
+ # resume cursor). Runs only when a floor is configured.
270
+ await self._maybe_purge(run_events)
271
+ backoff = 30.0
272
+ await asyncio.sleep(self._sync_interval)
273
+ else:
274
+ logger.warning("Remote sync failed — retry in %ds", int(backoff))
275
+ await asyncio.sleep(min(backoff, self._sync_interval))
276
+ backoff = min(backoff * 2, float(self._sync_interval))
277
+ except asyncio.CancelledError:
278
+ return
279
+
280
+ async def _maybe_purge(self, run_events: RunEvents) -> None:
281
+ """Free disk by dropping oldest synced files when below the floor.
282
+
283
+ Called right after a successful sync (remote is current), so dropped
284
+ files are recoverable from the remote. No-op unless ``min_free_gb`` and a
285
+ ``data_path`` are configured.
286
+ """
287
+ if self._min_free_gb <= 0 or not self._data_path:
288
+ return
289
+ from dccd.storage.purge import purge_to_free_space
290
+ try:
291
+ res = await asyncio.to_thread(
292
+ purge_to_free_space, self._data_path, self._min_free_gb
293
+ )
294
+ except Exception as exc:
295
+ logger.warning("Purge failed: %s", exc)
296
+ return
297
+ if res["removed"]:
298
+ run_events.log(
299
+ f"Purged {len(res['removed'])} file(s), "
300
+ f"freed ~{res['freed_bytes'] / (1024 ** 3):.2f} GiB to stay above "
301
+ f"{self._min_free_gb} GiB free"
302
+ )
303
+
219
304
  async def _interval_loop(self, spec: JobSpec) -> None:
220
305
  every = spec.trigger.every or spec.target.span or 3600
221
306
  while self._running:
@@ -231,6 +316,7 @@ class Scheduler:
231
316
  registry=self._registry,
232
317
  store=self._store,
233
318
  runs_store=self._runs_store,
319
+ coverage_store=self._coverage_store,
234
320
  events=run_events,
235
321
  )
236
322
  except Exception as exc:
@@ -11,11 +11,20 @@ import pathlib
11
11
  from typing import TYPE_CHECKING
12
12
 
13
13
  if TYPE_CHECKING:
14
+ from dccd.application.config import AppConfig
14
15
  from dccd.sources.registry import SourceRegistry
16
+ from dccd.storage.coverage_sqlite import CoverageStore
15
17
  from dccd.storage.parquet import ParquetStore
18
+ from dccd.storage.remote import RemoteStorage
16
19
  from dccd.storage.runs_sqlite import RunsStore
17
20
 
18
- __all__ = ["build_registry", "build_store", "build_runs_store"]
21
+ __all__ = [
22
+ "build_registry",
23
+ "build_store",
24
+ "build_runs_store",
25
+ "build_remote",
26
+ "build_coverage_store",
27
+ ]
19
28
 
20
29
 
21
30
  def build_registry() -> "SourceRegistry":
@@ -77,3 +86,50 @@ def build_runs_store(data_path: str | pathlib.Path) -> "RunsStore":
77
86
  from dccd.storage.runs_sqlite import RunsStore
78
87
 
79
88
  return RunsStore(pathlib.Path(data_path) / ".dccd" / "runs.db")
89
+
90
+
91
+ def build_coverage_store(data_path: str | pathlib.Path) -> "CoverageStore":
92
+ """Return a :class:`~dccd.storage.coverage_sqlite.CoverageStore`.
93
+
94
+ The database lives at ``{data_path}/.dccd/coverage.db`` — the manifest that
95
+ lets local data be dropped without forcing a re-download on the next
96
+ backfill.
97
+
98
+ Parameters
99
+ ----------
100
+ data_path : str or Path
101
+
102
+ Returns
103
+ -------
104
+ CoverageStore
105
+ """
106
+ from dccd.storage.coverage_sqlite import CoverageStore
107
+
108
+ return CoverageStore(pathlib.Path(data_path) / ".dccd" / "coverage.db")
109
+
110
+
111
+ def build_remote(cfg: "AppConfig") -> "RemoteStorage | None":
112
+ """Return a :class:`~dccd.storage.remote.RemoteStorage`, or ``None``.
113
+
114
+ Returns ``None`` when no rclone remotes are configured (``storage.remotes``
115
+ empty) — there is nothing to sync, so the daemon skips the sync loop. The
116
+ local root is ``settings.data_path`` (the canonical store root used by
117
+ :func:`build_store`).
118
+
119
+ Parameters
120
+ ----------
121
+ cfg : AppConfig
122
+
123
+ Returns
124
+ -------
125
+ RemoteStorage or None
126
+ """
127
+ if not cfg.storage.remotes:
128
+ return None
129
+
130
+ from dccd.storage.remote import RemoteStorage
131
+
132
+ return RemoteStorage(
133
+ cfg.settings.data_path,
134
+ [r.model_dump() for r in cfg.storage.remotes],
135
+ )