dccd 3.5.0__tar.gz → 3.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dccd-3.5.0 → dccd-3.5.2}/CHANGELOG.md +53 -0
- {dccd-3.5.0 → dccd-3.5.2}/PKG-INFO +1 -1
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/config.py +11 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/monitor.py +46 -17
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/api/app.py +38 -7
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/cli/main.py +10 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/okx.py +3 -1
- {dccd-3.5.0 → dccd-3.5.2}/dccd/storage/purge.py +5 -3
- {dccd-3.5.0 → dccd-3.5.2}/dccd/storage/remote.py +33 -8
- {dccd-3.5.0 → dccd-3.5.2}/dccd/storage/runs_sqlite.py +59 -3
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_api.py +220 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_cli.py +142 -0
- dccd-3.5.2/dccd/tests/v3/test_monitor_webhook.py +150 -0
- dccd-3.5.2/dccd/tests/v3/test_remote_sync.py +339 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_sources.py +118 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_storage.py +47 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd.egg-info/PKG-INFO +1 -1
- {dccd-3.5.0 → dccd-3.5.2}/dccd.egg-info/SOURCES.txt +1 -0
- {dccd-3.5.0 → dccd-3.5.2}/pyproject.toml +1 -1
- dccd-3.5.0/dccd/tests/v3/test_remote_sync.py +0 -172
- {dccd-3.5.0 → dccd-3.5.2}/CLAUDE.md +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/CONTRIBUTING.md +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/LICENSE.txt +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/MANIFEST.in +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/README.md +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/events.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/jobs.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/operations.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/registry.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/scheduler.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/application/service_factory.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/capability.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/dataset.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/errors.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/records.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/symbol.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/timeutils.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/transforms.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/domain/types.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/api/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/cli/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/static/favicon.svg +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/static/logo.svg +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/base.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/config.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/dashboard.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/data.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/historical.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/live.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/login.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/logs.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/interfaces/ui/templates/storage.html +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/base.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/binance.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/bitfinex.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/bitmex.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/bybit.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/coinbase.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/kraken.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/sources/registry.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/storage/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/storage/coverage_sqlite.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/storage/parquet.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_adapter_parsing.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_application.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_backfill_lookback.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_client.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_coverage.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_domain.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_domain_extended.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_network.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_orderbook_throttle.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_purge.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_ratelimit.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_restart.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_restore.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_scheduler_hygiene.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_storage_extended.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_stream_end_state.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_stream_flush.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_stream_nocapability.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_transport.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/tests/v3/test_ws_subscription_honesty.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/transport/__init__.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/transport/http.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/transport/paginate.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/transport/ratelimit.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd/transport/ws.py +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd.egg-info/dependency_links.txt +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd.egg-info/entry_points.txt +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd.egg-info/requires.txt +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/dccd.egg-info/top_level.txt +0 -0
- {dccd-3.5.0 → dccd-3.5.2}/setup.cfg +0 -0
|
@@ -16,6 +16,59 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
16
16
|
|
|
17
17
|
### Removed
|
|
18
18
|
|
|
19
|
+
## [3.5.2] - 2026-06-12
|
|
20
|
+
|
|
21
|
+
### Added
|
|
22
|
+
|
|
23
|
+
- Boot-time runs.db retention (`settings.runs_retention_days`, default 90,
|
|
24
|
+
`0` disables): terminal non-failed runs (`succeeded`/`stale`/`cancelled`)
|
|
25
|
+
older than the window are deleted and the database VACUUMed at daemon
|
|
26
|
+
start, right after the orphan sweep; `failed` runs are kept as the
|
|
27
|
+
long-term error journal. Verified on a copy of the production runs.db:
|
|
28
|
+
1,770 old rows pruned, file size −67 %, `failed` rows untouched. (#154)
|
|
29
|
+
|
|
30
|
+
### Fixed
|
|
31
|
+
|
|
32
|
+
- Webhook alerts send a plain-text body with `X-Title: dccd` /
|
|
33
|
+
`X-Priority: high` headers for ntfy-style endpoints — the phone showed a
|
|
34
|
+
raw JSON blob before; Slack webhooks (`hooks.slack.com`) keep the JSON
|
|
35
|
+
`{"text": …}` payload. Verified live: one test message delivered to the
|
|
36
|
+
production ntfy topic (HTTP 200) rendered as plain text. (#155)
|
|
37
|
+
- Manual backfill triggers (`POST /api/backfill`, `/api/jobs/run`,
|
|
38
|
+
`/api/jobs/run-all`) are idempotent: a spec that is already being
|
|
39
|
+
backfilled returns the existing `run_id` (`status: already-running`) —
|
|
40
|
+
run-all skips busy jobs and lists them under `already_running` — instead
|
|
41
|
+
of starting a duplicate concurrent run that wasted exchange requests and
|
|
42
|
+
confused runs/progress. (#153)
|
|
43
|
+
- Off-box sync no longer mirrors deletions: `RemoteStorage` runs
|
|
44
|
+
`rclone copy` instead of `rclone sync`, so locally purged files survive
|
|
45
|
+
on the remote for read-through restore — enabling `min_free_gb` no longer
|
|
46
|
+
risks deleting the only copy of old data. The remote is now an archive
|
|
47
|
+
superset (never deleted automatically; remote cleanup is manual).
|
|
48
|
+
Verified live against a real rclone remote: purge → sync → file survives
|
|
49
|
+
→ `restore()` returns byte-identical content. (#152)
|
|
50
|
+
|
|
51
|
+
## [3.5.1] - 2026-06-12
|
|
52
|
+
|
|
53
|
+
### Fixed
|
|
54
|
+
|
|
55
|
+
- `dccd start` marked its own just-started stream runs `stale` at boot: the
|
|
56
|
+
orphan sweep (`mark_stale_running`) ran in the FastAPI lifespan *after*
|
|
57
|
+
`cmd_start` had already started the scheduler's stream workers, so their
|
|
58
|
+
fresh `running` rows were swept as "orphaned by daemon restart" and the
|
|
59
|
+
Dashboard "Active now" never showed streams. The sweep now runs in
|
|
60
|
+
`cmd_start` before the scheduler starts; the lifespan only sweeps in
|
|
61
|
+
standalone `dccd ui`. Verified live across two daemon launches: the live
|
|
62
|
+
run stays `running`; a restart stales only the previous one. (#145)
|
|
63
|
+
- OKX OHLC pagination silently dropped the bar at every 100-bar page
|
|
64
|
+
boundary: OKX `before`/`after` cursors are exclusive, so passing
|
|
65
|
+
`before=start_ms` excluded the bar exactly at each window start (observed
|
|
66
|
+
in production as 431 one-minute gaps per OKX pair, spaced exactly
|
|
67
|
+
100 min). `fetch_ohlc_page` now sends `before=start_ms-1`; regression
|
|
68
|
+
test drives the paginator across a page boundary under faithful exclusive
|
|
69
|
+
semantics. Verified live: a 12 h OKX 1m backfill lands with 0 gaps and
|
|
70
|
+
all 7 boundary bars present. (#144)
|
|
71
|
+
|
|
19
72
|
## [3.5.0] - 2026-06-11
|
|
20
73
|
|
|
21
74
|
### Added
|
|
@@ -46,6 +46,9 @@ class SettingsConfig(BaseModel):
|
|
|
46
46
|
- ``ui_trusted_proxy`` — trust ``X-Forwarded-For`` as the rate-limit client key.
|
|
47
47
|
Enable **only** behind a reverse proxy that overwrites the header, else a direct
|
|
48
48
|
client can forge it and bypass the limit.
|
|
49
|
+
- ``runs_retention_days`` — delete terminal non-failed runs (``succeeded``,
|
|
50
|
+
``stale``, ``cancelled``) older than this many days at daemon boot. ``0``
|
|
51
|
+
disables the sweep (rows accumulate indefinitely). Default ``90``.
|
|
49
52
|
"""
|
|
50
53
|
data_path: str = "./data/crypto"
|
|
51
54
|
timezone: str = "local"
|
|
@@ -56,6 +59,7 @@ class SettingsConfig(BaseModel):
|
|
|
56
59
|
ui_readonly: bool = False
|
|
57
60
|
ui_rate_limit: int = 0
|
|
58
61
|
ui_trusted_proxy: bool = False
|
|
62
|
+
runs_retention_days: int = 90
|
|
59
63
|
|
|
60
64
|
@field_validator("data_path")
|
|
61
65
|
@classmethod
|
|
@@ -69,6 +73,13 @@ class SettingsConfig(BaseModel):
|
|
|
69
73
|
raise ValueError("ui_rate_limit must be >= 0")
|
|
70
74
|
return v
|
|
71
75
|
|
|
76
|
+
@field_validator("runs_retention_days")
|
|
77
|
+
@classmethod
|
|
78
|
+
def _non_negative_retention(cls, v: int) -> int:
|
|
79
|
+
if v < 0:
|
|
80
|
+
raise ValueError("runs_retention_days must be >= 0")
|
|
81
|
+
return v
|
|
82
|
+
|
|
72
83
|
@field_validator("timezone")
|
|
73
84
|
@classmethod
|
|
74
85
|
def _validate_tz(cls, v: str) -> str:
|
|
@@ -26,6 +26,17 @@ class HealthMonitor:
|
|
|
26
26
|
job does not flood a webhook. The count (and cooldown) reset on the first
|
|
27
27
|
success.
|
|
28
28
|
|
|
29
|
+
Webhook format
|
|
30
|
+
--------------
|
|
31
|
+
- **Slack** (hostname is ``hooks.slack.com`` or a subdomain of it): JSON body
|
|
32
|
+
``{"text": msg}`` with ``Content-Type: application/json`` — the pre-existing
|
|
33
|
+
behaviour.
|
|
34
|
+
- **All other endpoints** (e.g. ntfy): plain-text body with headers
|
|
35
|
+
``Content-Type: text/plain``, ``X-Title: dccd``, and
|
|
36
|
+
``X-Priority: high``. ntfy renders the raw body as the notification
|
|
37
|
+
message; the old JSON blob caused the phone to show ``{"text": "…"}``
|
|
38
|
+
instead of the actual alert.
|
|
39
|
+
|
|
29
40
|
Parameters
|
|
30
41
|
----------
|
|
31
42
|
runs_store : RunsStore
|
|
@@ -77,25 +88,43 @@ class HealthMonitor:
|
|
|
77
88
|
self._last_alert_ts.pop(key, None)
|
|
78
89
|
self._last_webhook_err_ts.pop(key, None)
|
|
79
90
|
|
|
91
|
+
def _post_webhook(self, msg: str, run_id: str) -> None:
|
|
92
|
+
"""Send *msg* to ``self._webhook``, choosing the right payload format.
|
|
93
|
+
|
|
94
|
+
Slack endpoints (hostname ``hooks.slack.com`` or subdomains) receive
|
|
95
|
+
``{"text": msg}`` JSON. All other endpoints (ntfy, custom) receive a
|
|
96
|
+
plain-text body so the notification shows the message directly.
|
|
97
|
+
"""
|
|
98
|
+
import json
|
|
99
|
+
import urllib.parse
|
|
100
|
+
import urllib.request
|
|
101
|
+
|
|
102
|
+
assert self._webhook is not None # caller (_alert) guards this
|
|
103
|
+
hostname: str = urllib.parse.urlsplit(self._webhook).hostname or ""
|
|
104
|
+
if hostname == "hooks.slack.com" or hostname.endswith(".hooks.slack.com"):
|
|
105
|
+
data = json.dumps({"text": msg}).encode()
|
|
106
|
+
headers: dict[str, str] = {"Content-Type": "application/json"}
|
|
107
|
+
else:
|
|
108
|
+
data = msg.encode()
|
|
109
|
+
headers = {
|
|
110
|
+
"Content-Type": "text/plain",
|
|
111
|
+
"X-Title": "dccd",
|
|
112
|
+
"X-Priority": "high",
|
|
113
|
+
}
|
|
114
|
+
req = urllib.request.Request(self._webhook, data=data, headers=headers)
|
|
115
|
+
try:
|
|
116
|
+
with urllib.request.urlopen(req, timeout=5):
|
|
117
|
+
pass
|
|
118
|
+
except Exception as exc:
|
|
119
|
+
# Log webhook-send failures at most once per cooldown window.
|
|
120
|
+
last_err = self._last_webhook_err_ts.get(run_id, 0.0)
|
|
121
|
+
if time.monotonic() - last_err >= _ALERT_COOLDOWN_S:
|
|
122
|
+
logger.warning("Webhook alert failed: %s", exc)
|
|
123
|
+
self._last_webhook_err_ts[run_id] = time.monotonic()
|
|
124
|
+
|
|
80
125
|
def _alert(self, run_id: str, count: int) -> None:
|
|
81
126
|
msg = f"dccd alert: {run_id} failed {count} times consecutively."
|
|
82
127
|
logger.error(msg)
|
|
83
128
|
self._last_alert_ts[run_id] = time.monotonic()
|
|
84
129
|
if self._webhook:
|
|
85
|
-
|
|
86
|
-
import json
|
|
87
|
-
import urllib.request
|
|
88
|
-
data = json.dumps({"text": msg}).encode()
|
|
89
|
-
req = urllib.request.Request(
|
|
90
|
-
self._webhook,
|
|
91
|
-
data=data,
|
|
92
|
-
headers={"Content-Type": "application/json"},
|
|
93
|
-
)
|
|
94
|
-
with urllib.request.urlopen(req, timeout=5):
|
|
95
|
-
pass
|
|
96
|
-
except Exception as exc:
|
|
97
|
-
# Log webhook-send failures at most once per cooldown window.
|
|
98
|
-
last_err = self._last_webhook_err_ts.get(run_id, 0.0)
|
|
99
|
-
if time.monotonic() - last_err >= _ALERT_COOLDOWN_S:
|
|
100
|
-
logger.warning("Webhook alert failed: %s", exc)
|
|
101
|
-
self._last_webhook_err_ts[run_id] = time.monotonic()
|
|
130
|
+
self._post_webhook(msg, run_id)
|
|
@@ -191,12 +191,20 @@ def create_app(
|
|
|
191
191
|
app.state.config_path = config_path
|
|
192
192
|
app.state.store = build_store(cfg.settings.data_path)
|
|
193
193
|
app.state.runs_store = build_runs_store(cfg.settings.data_path)
|
|
194
|
-
# Mark any runs left in 'running' state as 'stale'
|
|
195
|
-
#
|
|
196
|
-
#
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
194
|
+
# Mark any runs left in 'running' state as 'stale'. For standalone
|
|
195
|
+
# `dccd ui` (scheduler is None) this is the right call site — no
|
|
196
|
+
# scheduler has started any workers yet. For `dccd start` the sweep
|
|
197
|
+
# happens in cmd_start *before* the scheduler starts, so we must skip
|
|
198
|
+
# it here: sweeping at this point would stale-out the stream-worker
|
|
199
|
+
# rows the scheduler just created (the exact misuse warned about in
|
|
200
|
+
# RunsStore.mark_stale_running's docstring).
|
|
201
|
+
if scheduler is None:
|
|
202
|
+
_stale_count = app.state.runs_store.mark_stale_running()
|
|
203
|
+
if _stale_count:
|
|
204
|
+
logger.warning("marked %d orphaned run(s) stale (daemon restarted)", _stale_count)
|
|
205
|
+
_pruned_count = app.state.runs_store.prune_old_runs(cfg.settings.runs_retention_days)
|
|
206
|
+
if _pruned_count:
|
|
207
|
+
logger.info("pruned %d old terminal run(s) from runs.db (retention: %dd)", _pruned_count, cfg.settings.runs_retention_days)
|
|
200
208
|
app.state.coverage_store = build_coverage_store(cfg.settings.data_path)
|
|
201
209
|
app.state.event_bus = EventBus()
|
|
202
210
|
app.state.registry = build_registry()
|
|
@@ -443,6 +451,14 @@ def create_app(
|
|
|
443
451
|
await request.app.state.scheduler.sync_streams(request.app.state.all_specs)
|
|
444
452
|
await request.app.state.scheduler.sync_intervals(request.app.state.all_specs)
|
|
445
453
|
|
|
454
|
+
async def _active_run_for(request: Request, spec_id: str) -> str | None:
|
|
455
|
+
"""Return the run_id of the first active run matching *spec_id*, or None."""
|
|
456
|
+
active = await asyncio.to_thread(_runs(request).active_runs)
|
|
457
|
+
for row in active:
|
|
458
|
+
if row.get("spec_id") == spec_id:
|
|
459
|
+
return str(row["run_id"])
|
|
460
|
+
return None
|
|
461
|
+
|
|
446
462
|
def _run_backfill_tracked(request: Request, spec: JobSpec, run_id: str) -> None:
|
|
447
463
|
"""Spawn a backfill with a cancellable stop event registered by run_id."""
|
|
448
464
|
reg = _reg(request)
|
|
@@ -580,6 +596,11 @@ def create_app(
|
|
|
580
596
|
origin="runtime",
|
|
581
597
|
)
|
|
582
598
|
|
|
599
|
+
# Idempotency guard: if the spec is already running, return the existing run_id.
|
|
600
|
+
existing = await _active_run_for(request, spec.id)
|
|
601
|
+
if existing is not None:
|
|
602
|
+
return {"run_id": existing, "status": "already-running"}
|
|
603
|
+
|
|
583
604
|
# Generate a URL-safe run_id and pass it into backfill() so both the
|
|
584
605
|
# API polling endpoint and the RunsStore use the same identifier.
|
|
585
606
|
# We use a short UUID (no slashes) instead of embedding spec.id which
|
|
@@ -683,6 +704,11 @@ def create_app(
|
|
|
683
704
|
if spec.operation != "backfill":
|
|
684
705
|
raise HTTPException(400, "Only backfill jobs can be triggered manually; use /api/streams/start for stream jobs")
|
|
685
706
|
|
|
707
|
+
# Idempotency guard: if the spec is already running, return the existing run_id.
|
|
708
|
+
existing = await _active_run_for(request, spec.id)
|
|
709
|
+
if existing is not None:
|
|
710
|
+
return {"run_id": existing, "status": "already-running", "job_id": job_id}
|
|
711
|
+
|
|
686
712
|
run_id = str(_uuid.uuid4())
|
|
687
713
|
_run_backfill_tracked(request, spec, run_id)
|
|
688
714
|
|
|
@@ -696,12 +722,17 @@ def create_app(
|
|
|
696
722
|
backfill_specs = [s for s in specs if s.operation == "backfill" and s.enabled]
|
|
697
723
|
|
|
698
724
|
run_ids = []
|
|
725
|
+
already_running = []
|
|
699
726
|
for spec in backfill_specs:
|
|
727
|
+
existing = await _active_run_for(request, spec.id)
|
|
728
|
+
if existing is not None:
|
|
729
|
+
already_running.append({"run_id": existing, "job_id": spec.id})
|
|
730
|
+
continue
|
|
700
731
|
run_id = str(_uuid.uuid4())
|
|
701
732
|
run_ids.append({"run_id": run_id, "job_id": spec.id})
|
|
702
733
|
_run_backfill_tracked(request, spec, run_id)
|
|
703
734
|
|
|
704
|
-
return {"started": len(run_ids), "runs": run_ids}
|
|
735
|
+
return {"started": len(run_ids), "runs": run_ids, "already_running": already_running}
|
|
705
736
|
|
|
706
737
|
# -----------------------------------------------------------------------
|
|
707
738
|
# Config
|
|
@@ -160,6 +160,16 @@ def cmd_start(
|
|
|
160
160
|
cfg, cfg_path = _load_cfg(config)
|
|
161
161
|
store = build_store(cfg.settings.data_path)
|
|
162
162
|
runs_store = build_runs_store(cfg.settings.data_path)
|
|
163
|
+
# Sweep orphaned runs *before* the scheduler starts any stream workers so
|
|
164
|
+
# the lifespan (which opens the same DB) does not stale-out legitimate rows
|
|
165
|
+
# that the scheduler just created. This is the correct boot-path call site
|
|
166
|
+
# described in RunsStore.mark_stale_running's docstring.
|
|
167
|
+
_stale = runs_store.mark_stale_running()
|
|
168
|
+
if _stale > 0:
|
|
169
|
+
typer.echo(f"Marked {_stale} orphaned run(s) stale (daemon restarted)")
|
|
170
|
+
_pruned = runs_store.prune_old_runs(cfg.settings.runs_retention_days)
|
|
171
|
+
if _pruned > 0:
|
|
172
|
+
typer.echo(f"Pruned {_pruned} old terminal run(s) from runs.db (retention: {cfg.settings.runs_retention_days}d)")
|
|
163
173
|
coverage_store = build_coverage_store(cfg.settings.data_path)
|
|
164
174
|
registry = build_registry()
|
|
165
175
|
bus = EventBus()
|
|
@@ -102,10 +102,12 @@ class OKXSource(
|
|
|
102
102
|
return []
|
|
103
103
|
|
|
104
104
|
pair = self.render_symbol(symbol)
|
|
105
|
+
# OKX `before`/`after` are exclusive bounds. Without `- 1` the bar
|
|
106
|
+
# exactly at each page-window start would be silently dropped.
|
|
105
107
|
params: dict[str, Any] = {
|
|
106
108
|
"instId": pair,
|
|
107
109
|
"bar": bar,
|
|
108
|
-
"before": str(start_ns // 1_000_000),
|
|
110
|
+
"before": str(start_ns // 1_000_000 - 1),
|
|
109
111
|
"after": str(end_ns // 1_000_000),
|
|
110
112
|
"limit": min(limit, 100),
|
|
111
113
|
}
|
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
pressure.
|
|
3
3
|
|
|
4
4
|
**Safety contract**: this deletes local data that is recoverable only from the
|
|
5
|
-
remote, so it must be called **only when the remote
|
|
5
|
+
remote, so it must be called **only when the remote archive is up to date** — in
|
|
6
6
|
practice, right after a successful :func:`~dccd.application.operations.sync_remote`
|
|
7
|
-
cycle.
|
|
7
|
+
cycle. Because :class:`~dccd.storage.remote.RemoteStorage` uses ``rclone copy``
|
|
8
|
+
(never ``rclone sync``), purged files remain on the remote and can be pulled
|
|
9
|
+
back by :meth:`~dccd.storage.remote.RemoteStorage.restore`. The coverage manifest (``CoverageStore``) preserves the resume cursor, so a
|
|
8
10
|
later ``backfill(start="last")`` still continues from where collection left off;
|
|
9
11
|
reads of purged ranges return what's local until read-through restore pulls them
|
|
10
12
|
back.
|
|
@@ -41,7 +43,7 @@ def purge_to_free_space(
|
|
|
41
43
|
"""Delete oldest Parquet files until free space reaches ``min_free_gb``.
|
|
42
44
|
|
|
43
45
|
Files are removed oldest-first (by mtime), so recent data stays local while
|
|
44
|
-
old data — already
|
|
46
|
+
old data — already copied off-box — is dropped. The ``.dccd`` directory is
|
|
45
47
|
excluded. No-op when ``min_free_gb <= 0`` or free space is already above the
|
|
46
48
|
threshold.
|
|
47
49
|
|
|
@@ -1,4 +1,17 @@
|
|
|
1
|
-
"""Remote storage
|
|
1
|
+
"""Remote storage copy via rclone.
|
|
2
|
+
|
|
3
|
+
The remote is an *archive superset* of the local store: files are copied
|
|
4
|
+
off-box but never deleted remotely. Local = hot tier (fast access, space
|
|
5
|
+
pressure managed by the purge subsystem); remote = complete history archive.
|
|
6
|
+
This means:
|
|
7
|
+
|
|
8
|
+
- A local file that is purged to free disk space still exists on the remote
|
|
9
|
+
and can be pulled back by :meth:`RemoteStorage.restore` (read-through
|
|
10
|
+
restore).
|
|
11
|
+
- Remote cleanup (removing datasets you no longer want) is a **manual**
|
|
12
|
+
operation — use ``rclone delete`` or the provider console. dccd will
|
|
13
|
+
never delete from the remote automatically.
|
|
14
|
+
"""
|
|
2
15
|
|
|
3
16
|
from __future__ import annotations
|
|
4
17
|
|
|
@@ -13,12 +26,18 @@ logger = logging.getLogger(__name__)
|
|
|
13
26
|
|
|
14
27
|
|
|
15
28
|
class RemoteStorage:
|
|
16
|
-
"""
|
|
29
|
+
"""Copy local data to one or more rclone remotes (non-destructive).
|
|
30
|
+
|
|
31
|
+
Each sync cycle runs ``rclone copy`` (not ``rclone sync``) so that files
|
|
32
|
+
present on the remote but absent locally — e.g. files purged from the hot
|
|
33
|
+
tier to reclaim disk — are **never deleted**. The remote grows
|
|
34
|
+
monotonically and acts as a complete off-box archive; :meth:`restore` pulls
|
|
35
|
+
individual dataset directories back on demand.
|
|
17
36
|
|
|
18
37
|
Parameters
|
|
19
38
|
----------
|
|
20
39
|
local_path : str or Path
|
|
21
|
-
Local data directory to
|
|
40
|
+
Local data directory to copy from.
|
|
22
41
|
remotes : list of dicts
|
|
23
42
|
Each dict has ``provider`` and ``remote`` keys.
|
|
24
43
|
"""
|
|
@@ -32,24 +51,30 @@ class RemoteStorage:
|
|
|
32
51
|
self._remotes = remotes or []
|
|
33
52
|
|
|
34
53
|
def sync_one(self, remote: str) -> bool:
|
|
35
|
-
"""
|
|
54
|
+
"""Copy local store to a single rclone remote. Returns True on success.
|
|
55
|
+
|
|
56
|
+
Uses ``rclone copy`` (not ``rclone sync``) so files that exist on the
|
|
57
|
+
remote but are absent locally are preserved. This guarantees that
|
|
58
|
+
files purged from the local hot tier remain available on the remote for
|
|
59
|
+
read-through :meth:`restore`.
|
|
60
|
+
"""
|
|
36
61
|
try:
|
|
37
62
|
result = subprocess.run(
|
|
38
|
-
["rclone", "
|
|
63
|
+
["rclone", "copy", str(self._local), remote, "--quiet"],
|
|
39
64
|
capture_output=True,
|
|
40
65
|
text=True,
|
|
41
66
|
timeout=300,
|
|
42
67
|
)
|
|
43
68
|
if result.returncode != 0:
|
|
44
|
-
logger.error("rclone
|
|
69
|
+
logger.error("rclone copy to %s failed: %s", remote, result.stderr)
|
|
45
70
|
return False
|
|
46
|
-
logger.info("
|
|
71
|
+
logger.info("Copied to %s", remote)
|
|
47
72
|
return True
|
|
48
73
|
except FileNotFoundError:
|
|
49
74
|
logger.error("rclone not found in PATH")
|
|
50
75
|
return False
|
|
51
76
|
except subprocess.TimeoutExpired:
|
|
52
|
-
logger.error("rclone
|
|
77
|
+
logger.error("rclone copy to %s timed out", remote)
|
|
53
78
|
return False
|
|
54
79
|
|
|
55
80
|
def restore(self, rel_path: str) -> bool:
|
|
@@ -178,6 +178,59 @@ class RunsStore:
|
|
|
178
178
|
"""Runs currently ``running`` or ``reconnecting``."""
|
|
179
179
|
return self.list_runs(state="running") + self.list_runs(state="reconnecting")
|
|
180
180
|
|
|
181
|
+
def prune_old_runs(self, retention_days: int) -> int:
|
|
182
|
+
"""Delete terminal non-failed runs older than *retention_days* days.
|
|
183
|
+
|
|
184
|
+
Runs in states ``succeeded``, ``stale``, and ``cancelled`` that started
|
|
185
|
+
more than *retention_days* days ago are removed. ``failed`` rows are
|
|
186
|
+
intentionally kept as the long-term error journal. The database is
|
|
187
|
+
``VACUUM``-ed after any deletion to reclaim disk space.
|
|
188
|
+
|
|
189
|
+
Parameters
|
|
190
|
+
----------
|
|
191
|
+
retention_days : int
|
|
192
|
+
Number of days to retain terminal non-failed runs. Pass ``0`` (or
|
|
193
|
+
any value ``<= 0``) to disable pruning; the method returns ``0``
|
|
194
|
+
immediately without touching the database.
|
|
195
|
+
|
|
196
|
+
Returns
|
|
197
|
+
-------
|
|
198
|
+
int
|
|
199
|
+
Number of rows deleted (0 when pruning is disabled or when no rows
|
|
200
|
+
match the cutoff).
|
|
201
|
+
|
|
202
|
+
Notes
|
|
203
|
+
-----
|
|
204
|
+
``VACUUM`` cannot run inside a transaction. This method opens a
|
|
205
|
+
separate connection (outside the :meth:`_conn` context manager) for the
|
|
206
|
+
``VACUUM`` statement, which is executed only when at least one row was
|
|
207
|
+
deleted.
|
|
208
|
+
|
|
209
|
+
This method must be called from the daemon boot path *after*
|
|
210
|
+
:meth:`mark_stale_running` so that freshly-staled orphans age normally
|
|
211
|
+
rather than being immediately pruned on the next boot.
|
|
212
|
+
"""
|
|
213
|
+
if retention_days <= 0:
|
|
214
|
+
return 0
|
|
215
|
+
import time
|
|
216
|
+
cutoff_ns = int(time.time() * 1_000_000_000) - int(retention_days * 86400 * 1_000_000_000)
|
|
217
|
+
with self._conn() as conn:
|
|
218
|
+
cursor = conn.execute(
|
|
219
|
+
"""DELETE FROM runs
|
|
220
|
+
WHERE state IN ('succeeded', 'stale', 'cancelled')
|
|
221
|
+
AND started_at < ?""",
|
|
222
|
+
(cutoff_ns,),
|
|
223
|
+
)
|
|
224
|
+
deleted = cursor.rowcount
|
|
225
|
+
if deleted > 0:
|
|
226
|
+
# VACUUM cannot run inside a transaction — open a plain connection.
|
|
227
|
+
conn2 = sqlite3.connect(str(self._path))
|
|
228
|
+
try:
|
|
229
|
+
conn2.execute("VACUUM")
|
|
230
|
+
finally:
|
|
231
|
+
conn2.close()
|
|
232
|
+
return deleted
|
|
233
|
+
|
|
181
234
|
def mark_stale_running(self) -> int:
|
|
182
235
|
"""Transition all ``running`` rows to ``stale`` at daemon boot.
|
|
183
236
|
|
|
@@ -201,9 +254,12 @@ class RunsStore:
|
|
|
201
254
|
clearly attributes the state change to a restart rather than a normal
|
|
202
255
|
completion or a user-visible error.
|
|
203
256
|
|
|
204
|
-
This method must only be called from the daemon boot path
|
|
205
|
-
|
|
206
|
-
|
|
257
|
+
This method must only be called from the daemon boot path, before any
|
|
258
|
+
new runs are started: ``cmd_start`` for ``dccd start`` (called before
|
|
259
|
+
the scheduler starts stream workers); the FastAPI lifespan for
|
|
260
|
+
standalone ``dccd ui`` (called before the standalone scheduler is
|
|
261
|
+
created). Calling it while workers are already running would
|
|
262
|
+
incorrectly stale-out their legitimate active runs.
|
|
207
263
|
"""
|
|
208
264
|
import time
|
|
209
265
|
now = int(time.time() * 1_000_000_000)
|