loki-mode 7.56.0 → 7.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/SKILL.md +2 -2
- package/VERSION +1 -1
- package/autonomy/app-runner.sh +101 -0
- package/autonomy/lib/prd-enrich.sh +437 -0
- package/autonomy/loki +58 -9
- package/autonomy/run.sh +175 -60
- package/dashboard/__init__.py +1 -1
- package/dashboard/server.py +652 -194
- package/dashboard/static/index.html +164 -151
- package/docs/INSTALLATION.md +2 -2
- package/loki-ts/dist/loki.js +2 -2
- package/mcp/__init__.py +1 -1
- package/memory/consolidation.py +14 -2
- package/memory/retrieval.py +10 -0
- package/memory/storage.py +10 -0
- package/package.json +1 -1
- package/plugins/loki-mode/.claude-plugin/plugin.json +1 -1
- package/skills/quality-gates.md +135 -11
package/dashboard/server.py
CHANGED
|
@@ -11,6 +11,7 @@ import json
|
|
|
11
11
|
import logging
|
|
12
12
|
import os
|
|
13
13
|
import subprocess
|
|
14
|
+
import threading
|
|
14
15
|
import time
|
|
15
16
|
from collections import defaultdict
|
|
16
17
|
from dataclasses import asdict
|
|
@@ -3864,34 +3865,40 @@ async def get_memory_summary():
|
|
|
3864
3865
|
@app.get("/api/memory/episodes")
|
|
3865
3866
|
async def list_episodes(limit: int = Query(default=50, ge=1, le=1000)):
|
|
3866
3867
|
"""List episodic memory entries."""
|
|
3867
|
-
#
|
|
3868
|
-
|
|
3869
|
-
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
ep = storage.load_episode(eid)
|
|
3875
|
-
if ep:
|
|
3876
|
-
episodes.append(ep)
|
|
3877
|
-
return episodes
|
|
3878
|
-
except Exception:
|
|
3879
|
-
pass
|
|
3880
|
-
|
|
3881
|
-
# Fallback to JSON files -- use heapq to avoid sorting all files
|
|
3882
|
-
import heapq
|
|
3883
|
-
ep_dir = _get_loki_dir() / "memory" / "episodic"
|
|
3884
|
-
episodes = []
|
|
3885
|
-
if ep_dir.exists():
|
|
3886
|
-
all_files = ep_dir.glob("*.json")
|
|
3887
|
-
# nlargest by filename (timestamps sort lexicographically) avoids full sort
|
|
3888
|
-
files = heapq.nlargest(limit, all_files, key=lambda f: f.name)
|
|
3889
|
-
for f in files:
|
|
3868
|
+
# Both backends below are blocking (SQLite queries / a glob+read loop over
|
|
3869
|
+
# many JSON files) and only build a local list, so offload the whole read
|
|
3870
|
+
# off the event loop to keep status + WS heartbeat responsive.
|
|
3871
|
+
def _load_episodes() -> list:
|
|
3872
|
+
# Try SQLite backend first
|
|
3873
|
+
storage = _get_memory_storage()
|
|
3874
|
+
if storage is not None:
|
|
3890
3875
|
try:
|
|
3891
|
-
|
|
3876
|
+
ids = storage.list_episodes(limit=limit)
|
|
3877
|
+
episodes = []
|
|
3878
|
+
for eid in ids:
|
|
3879
|
+
ep = storage.load_episode(eid)
|
|
3880
|
+
if ep:
|
|
3881
|
+
episodes.append(ep)
|
|
3882
|
+
return episodes
|
|
3892
3883
|
except Exception:
|
|
3893
3884
|
pass
|
|
3894
|
-
|
|
3885
|
+
|
|
3886
|
+
# Fallback to JSON files -- use heapq to avoid sorting all files
|
|
3887
|
+
import heapq
|
|
3888
|
+
ep_dir = _get_loki_dir() / "memory" / "episodic"
|
|
3889
|
+
episodes = []
|
|
3890
|
+
if ep_dir.exists():
|
|
3891
|
+
all_files = ep_dir.glob("*.json")
|
|
3892
|
+
# nlargest by filename (timestamps sort lexicographically) avoids full sort
|
|
3893
|
+
files = heapq.nlargest(limit, all_files, key=lambda f: f.name)
|
|
3894
|
+
for f in files:
|
|
3895
|
+
try:
|
|
3896
|
+
episodes.append(json.loads(f.read_text()))
|
|
3897
|
+
except Exception:
|
|
3898
|
+
pass
|
|
3899
|
+
return episodes
|
|
3900
|
+
|
|
3901
|
+
return await asyncio.to_thread(_load_episodes)
|
|
3895
3902
|
|
|
3896
3903
|
|
|
3897
3904
|
@app.get("/api/memory/episodes/{episode_id}", dependencies=[Depends(auth.require_scope("read"))])
|
|
@@ -3968,30 +3975,35 @@ async def get_pattern(pattern_id: str):
|
|
|
3968
3975
|
@app.get("/api/memory/skills")
|
|
3969
3976
|
async def list_skills():
|
|
3970
3977
|
"""List procedural skills."""
|
|
3971
|
-
#
|
|
3972
|
-
|
|
3973
|
-
|
|
3974
|
-
|
|
3975
|
-
|
|
3976
|
-
|
|
3977
|
-
for sid in ids:
|
|
3978
|
-
s = storage.load_skill(sid)
|
|
3979
|
-
if s:
|
|
3980
|
-
skills.append(s)
|
|
3981
|
-
return skills
|
|
3982
|
-
except Exception:
|
|
3983
|
-
pass
|
|
3984
|
-
|
|
3985
|
-
# Fallback to JSON
|
|
3986
|
-
skills_dir = _get_loki_dir() / "memory" / "skills"
|
|
3987
|
-
skills = []
|
|
3988
|
-
if skills_dir.exists():
|
|
3989
|
-
for f in sorted(skills_dir.glob("*.json")):
|
|
3978
|
+
# Blocking SQLite query / glob+read loop; offload the whole read so the
|
|
3979
|
+
# event loop (status + WS heartbeat) stays responsive.
|
|
3980
|
+
def _load_skills() -> list:
|
|
3981
|
+
# Try SQLite first
|
|
3982
|
+
storage = _get_memory_storage()
|
|
3983
|
+
if storage is not None:
|
|
3990
3984
|
try:
|
|
3991
|
-
|
|
3985
|
+
ids = storage.list_skills()
|
|
3986
|
+
skills = []
|
|
3987
|
+
for sid in ids:
|
|
3988
|
+
s = storage.load_skill(sid)
|
|
3989
|
+
if s:
|
|
3990
|
+
skills.append(s)
|
|
3991
|
+
return skills
|
|
3992
3992
|
except Exception:
|
|
3993
3993
|
pass
|
|
3994
|
-
|
|
3994
|
+
|
|
3995
|
+
# Fallback to JSON
|
|
3996
|
+
skills_dir = _get_loki_dir() / "memory" / "skills"
|
|
3997
|
+
skills = []
|
|
3998
|
+
if skills_dir.exists():
|
|
3999
|
+
for f in sorted(skills_dir.glob("*.json")):
|
|
4000
|
+
try:
|
|
4001
|
+
skills.append(json.loads(f.read_text()))
|
|
4002
|
+
except Exception:
|
|
4003
|
+
pass
|
|
4004
|
+
return skills
|
|
4005
|
+
|
|
4006
|
+
return await asyncio.to_thread(_load_skills)
|
|
3995
4007
|
|
|
3996
4008
|
|
|
3997
4009
|
@app.get("/api/memory/skills/{skill_id}", dependencies=[Depends(auth.require_scope("read"))])
|
|
@@ -4346,15 +4358,16 @@ async def get_memory_file(
|
|
|
4346
4358
|
st = target.stat()
|
|
4347
4359
|
except Exception:
|
|
4348
4360
|
raise HTTPException(status_code=500, detail="stat failed")
|
|
4349
|
-
truncated =
|
|
4361
|
+
truncated = st.st_size > _MEMORY_FILE_MAX_BYTES
|
|
4362
|
+
|
|
4363
|
+
def _read_memory_blob() -> bytes:
|
|
4364
|
+
# Up to a 2 MiB blocking read; offloaded so the single-worker event
|
|
4365
|
+
# loop (and /api/status + WS heartbeat) stays responsive.
|
|
4366
|
+
with open(target, "rb") as fh:
|
|
4367
|
+
return fh.read(_MEMORY_FILE_MAX_BYTES) if truncated else fh.read()
|
|
4368
|
+
|
|
4350
4369
|
try:
|
|
4351
|
-
|
|
4352
|
-
with open(target, "rb") as fh:
|
|
4353
|
-
raw = fh.read(_MEMORY_FILE_MAX_BYTES)
|
|
4354
|
-
truncated = True
|
|
4355
|
-
else:
|
|
4356
|
-
with open(target, "rb") as fh:
|
|
4357
|
-
raw = fh.read()
|
|
4370
|
+
raw = await asyncio.to_thread(_read_memory_blob)
|
|
4358
4371
|
# Decode as UTF-8 with replacement so we never 500 on a stray byte.
|
|
4359
4372
|
content = raw.decode("utf-8", errors="replace")
|
|
4360
4373
|
except HTTPException:
|
|
@@ -4432,44 +4445,49 @@ async def search_memory(
|
|
|
4432
4445
|
@app.get("/api/memory/stats")
|
|
4433
4446
|
async def get_memory_stats():
|
|
4434
4447
|
"""Get memory system statistics (counts, size, backend info)."""
|
|
4435
|
-
|
|
4436
|
-
|
|
4437
|
-
|
|
4438
|
-
|
|
4439
|
-
|
|
4440
|
-
|
|
4448
|
+
# SQLite stats query or a directory-walk over many JSON files; both block,
|
|
4449
|
+
# so offload off the event loop.
|
|
4450
|
+
def _compute_stats() -> dict:
|
|
4451
|
+
storage = _get_memory_storage()
|
|
4452
|
+
if storage is not None:
|
|
4453
|
+
try:
|
|
4454
|
+
return storage.get_stats()
|
|
4455
|
+
except Exception:
|
|
4456
|
+
pass
|
|
4441
4457
|
|
|
4442
|
-
|
|
4443
|
-
|
|
4444
|
-
|
|
4445
|
-
|
|
4446
|
-
|
|
4447
|
-
|
|
4448
|
-
|
|
4449
|
-
|
|
4450
|
-
|
|
4451
|
-
|
|
4452
|
-
|
|
4453
|
-
|
|
4454
|
-
|
|
4455
|
-
|
|
4456
|
-
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
4458
|
+
# Fallback: compute stats from JSON files
|
|
4459
|
+
memory_dir = _get_loki_dir() / "memory"
|
|
4460
|
+
ep_count = 0
|
|
4461
|
+
ep_dir = memory_dir / "episodic"
|
|
4462
|
+
if ep_dir.exists():
|
|
4463
|
+
for d in ep_dir.iterdir():
|
|
4464
|
+
if d.is_dir():
|
|
4465
|
+
ep_count += len(list(d.glob("*.json")))
|
|
4466
|
+
elif d.suffix == ".json":
|
|
4467
|
+
ep_count += 1
|
|
4468
|
+
|
|
4469
|
+
pat_count = 0
|
|
4470
|
+
patterns_file = memory_dir / "semantic" / "patterns.json"
|
|
4471
|
+
if patterns_file.exists():
|
|
4472
|
+
try:
|
|
4473
|
+
data = json.loads(patterns_file.read_text())
|
|
4474
|
+
pat_count = len(data) if isinstance(data, list) else len(data.get("patterns", []))
|
|
4475
|
+
except Exception:
|
|
4476
|
+
pass
|
|
4461
4477
|
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4478
|
+
skill_count = 0
|
|
4479
|
+
skills_dir = memory_dir / "skills"
|
|
4480
|
+
if skills_dir.exists():
|
|
4481
|
+
skill_count = len(list(skills_dir.glob("*.json")))
|
|
4466
4482
|
|
|
4467
|
-
|
|
4468
|
-
|
|
4469
|
-
|
|
4470
|
-
|
|
4471
|
-
|
|
4472
|
-
|
|
4483
|
+
return {
|
|
4484
|
+
"backend": "json",
|
|
4485
|
+
"episode_count": ep_count,
|
|
4486
|
+
"pattern_count": pat_count,
|
|
4487
|
+
"skill_count": skill_count,
|
|
4488
|
+
}
|
|
4489
|
+
|
|
4490
|
+
return await asyncio.to_thread(_compute_stats)
|
|
4473
4491
|
|
|
4474
4492
|
|
|
4475
4493
|
# Learning/metrics endpoints
|
|
@@ -4515,10 +4533,10 @@ async def get_learning_metrics(
|
|
|
4515
4533
|
source: Optional[str] = None,
|
|
4516
4534
|
):
|
|
4517
4535
|
"""Get learning metrics from events, metrics files, and learning signals."""
|
|
4518
|
-
events = _read_events
|
|
4536
|
+
events = await asyncio.to_thread(_read_events, timeRange)
|
|
4519
4537
|
|
|
4520
4538
|
# Also read from learning signals directory
|
|
4521
|
-
all_signals = _read_learning_signals
|
|
4539
|
+
all_signals = await asyncio.to_thread(_read_learning_signals, limit=10000)
|
|
4522
4540
|
|
|
4523
4541
|
# Filter by type and source
|
|
4524
4542
|
if signalType:
|
|
@@ -4595,7 +4613,7 @@ async def get_learning_trends(
|
|
|
4595
4613
|
source: Optional[str] = None,
|
|
4596
4614
|
):
|
|
4597
4615
|
"""Get learning trend data."""
|
|
4598
|
-
events = _read_events
|
|
4616
|
+
events = await asyncio.to_thread(_read_events, timeRange)
|
|
4599
4617
|
# Group by hour for trend data
|
|
4600
4618
|
by_hour: dict = {}
|
|
4601
4619
|
for e in events:
|
|
@@ -4617,14 +4635,14 @@ async def get_learning_signals(
|
|
|
4617
4635
|
offset: int = Query(default=0, ge=0),
|
|
4618
4636
|
):
|
|
4619
4637
|
"""Get raw learning signals from both events.jsonl and learning signals directory."""
|
|
4620
|
-
events = _read_events
|
|
4638
|
+
events = await asyncio.to_thread(_read_events, timeRange)
|
|
4621
4639
|
if signalType:
|
|
4622
4640
|
events = [e for e in events if e.get("type") == signalType]
|
|
4623
4641
|
if source:
|
|
4624
4642
|
events = [e for e in events if e.get("data", {}).get("source") == source]
|
|
4625
4643
|
|
|
4626
4644
|
# Also read from learning signals directory
|
|
4627
|
-
file_signals = _read_learning_signals
|
|
4645
|
+
file_signals = await asyncio.to_thread(_read_learning_signals, signal_type=signalType, limit=10000)
|
|
4628
4646
|
if source:
|
|
4629
4647
|
file_signals = [s for s in file_signals if s.get("source") == source]
|
|
4630
4648
|
|
|
@@ -4648,10 +4666,10 @@ async def get_learning_aggregation():
|
|
|
4648
4666
|
pass
|
|
4649
4667
|
|
|
4650
4668
|
# Supplement with live data from learning signals directory
|
|
4651
|
-
success_signals = _read_learning_signals
|
|
4652
|
-
tool_signals = _read_learning_signals
|
|
4653
|
-
error_signals = _read_learning_signals
|
|
4654
|
-
pref_signals = _read_learning_signals
|
|
4669
|
+
success_signals = await asyncio.to_thread(_read_learning_signals, signal_type="success_pattern", limit=500)
|
|
4670
|
+
tool_signals = await asyncio.to_thread(_read_learning_signals, signal_type="tool_efficiency", limit=500)
|
|
4671
|
+
error_signals = await asyncio.to_thread(_read_learning_signals, signal_type="error_pattern", limit=500)
|
|
4672
|
+
pref_signals = await asyncio.to_thread(_read_learning_signals, signal_type="user_preference", limit=500)
|
|
4655
4673
|
|
|
4656
4674
|
# Merge success patterns from signals if aggregation file had none
|
|
4657
4675
|
if not result.get("success_patterns") and success_signals:
|
|
@@ -4725,6 +4743,14 @@ async def trigger_aggregation():
|
|
|
4725
4743
|
if not _read_limiter.check("learning_aggregate"):
|
|
4726
4744
|
raise HTTPException(status_code=429, detail="Rate limit exceeded")
|
|
4727
4745
|
|
|
4746
|
+
# Reads up to 10 MB of events.jsonl, parses every line, then writes the
|
|
4747
|
+
# aggregation.json metrics file. All blocking, all on local state +
|
|
4748
|
+
# filesystem (no shared in-memory state), so offload the whole computation
|
|
4749
|
+
# to a thread to keep the event loop (status + WS heartbeat) responsive.
|
|
4750
|
+
return await asyncio.to_thread(_compute_learning_aggregation)
|
|
4751
|
+
|
|
4752
|
+
|
|
4753
|
+
def _compute_learning_aggregation() -> dict:
|
|
4728
4754
|
events_file = _get_loki_dir() / "events.jsonl"
|
|
4729
4755
|
preferences: dict = {}
|
|
4730
4756
|
error_patterns: dict = {}
|
|
@@ -4820,10 +4846,10 @@ async def trigger_aggregation():
|
|
|
4820
4846
|
@app.get("/api/learning/preferences", dependencies=[Depends(auth.require_scope("read"))])
|
|
4821
4847
|
async def get_learning_preferences(limit: int = Query(default=50, ge=1, le=1000)):
|
|
4822
4848
|
"""Get aggregated user preferences from events and learning signals directory."""
|
|
4823
|
-
events = _read_events
|
|
4849
|
+
events = await asyncio.to_thread(_read_events, "30d")
|
|
4824
4850
|
prefs = [e for e in events if e.get("type") == "user_preference"]
|
|
4825
4851
|
# Also read from learning signals directory
|
|
4826
|
-
file_prefs = _read_learning_signals
|
|
4852
|
+
file_prefs = await asyncio.to_thread(_read_learning_signals, signal_type="user_preference", limit=limit)
|
|
4827
4853
|
combined = prefs + file_prefs
|
|
4828
4854
|
combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
|
|
4829
4855
|
return combined[:limit]
|
|
@@ -4832,10 +4858,10 @@ async def get_learning_preferences(limit: int = Query(default=50, ge=1, le=1000)
|
|
|
4832
4858
|
@app.get("/api/learning/errors", dependencies=[Depends(auth.require_scope("read"))])
|
|
4833
4859
|
async def get_learning_errors(limit: int = Query(default=50, ge=1, le=1000)):
|
|
4834
4860
|
"""Get aggregated error patterns from events and learning signals directory."""
|
|
4835
|
-
events = _read_events
|
|
4861
|
+
events = await asyncio.to_thread(_read_events, "30d")
|
|
4836
4862
|
errors = [e for e in events if e.get("type") == "error_pattern"]
|
|
4837
4863
|
# Also read from learning signals directory
|
|
4838
|
-
file_errors = _read_learning_signals
|
|
4864
|
+
file_errors = await asyncio.to_thread(_read_learning_signals, signal_type="error_pattern", limit=limit)
|
|
4839
4865
|
combined = errors + file_errors
|
|
4840
4866
|
combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
|
|
4841
4867
|
return combined[:limit]
|
|
@@ -4844,10 +4870,10 @@ async def get_learning_errors(limit: int = Query(default=50, ge=1, le=1000)):
|
|
|
4844
4870
|
@app.get("/api/learning/success", dependencies=[Depends(auth.require_scope("read"))])
|
|
4845
4871
|
async def get_learning_success(limit: int = Query(default=50, ge=1, le=1000)):
|
|
4846
4872
|
"""Get aggregated success patterns from events and learning signals directory."""
|
|
4847
|
-
events = _read_events
|
|
4873
|
+
events = await asyncio.to_thread(_read_events, "30d")
|
|
4848
4874
|
successes = [e for e in events if e.get("type") == "success_pattern"]
|
|
4849
4875
|
# Also read from learning signals directory
|
|
4850
|
-
file_successes = _read_learning_signals
|
|
4876
|
+
file_successes = await asyncio.to_thread(_read_learning_signals, signal_type="success_pattern", limit=limit)
|
|
4851
4877
|
combined = successes + file_successes
|
|
4852
4878
|
combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
|
|
4853
4879
|
return combined[:limit]
|
|
@@ -4856,10 +4882,10 @@ async def get_learning_success(limit: int = Query(default=50, ge=1, le=1000)):
|
|
|
4856
4882
|
@app.get("/api/learning/tools", dependencies=[Depends(auth.require_scope("read"))])
|
|
4857
4883
|
async def get_tool_efficiency(limit: int = Query(default=50, ge=1, le=1000)):
|
|
4858
4884
|
"""Get tool efficiency rankings from events and learning signals directory."""
|
|
4859
|
-
events = _read_events
|
|
4885
|
+
events = await asyncio.to_thread(_read_events, "30d")
|
|
4860
4886
|
tools = [e for e in events if e.get("type") == "tool_efficiency"]
|
|
4861
4887
|
# Also read from learning signals directory
|
|
4862
|
-
file_tools = _read_learning_signals
|
|
4888
|
+
file_tools = await asyncio.to_thread(_read_learning_signals, signal_type="tool_efficiency", limit=limit)
|
|
4863
4889
|
combined = tools + file_tools
|
|
4864
4890
|
combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
|
|
4865
4891
|
return combined[:limit]
|
|
@@ -5203,7 +5229,16 @@ def _calculate_model_cost(model: str, input_tokens: int, output_tokens: int) ->
|
|
|
5203
5229
|
|
|
5204
5230
|
@app.get("/api/cost")
|
|
5205
5231
|
async def get_cost():
|
|
5206
|
-
"""Get cost visibility data from .loki/metrics/efficiency/ and budget.json.
|
|
5232
|
+
"""Get cost visibility data from .loki/metrics/efficiency/ and budget.json.
|
|
5233
|
+
|
|
5234
|
+
The computation globs + reads every per-iteration efficiency JSON file
|
|
5235
|
+
(a blocking multi-file read loop building only local aggregates), so it is
|
|
5236
|
+
offloaded to a thread to keep the event loop responsive.
|
|
5237
|
+
"""
|
|
5238
|
+
return await asyncio.to_thread(_compute_cost_snapshot)
|
|
5239
|
+
|
|
5240
|
+
|
|
5241
|
+
def _compute_cost_snapshot() -> dict:
|
|
5207
5242
|
loki_dir = _get_loki_dir()
|
|
5208
5243
|
efficiency_dir = loki_dir / "metrics" / "efficiency"
|
|
5209
5244
|
budget_file = loki_dir / "metrics" / "budget.json"
|
|
@@ -5470,7 +5505,15 @@ async def get_cost_timeline():
|
|
|
5470
5505
|
classifies into ok/warn/exceeded so the UI can warn at 80% before the cap.
|
|
5471
5506
|
Cost is never fabricated: when nothing was recorded, cost_recorded is False
|
|
5472
5507
|
and totals are honestly null rather than a misleading $0.00.
|
|
5508
|
+
|
|
5509
|
+
Globs + reads every efficiency iteration file and every proof.json (a
|
|
5510
|
+
blocking multi-file read loop building only local state), so it is offloaded
|
|
5511
|
+
to a thread to keep the event loop responsive.
|
|
5473
5512
|
"""
|
|
5513
|
+
return await asyncio.to_thread(_compute_cost_timeline)
|
|
5514
|
+
|
|
5515
|
+
|
|
5516
|
+
def _compute_cost_timeline() -> dict:
|
|
5474
5517
|
loki_dir = _get_loki_dir()
|
|
5475
5518
|
efficiency_dir = loki_dir / "metrics" / "efficiency"
|
|
5476
5519
|
|
|
@@ -5729,51 +5772,59 @@ async def get_council_state():
|
|
|
5729
5772
|
|
|
5730
5773
|
@app.get("/api/council/verdicts")
|
|
5731
5774
|
async def get_council_verdicts(limit: int = Query(default=20, ge=1, le=1000)):
|
|
5732
|
-
"""Get council vote history (decision log).
|
|
5733
|
-
state_file = _get_loki_dir() / "council" / "state.json"
|
|
5734
|
-
verdicts = []
|
|
5735
|
-
if state_file.exists():
|
|
5736
|
-
try:
|
|
5737
|
-
state = json.loads(state_file.read_text())
|
|
5738
|
-
verdicts = state.get("verdicts", [])
|
|
5739
|
-
except Exception:
|
|
5740
|
-
pass
|
|
5775
|
+
"""Get council vote history (decision log).
|
|
5741
5776
|
|
|
5742
|
-
|
|
5743
|
-
|
|
5744
|
-
|
|
5745
|
-
|
|
5746
|
-
|
|
5747
|
-
|
|
5748
|
-
|
|
5749
|
-
|
|
5750
|
-
|
|
5751
|
-
|
|
5752
|
-
|
|
5753
|
-
|
|
5754
|
-
|
|
5755
|
-
verdict_detail["evidence_preview"] = ""
|
|
5756
|
-
# Read member votes
|
|
5757
|
-
members = []
|
|
5758
|
-
for member_file in sorted(vote_dir.glob("member-*.txt")):
|
|
5759
|
-
try:
|
|
5760
|
-
content = member_file.read_text().strip()
|
|
5761
|
-
members.append({
|
|
5762
|
-
"member": member_file.stem,
|
|
5763
|
-
"content": content
|
|
5764
|
-
})
|
|
5765
|
-
except Exception:
|
|
5766
|
-
pass
|
|
5767
|
-
verdict_detail["members"] = members
|
|
5768
|
-
# Read contrarian
|
|
5769
|
-
contrarian_file = vote_dir / "contrarian.txt"
|
|
5770
|
-
if contrarian_file.exists():
|
|
5771
|
-
verdict_detail["contrarian"] = contrarian_file.read_text().strip()
|
|
5772
|
-
detailed_verdicts.append(verdict_detail)
|
|
5773
|
-
if len(detailed_verdicts) >= limit:
|
|
5774
|
-
break
|
|
5777
|
+
Walks every vote directory and reads its evidence/member/contrarian files
|
|
5778
|
+
(a blocking multi-file read loop building only local state), so it is
|
|
5779
|
+
offloaded to a thread to keep the event loop responsive.
|
|
5780
|
+
"""
|
|
5781
|
+
def _collect_verdicts() -> dict:
|
|
5782
|
+
state_file = _get_loki_dir() / "council" / "state.json"
|
|
5783
|
+
verdicts = []
|
|
5784
|
+
if state_file.exists():
|
|
5785
|
+
try:
|
|
5786
|
+
state = json.loads(state_file.read_text())
|
|
5787
|
+
verdicts = state.get("verdicts", [])
|
|
5788
|
+
except Exception:
|
|
5789
|
+
pass
|
|
5775
5790
|
|
|
5776
|
-
|
|
5791
|
+
# Also read individual vote files for detail
|
|
5792
|
+
votes_dir = _get_loki_dir() / "council" / "votes"
|
|
5793
|
+
detailed_verdicts = []
|
|
5794
|
+
if votes_dir.exists():
|
|
5795
|
+
for vote_dir in sorted(votes_dir.iterdir(), reverse=True):
|
|
5796
|
+
if vote_dir.is_dir():
|
|
5797
|
+
verdict_detail = {"iteration": vote_dir.name}
|
|
5798
|
+
# Read evidence
|
|
5799
|
+
evidence_file = vote_dir / "evidence.md"
|
|
5800
|
+
if evidence_file.exists():
|
|
5801
|
+
try:
|
|
5802
|
+
verdict_detail["evidence_preview"] = evidence_file.read_text()[:500]
|
|
5803
|
+
except Exception:
|
|
5804
|
+
verdict_detail["evidence_preview"] = ""
|
|
5805
|
+
# Read member votes
|
|
5806
|
+
members = []
|
|
5807
|
+
for member_file in sorted(vote_dir.glob("member-*.txt")):
|
|
5808
|
+
try:
|
|
5809
|
+
content = member_file.read_text().strip()
|
|
5810
|
+
members.append({
|
|
5811
|
+
"member": member_file.stem,
|
|
5812
|
+
"content": content
|
|
5813
|
+
})
|
|
5814
|
+
except Exception:
|
|
5815
|
+
pass
|
|
5816
|
+
verdict_detail["members"] = members
|
|
5817
|
+
# Read contrarian
|
|
5818
|
+
contrarian_file = vote_dir / "contrarian.txt"
|
|
5819
|
+
if contrarian_file.exists():
|
|
5820
|
+
verdict_detail["contrarian"] = contrarian_file.read_text().strip()
|
|
5821
|
+
detailed_verdicts.append(verdict_detail)
|
|
5822
|
+
if len(detailed_verdicts) >= limit:
|
|
5823
|
+
break
|
|
5824
|
+
|
|
5825
|
+
return {"verdicts": verdicts, "details": detailed_verdicts}
|
|
5826
|
+
|
|
5827
|
+
return await asyncio.to_thread(_collect_verdicts)
|
|
5777
5828
|
|
|
5778
5829
|
|
|
5779
5830
|
@app.get("/api/council/convergence")
|
|
@@ -5848,35 +5899,41 @@ async def get_council_transcripts(
|
|
|
5848
5899
|
if not transcripts_dir.exists():
|
|
5849
5900
|
response: dict = {"transcripts": [], "total": 0, "latest_id": None}
|
|
5850
5901
|
if type_prefix:
|
|
5851
|
-
response["hook_events"] = _read_events
|
|
5902
|
+
response["hook_events"] = await asyncio.to_thread(_read_events, type_prefix=type_prefix)
|
|
5852
5903
|
return response
|
|
5853
5904
|
|
|
5854
|
-
|
|
5855
|
-
|
|
5856
|
-
|
|
5857
|
-
|
|
5858
|
-
|
|
5859
|
-
logger.warning("Skipping corrupt council transcript file: %s", f.name)
|
|
5860
|
-
continue
|
|
5861
|
-
if not isinstance(rec, dict):
|
|
5862
|
-
logger.warning("Skipping non-object council transcript file: %s", f.name)
|
|
5863
|
-
continue
|
|
5864
|
-
if not isinstance(rec.get("iteration_id"), str):
|
|
5865
|
-
logger.warning("Skipping transcript missing iteration_id field: %s", f.name)
|
|
5866
|
-
continue
|
|
5867
|
-
if since_dt is not None:
|
|
5868
|
-
ts_str = rec.get("timestamp", "")
|
|
5905
|
+
def _collect_transcript_records() -> list:
|
|
5906
|
+
# Globs + reads up to `limit` (<=200) JSON transcript files; a blocking
|
|
5907
|
+
# multi-file read loop offloaded so the event loop stays responsive.
|
|
5908
|
+
out: list = []
|
|
5909
|
+
for f in sorted(transcripts_dir.glob("iter-*.json"), reverse=True):
|
|
5869
5910
|
try:
|
|
5870
|
-
|
|
5871
|
-
except
|
|
5911
|
+
rec = json.loads(f.read_text())
|
|
5912
|
+
except Exception:
|
|
5913
|
+
logger.warning("Skipping corrupt council transcript file: %s", f.name)
|
|
5872
5914
|
continue
|
|
5873
|
-
if
|
|
5915
|
+
if not isinstance(rec, dict):
|
|
5916
|
+
logger.warning("Skipping non-object council transcript file: %s", f.name)
|
|
5874
5917
|
continue
|
|
5875
|
-
|
|
5876
|
-
|
|
5877
|
-
|
|
5878
|
-
|
|
5879
|
-
|
|
5918
|
+
if not isinstance(rec.get("iteration_id"), str):
|
|
5919
|
+
logger.warning("Skipping transcript missing iteration_id field: %s", f.name)
|
|
5920
|
+
continue
|
|
5921
|
+
if since_dt is not None:
|
|
5922
|
+
ts_str = rec.get("timestamp", "")
|
|
5923
|
+
try:
|
|
5924
|
+
ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
|
|
5925
|
+
except (ValueError, AttributeError):
|
|
5926
|
+
continue
|
|
5927
|
+
if ts <= since_dt:
|
|
5928
|
+
continue
|
|
5929
|
+
if iter_min is not None and rec.get("iteration", 0) < iter_min:
|
|
5930
|
+
continue
|
|
5931
|
+
out.append(rec)
|
|
5932
|
+
if len(out) >= limit:
|
|
5933
|
+
break
|
|
5934
|
+
return out
|
|
5935
|
+
|
|
5936
|
+
records = await asyncio.to_thread(_collect_transcript_records)
|
|
5880
5937
|
|
|
5881
5938
|
response = {
|
|
5882
5939
|
"transcripts": records,
|
|
@@ -5885,7 +5942,7 @@ async def get_council_transcripts(
|
|
|
5885
5942
|
}
|
|
5886
5943
|
# v7.5.22 Phase D: opt-in hook-event passthrough via _read_events filter.
|
|
5887
5944
|
if type_prefix:
|
|
5888
|
-
response["hook_events"] = _read_events
|
|
5945
|
+
response["hook_events"] = await asyncio.to_thread(_read_events, type_prefix=type_prefix)
|
|
5889
5946
|
return response
|
|
5890
5947
|
|
|
5891
5948
|
|
|
@@ -6106,7 +6163,16 @@ def _sanitize_checkpoint_id(checkpoint_id: str) -> str:
|
|
|
6106
6163
|
|
|
6107
6164
|
@app.get("/api/checkpoints")
|
|
6108
6165
|
async def list_checkpoints(limit: int = Query(default=20, ge=1, le=200)):
|
|
6109
|
-
"""List recent checkpoints from index.jsonl, enriched with metadata when available.
|
|
6166
|
+
"""List recent checkpoints from index.jsonl, enriched with metadata when available.
|
|
6167
|
+
|
|
6168
|
+
Reads index.jsonl plus a metadata.json and a recursive rglob() file count
|
|
6169
|
+
per checkpoint (a blocking multi-file walk building only local state), so
|
|
6170
|
+
it is offloaded to a thread to keep the event loop responsive.
|
|
6171
|
+
"""
|
|
6172
|
+
return await asyncio.to_thread(_collect_checkpoints, limit)
|
|
6173
|
+
|
|
6174
|
+
|
|
6175
|
+
def _collect_checkpoints(limit: int) -> list:
|
|
6110
6176
|
loki_dir = _get_loki_dir()
|
|
6111
6177
|
index_file = loki_dir / "state" / "checkpoints" / "index.jsonl"
|
|
6112
6178
|
checkpoints_dir = loki_dir / "state" / "checkpoints"
|
|
@@ -6557,17 +6623,18 @@ async def get_logs(lines: int = 100, token: Optional[dict] = Depends(auth.get_cu
|
|
|
6557
6623
|
file_mtime = datetime.fromtimestamp(log_file.stat().st_mtime, tz=timezone.utc).strftime(
|
|
6558
6624
|
"%Y-%m-%dT%H:%M:%S"
|
|
6559
6625
|
)
|
|
6560
|
-
# Read only the tail to avoid loading huge files into memory
|
|
6561
|
-
|
|
6562
|
-
|
|
6563
|
-
|
|
6564
|
-
|
|
6626
|
+
# Read only the tail to avoid loading huge files into memory.
|
|
6627
|
+
# The up-to-1MB blocking read is offloaded to a thread so the
|
|
6628
|
+
# single-worker event loop (status + WS heartbeat) stays free.
|
|
6629
|
+
def _read_log_tail(lf_path=log_file, n=lines) -> list[str]:
|
|
6630
|
+
with open(lf_path, "rb") as lf:
|
|
6565
6631
|
lf.seek(0, 2)
|
|
6566
6632
|
file_size = lf.tell()
|
|
6567
|
-
# Read at most 1MB from the end (plenty for any reasonable lines count)
|
|
6568
6633
|
read_size = min(file_size, 1024 * 1024)
|
|
6569
6634
|
lf.seek(max(0, file_size - read_size))
|
|
6570
|
-
|
|
6635
|
+
return lf.read().decode("utf-8", errors="replace").strip().split("\n")[-n:]
|
|
6636
|
+
try:
|
|
6637
|
+
tail_lines = await asyncio.to_thread(_read_log_tail)
|
|
6571
6638
|
except (OSError, UnicodeDecodeError):
|
|
6572
6639
|
tail_lines = []
|
|
6573
6640
|
for raw_line in tail_lines:
|
|
@@ -7599,18 +7666,397 @@ def _reconcile_app_runner_liveness(state):
|
|
|
7599
7666
|
return state
|
|
7600
7667
|
|
|
7601
7668
|
|
|
7669
|
+
# =============================================================================
|
|
7670
|
+
# Docker-compose app-runner discovery
|
|
7671
|
+
#
|
|
7672
|
+
# When the autonomous agent brings up a docker-compose stack itself (rather than
|
|
7673
|
+
# via autonomy/app-runner.sh), no .loki/app-runner/state.json is written, so the
|
|
7674
|
+
# status endpoint reports "not_initialized" / "stopped" even though the app is
|
|
7675
|
+
# genuinely running. The discovery helper below inspects the live compose stack
|
|
7676
|
+
# for the project directory and synthesizes an equivalent status so the dashboard
|
|
7677
|
+
# App Runner panel surfaces the running app and its URL.
|
|
7678
|
+
#
|
|
7679
|
+
# Safety contract (all mandatory):
|
|
7680
|
+
# - Every docker subprocess.run has an explicit timeout; total work is bounded.
|
|
7681
|
+
# - On ANY error (TimeoutExpired/OSError/SubprocessError/parse failure) the
|
|
7682
|
+
# helper returns None and the caller falls back to its prior behavior. The
|
|
7683
|
+
# handler never raises and never blocks the event loop (it is offloaded via
|
|
7684
|
+
# asyncio.to_thread / run_in_threadpool).
|
|
7685
|
+
# - A short TTL cache prevents the 3s/5s dashboard pollers from spawning
|
|
7686
|
+
# repeated docker invocations.
|
|
7687
|
+
# - A URL is never fabricated for a non-running or non-published container.
|
|
7688
|
+
# =============================================================================
|
|
7689
|
+
|
|
7690
|
+
# Common host ports a web service typically publishes, in precedence order.
|
|
7691
|
+
# Mirrors autonomy/app-runner.sh _identify_compose_web_service (COMMON list).
|
|
7692
|
+
_COMPOSE_COMMON_WEB_PORTS = ["3000", "8000", "8080", "5000", "4200", "5173", "80"]
|
|
7693
|
+
|
|
7694
|
+
# Per-docker-call timeout (seconds). Several calls run in sequence; keep each
|
|
7695
|
+
# tight so total discovery stays bounded well under the poller interval.
|
|
7696
|
+
_COMPOSE_DISCOVERY_CMD_TIMEOUT = 3
|
|
7697
|
+
|
|
7698
|
+
# TTL (seconds) for the discovery result cache, keyed by resolved project dir.
|
|
7699
|
+
# The dashboard polls every 3-5s; a 2.5s TTL collapses a burst of concurrent
|
|
7700
|
+
# pollers onto a single docker probe without making the status feel stale.
|
|
7701
|
+
_COMPOSE_DISCOVERY_TTL_SECONDS = 2.5
|
|
7702
|
+
|
|
7703
|
+
# Cache: {project_dir_str: (expiry_epoch, result_or_None)}. Module-level so it
|
|
7704
|
+
# survives across requests. Guarded by a lock because to_thread runs the sync
|
|
7705
|
+
# helper on worker threads that can overlap.
|
|
7706
|
+
_compose_discovery_cache: dict[str, tuple[float, Optional[dict]]] = {}
|
|
7707
|
+
_compose_discovery_lock = threading.Lock()
|
|
7708
|
+
|
|
7709
|
+
|
|
7710
|
+
def _parse_docker_json(raw):
    """Decode `docker ... --format json` output into a list of dict rows.

    Two output shapes are accepted: one JSON document covering the whole
    output (an array of objects, or a single object), or newline-delimited
    JSON with one object per line.

    The whole text is decoded first. If that fails, or yields something that
    is neither a list nor a dict, every non-blank line is decoded on its own
    and lines that are invalid JSON or not objects are skipped.

    Returns a list of dicts, possibly empty. Falsy input gives an empty list.
    """
    text = (raw or "").strip()
    if not text:
        return []

    # Attempt 1: the complete output is a single JSON document.
    try:
        whole = json.loads(text)
    except (ValueError, TypeError):
        whole = None
    if isinstance(whole, list):
        return [entry for entry in whole if isinstance(entry, dict)]
    if isinstance(whole, dict):
        return [whole]

    # Attempt 2: newline-delimited JSON; keep only the lines that are objects.
    rows = []
    for candidate in (piece.strip() for piece in text.splitlines()):
        if not candidate:
            continue
        try:
            decoded = json.loads(candidate)
        except (ValueError, TypeError):
            continue
        if isinstance(decoded, dict):
            rows.append(decoded)
    return rows
|
|
7742
|
+
|
|
7743
|
+
|
|
7744
|
+
def _run_docker_json(args, cwd=None):
    """Run a docker command and return its parsed JSON rows, or None on failure.

    Args:
        args: argument list that follows `docker` (e.g. ["compose", "ps", ...]).
            It is passed as a list argv, so no shell is involved.
        cwd: optional working directory for the command; converted with str().

    Returns:
        The list of dict rows produced by _parse_docker_json for the command's
        stdout. This is an empty list when stdout is empty or holds no
        decodable JSON object. None is returned when the command could not be
        run (missing binary, invalid arguments), timed out, or exited non-zero,
        so the caller can fail open.
    """
    try:
        proc = subprocess.run(
            ["docker", *args],
            capture_output=True,
            text=True,
            # Decode explicitly as UTF-8 and replace undecodable bytes. With the
            # locale default codec, odd bytes in the output raise
            # UnicodeDecodeError, which is not an OSError/SubprocessError and
            # would escape this helper.
            encoding="utf-8",
            errors="replace",
            timeout=_COMPOSE_DISCOVERY_CMD_TIMEOUT,
            cwd=str(cwd) if cwd else None,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: docker binary missing or not executable.
        # ValueError: invalid argv/cwd (e.g. an embedded NUL byte).
        # SubprocessError: includes TimeoutExpired.
        return None
    if proc.returncode != 0:
        return None
    return _parse_docker_json(proc.stdout)
|
|
7765
|
+
|
|
7766
|
+
|
|
7767
|
+
def _compose_published_ports(container):
    """Return the host ports published by one `docker compose ps` container row.

    The row's "Publishers" value is expected to be a list of dicts such as
    {"PublishedPort": 3000, "TargetPort": 3000, "Protocol": "tcp",
    "URL": "0.0.0.0"}. A PublishedPort of 0 means the port is exposed but not
    published to the host, so it is dropped.

    Args:
        container: one parsed row from `docker compose ps --format json`.

    Returns:
        A list of host port strings in first-seen order, without duplicates
        (the same host port can appear in more than one publisher entry, e.g.
        once per bind address). Malformed input yields an empty list; the
        function does not raise.
    """
    if not isinstance(container, dict):
        return []
    publishers = container.get("Publishers")
    if not isinstance(publishers, list):
        return []

    ports = []
    for entry in publishers:
        if not isinstance(entry, dict):
            continue
        raw_port = entry.get("PublishedPort")
        # bool is an int subclass; a JSON `true` must not turn into port 1.
        if isinstance(raw_port, bool):
            continue
        try:
            number = int(raw_port)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")); Python's json accepts Infinity.
            continue
        # Only a real TCP/UDP port number can back a reachable URL.
        if not 0 < number <= 65535:
            continue
        text = str(number)
        if text not in ports:
            ports.append(text)
    return ports
|
|
7791
|
+
|
|
7792
|
+
|
|
7793
|
+
def _compose_service_labels(svc):
    """Return the labels of a compose-config service as a dict.

    A dict under "labels" is returned unchanged. A list is read as "key=value"
    strings split on the first "="; entries that are not strings or contain no
    "=" are skipped. A missing or falsy value, or any other type, gives {}.
    """
    raw = svc.get("labels") or {}
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, list):
        return {}
    # partition() splits at the first "=" only, so values may contain "=".
    pairs = (
        entry.partition("=")
        for entry in raw
        if isinstance(entry, str) and "=" in entry
    )
    return {key: value for key, _sep, value in pairs}
|
|
7806
|
+
|
|
7807
|
+
|
|
7808
|
+
def _identify_compose_web_service(config_services, running_by_service):
    """Choose the primary web service and the host port to reach it on.

    Precedence (mirrors autonomy/app-runner.sh:431-481):
      1. a service whose labels carry loki.primary=true
      2. a service named "web", then "app"
      3. a service publishing one of the common web ports
         (3000/8000/8080/5000/4200/5173/80), tried in that order
      4. the first service, by sorted name, that has any published port

    Names and labels are the declared ones from `docker compose config`. The
    port always comes from `running_by_service`, so a service that is not
    running with a published port is never selected.

    Args:
        config_services: dict {service_name: service_config_dict}; may be
            empty or None.
        running_by_service: dict {service_name: [published_port_str, ...]} for
            running containers with at least one published host port.

    Returns:
        (service_name, port_str), or (None, None) when nothing qualifies.
    """
    nothing = (None, None)
    if not running_by_service:
        return nothing

    # Rule 1: explicit opt-in through the loki.primary label.
    for svc_name, svc_cfg in (config_services or {}).items():
        if not isinstance(svc_cfg, dict):
            continue
        flag = _compose_service_labels(svc_cfg).get("loki.primary", "")
        if str(flag).lower() != "true":
            continue
        published = running_by_service.get(svc_name)
        if published:
            return (svc_name, published[0])

    # Rule 2: conventional service names.
    for conventional in ("web", "app"):
        published = running_by_service.get(conventional)
        if published:
            return (conventional, published[0])

    # Rule 3: the common port list drives the outer loop, so port precedence
    # wins over service iteration order.
    for common_port in _COMPOSE_COMMON_WEB_PORTS:
        for svc_name, published in running_by_service.items():
            if common_port in published:
                return (svc_name, common_port)

    # Rule 4: any published port; names are sorted to keep the pick stable.
    for svc_name in sorted(running_by_service):
        published = running_by_service[svc_name]
        if published:
            return (svc_name, published[0])

    return nothing
|
|
7857
|
+
|
|
7858
|
+
|
|
7859
|
+
def _container_health_state(container):
    """Map a `docker compose ps` row to 'running', 'starting' or None.

    Decision table over the row's State and Health fields (case-insensitive):
      - State running, Health empty (no healthcheck) or healthy -> 'running'
      - State running, any other Health (unhealthy/starting)    -> 'starting'
        The URL is still surfaced in this case: an app whose page renders but
        whose healthcheck fails is reachable and should show as starting.
      - State created or restarting                             -> 'starting'
      - anything else (exited/dead/paused/removing/unknown)     -> None, so no
        live URL is reported for it.
    """
    lifecycle = str(container.get("State", "")).lower()

    if lifecycle == "running":
        probe = str(container.get("Health", "")).lower()
        return "running" if probe in ("", "healthy") else "starting"

    if lifecycle in ("created", "restarting"):
        return "starting"

    # Stopped or unrecognised states never produce a status.
    return None
|
|
7884
|
+
|
|
7885
|
+
|
|
7886
|
+
def _discover_compose_app_runner_state():
    """Return the discovered compose state for the active project, or None.

    Cached front end for _discover_compose_app_runner_state_uncached. The
    project directory is the resolved parent of the loki dir; its string form
    is the cache key. An unexpired cache entry is returned as is (a cached None
    included). Otherwise the uncached probe runs and its result is stored with
    a fresh expiry of _COMPOSE_DISCOVERY_TTL_SECONDS.

    The lock guards only the cache reads and writes and is not held while the
    probe runs, so callers that miss the cache at the same moment each run
    their own probe.

    Synchronous; the caller is expected to run it on a worker thread. Returns
    None when the project directory cannot be resolved.
    """
    try:
        root = _get_loki_dir().parent.resolve()
    except Exception:
        return None
    key = str(root)

    # Fast path: serve an entry that has not expired yet.
    checked_at = time.monotonic()
    with _compose_discovery_lock:
        entry = _compose_discovery_cache.get(key)
        if entry is not None:
            expires_at, cached_state = entry
            if expires_at > checked_at:
                return cached_state

    # Slow path: probe docker without holding the lock, then publish.
    fresh_state = _discover_compose_app_runner_state_uncached(root)

    with _compose_discovery_lock:
        new_expiry = time.monotonic() + _COMPOSE_DISCOVERY_TTL_SECONDS
        _compose_discovery_cache[key] = (new_expiry, fresh_state)
    return fresh_state
|
|
7916
|
+
|
|
7917
|
+
|
|
7918
|
+
def _discover_compose_app_runner_state_uncached(project_dir):
    """Probe docker for a running compose stack in project_dir; no caching.

    Steps:
      A. require a compose file in project_dir;
      C. list the project's containers with `docker compose ps` (run from
         project_dir) and keep the services that publish a host port;
      D. read declared services from `docker compose config` (best effort) and
         pick the primary web service and port;
      E. classify the primary container's health;
      B/F. record the compose project name and build the state dict.

    Returns the synthesized state dict (source == "discovered"), or None when
    any step finds nothing to report or anything fails. Never raises.
    """
    try:
        # Step A: without a compose file this is a single-process app and
        # discovery does not apply.
        candidates = (
            "docker-compose.yml", "docker-compose.yaml",
            "compose.yml", "compose.yaml",
        )
        has_compose_file = False
        for filename in candidates:
            if (project_dir / filename).is_file():
                has_compose_file = True
                break
        if not has_compose_file:
            return None

        # Step C: containers of THIS project's stack with their runtime
        # published ports. Running from project_dir lets compose resolve the
        # right project, which also covers the project matching of step B.
        containers = _run_docker_json(
            ["compose", "ps", "--format", "json"], cwd=project_dir
        )
        if not containers:
            # None: docker absent / timeout / error -> fail open.
            # Empty: the stack is not up, nothing to show.
            return None

        # Host ports per service, plus the first container row seen for each
        # publishing service (used later for the health classification).
        running_by_service = {}
        container_by_service = {}
        for row in containers:
            svc_name = row.get("Service") or row.get("Name")
            if not svc_name:
                continue
            host_ports = _compose_published_ports(row)
            if not host_ports:
                continue
            known = running_by_service.setdefault(svc_name, [])
            for host_port in host_ports:
                if host_port not in known:
                    known.append(host_port)
            container_by_service.setdefault(svc_name, row)
        if not running_by_service:
            # Stack is up but nothing publishes a host port: no URL to surface.
            return None

        # Step D: declared names/labels for the precedence rules. If the
        # config is unavailable the ps data alone is used.
        config_services = {}
        config_rows = _run_docker_json(
            ["compose", "config", "--format", "json"], cwd=project_dir
        )
        if config_rows:
            declared = config_rows[0].get("services")
            if isinstance(declared, dict):
                config_services = declared

        primary_service, port = _identify_compose_web_service(
            config_services, running_by_service
        )
        if not (primary_service and port):
            return None

        # Step E: health classification from the primary's container row.
        primary_container = container_by_service.get(primary_service)
        if not isinstance(primary_container, dict):
            return None
        health_status = _container_health_state(primary_container)
        if health_status is None:
            # exited/dead/paused/unknown -> do not fabricate a URL.
            return None

        # Step B (best effort): compose project name for the panel. Falls back
        # to the lowercased directory name reduced to alphanumerics.
        compose_project = primary_container.get("Project")
        if not compose_project:
            compose_project = "".join(
                filter(str.isalnum, project_dir.name.lower())
            )

        health_ok = str(primary_container.get("Health", "")).lower() in (
            "", "healthy",
        )

        # Step F: same field names as the UI and app-runner.sh state.json
        # (status/url/port/method/last_health), plus discovery-provenance
        # fields the panel safely ignores.
        state = {
            "status": health_status,
            "url": "http://localhost:{}".format(port),
            "port": int(port),
            "method": "docker compose (detected)",
            "primary_service": primary_service,
            "compose_project": compose_project,
            "source": "discovered",
            "externally_managed": True,
            "last_health": {"ok": health_ok},
        }
        return state
    except Exception:
        # Fail open on anything unexpected; never break the status endpoint.
        return None
|
|
8016
|
+
|
|
8017
|
+
|
|
7602
8018
|
@app.get("/api/app-runner/status")
async def get_app_runner_status():
    """Get app runner current status (with dead-run liveness reconciliation).

    Resolution order:
      1. state.json present AND reconciles to running/starting -> return it (an
         app-runner.sh-managed run is authoritative).
      2. state.json missing OR reconciles to stopped/stale -> attempt
         docker-compose discovery for stacks the autonomous agent launched
         itself; if a running stack is found, return the synthesized state
         (bypassing pid-based liveness reconciliation, which is meaningless for
         externally-launched containers).
      3. otherwise return the existing (possibly reconciled / not_initialized)
         result.

    A state.json that cannot be read or decoded yields {"status": "error"}.

    Discovery runs on a worker thread so its bounded docker calls never block
    the event loop.
    """
    loki_dir = _get_loki_dir()
    state_file = loki_dir / "app-runner" / "state.json"

    if not state_file.exists():
        discovered = await asyncio.to_thread(_discover_compose_app_runner_state)
        if discovered is not None:
            return discovered
        return {"status": "not_initialized"}

    try:
        state = json.loads(state_file.read_text())
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError
        # from read_text() on a file with undecodable bytes. Without it a
        # corrupt state file would surface as an unhandled exception.
        return {"status": "error"}

    reconciled = _reconcile_app_runner_liveness(state)
    if isinstance(reconciled, dict) and reconciled.get("status") in ("running", "starting"):
        # An app-runner.sh-managed run that is still live is authoritative.
        return reconciled

    # State is not live (stopped/stale/other): the agent may have brought up
    # a compose stack outside app-runner.sh. Prefer a live discovered stack.
    discovered = await asyncio.to_thread(_discover_compose_app_runner_state)
    if discovered is not None:
        return discovered
    return reconciled
|
|
7614
8060
|
|
|
7615
8061
|
|
|
7616
8062
|
def _get_log_redactor():
|
|
@@ -7655,8 +8101,12 @@ async def get_app_runner_logs(lines: int = Query(default=100, ge=1, le=1000)):
|
|
|
7655
8101
|
return {"lines": []}
|
|
7656
8102
|
try:
|
|
7657
8103
|
redact = _get_log_redactor()
|
|
7658
|
-
|
|
7659
|
-
|
|
8104
|
+
# Reading + redacting the app log is blocking (the log can be large);
|
|
8105
|
+
# offload so the event loop (status + WS heartbeat) is not stalled.
|
|
8106
|
+
def _read_redacted(p=log_file, n=lines):
|
|
8107
|
+
return [redact(ln) for ln in _safe_read_text(p).splitlines()[-n:]]
|
|
8108
|
+
out_lines = await asyncio.to_thread(_read_redacted)
|
|
8109
|
+
return {"lines": out_lines, "redacted": True}
|
|
7660
8110
|
except OSError:
|
|
7661
8111
|
return {"lines": []}
|
|
7662
8112
|
|
|
@@ -7691,8 +8141,10 @@ async def get_app_runner_errors(lines: int = Query(default=50, ge=1, le=500)):
|
|
|
7691
8141
|
if log_file.exists():
|
|
7692
8142
|
try:
|
|
7693
8143
|
redact = _get_log_redactor()
|
|
7694
|
-
|
|
7695
|
-
|
|
8144
|
+
# Offload the blocking log read + redaction off the event loop.
|
|
8145
|
+
def _read_redacted(p=log_file, n=lines):
|
|
8146
|
+
return [redact(ln) for ln in _safe_read_text(p).splitlines()[-n:]]
|
|
8147
|
+
out_lines = await asyncio.to_thread(_read_redacted)
|
|
7696
8148
|
except OSError:
|
|
7697
8149
|
out_lines = []
|
|
7698
8150
|
|
|
@@ -8410,7 +8862,11 @@ async def get_managed_events(
|
|
|
8410
8862
|
"""
|
|
8411
8863
|
try:
|
|
8412
8864
|
path = _managed_events_path()
|
|
8413
|
-
|
|
8865
|
+
# Tails an ndjson file (rotated at 10MB) via a blocking readlines();
|
|
8866
|
+
# offload so the event loop stays responsive.
|
|
8867
|
+
records = await asyncio.to_thread(
|
|
8868
|
+
_tail_ndjson, path, limit, since, type
|
|
8869
|
+
)
|
|
8414
8870
|
return {
|
|
8415
8871
|
"events": records,
|
|
8416
8872
|
"count": len(records),
|
|
@@ -8433,11 +8889,13 @@ async def get_managed_status():
|
|
|
8433
8889
|
snapshot = _managed_flags_snapshot()
|
|
8434
8890
|
# last_fallback_ts is best-effort from the local events file.
|
|
8435
8891
|
try:
|
|
8436
|
-
|
|
8892
|
+
# Blocking ndjson tail read; offload off the event loop.
|
|
8893
|
+
events = await asyncio.to_thread(
|
|
8894
|
+
_tail_ndjson,
|
|
8437
8895
|
_managed_events_path(),
|
|
8438
|
-
|
|
8439
|
-
|
|
8440
|
-
|
|
8896
|
+
500,
|
|
8897
|
+
None,
|
|
8898
|
+
"managed_agents_fallback",
|
|
8441
8899
|
)
|
|
8442
8900
|
snapshot["last_fallback_ts"] = _last_fallback_ts(events)
|
|
8443
8901
|
except Exception:
|