npm - loki-mode - Versions diffs - 7.56.0 → 7.58.0 - Mend

loki-mode 7.56.0 → 7.58.0

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.

Files changed (19)

package/README.md +1 -1
package/SKILL.md +2 -2
package/VERSION +1 -1
package/autonomy/app-runner.sh +101 -0
package/autonomy/lib/prd-enrich.sh +437 -0
package/autonomy/loki +58 -9
package/autonomy/run.sh +175 -60
package/dashboard/__init__.py +1 -1
package/dashboard/server.py +652 -194
package/dashboard/static/index.html +164 -151
package/docs/INSTALLATION.md +2 -2
package/loki-ts/dist/loki.js +2 -2
package/mcp/__init__.py +1 -1
package/memory/consolidation.py +14 -2
package/memory/retrieval.py +10 -0
package/memory/storage.py +10 -0
package/package.json +1 -1
package/plugins/loki-mode/.claude-plugin/plugin.json +1 -1
package/skills/quality-gates.md +135 -11

package/dashboard/server.py (changed)

@@ -11,6 +11,7 @@ import json
 import logging
 import os
 import subprocess
+import threading
 import time
 from collections import defaultdict
 from dataclasses import asdict
@@ -3864,34 +3865,40 @@ async def get_memory_summary():
 @app.get("/api/memory/episodes")
 async def list_episodes(limit: int = Query(default=50, ge=1, le=1000)):
     """List episodic memory entries."""
-    # Try SQLite backend first
-    storage = _get_memory_storage()
-    if storage is not None:
-        try:
-            ids = storage.list_episodes(limit=limit)
-            episodes = []
-            for eid in ids:
-                ep = storage.load_episode(eid)
-                if ep:
-                    episodes.append(ep)
-            return episodes
-        except Exception:
-            pass
-    # Fallback to JSON files -- use heapq to avoid sorting all files
-    import heapq
-    ep_dir = _get_loki_dir() / "memory" / "episodic"
-    episodes = []
-    if ep_dir.exists():
-        all_files = ep_dir.glob("*.json")
-        # nlargest by filename (timestamps sort lexicographically) avoids full sort
-        files = heapq.nlargest(limit, all_files, key=lambda f: f.name)
-        for f in files:
+    # Both backends below are blocking (SQLite queries / a glob+read loop over
+    # many JSON files) and only build a local list, so offload the whole read
+    # off the event loop to keep status + WS heartbeat responsive.
+    def _load_episodes() -> list:
+        # Try SQLite backend first
+        storage = _get_memory_storage()
+        if storage is not None:
             try:
-                episodes.append(json.loads(f.read_text()))
+                ids = storage.list_episodes(limit=limit)
+                episodes = []
+                for eid in ids:
+                    ep = storage.load_episode(eid)
+                    if ep:
+                        episodes.append(ep)
+                return episodes
             except Exception:
                 pass
-    return episodes
+        # Fallback to JSON files -- use heapq to avoid sorting all files
+        import heapq
+        ep_dir = _get_loki_dir() / "memory" / "episodic"
+        episodes = []
+        if ep_dir.exists():
+            all_files = ep_dir.glob("*.json")
+            # nlargest by filename (timestamps sort lexicographically) avoids full sort
+            files = heapq.nlargest(limit, all_files, key=lambda f: f.name)
+            for f in files:
+                try:
+                    episodes.append(json.loads(f.read_text()))
+                except Exception:
+                    pass
+        return episodes
+    return await asyncio.to_thread(_load_episodes)
 @app.get("/api/memory/episodes/{episode_id}", dependencies=[Depends(auth.require_scope("read"))])
@@ -3968,30 +3975,35 @@ async def get_pattern(pattern_id: str):
 @app.get("/api/memory/skills")
 async def list_skills():
     """List procedural skills."""
-    # Try SQLite first
-    storage = _get_memory_storage()
-    if storage is not None:
-        try:
-            ids = storage.list_skills()
-            skills = []
-            for sid in ids:
-                s = storage.load_skill(sid)
-                if s:
-                    skills.append(s)
-            return skills
-        except Exception:
-            pass
-    # Fallback to JSON
-    skills_dir = _get_loki_dir() / "memory" / "skills"
-    skills = []
-    if skills_dir.exists():
-        for f in sorted(skills_dir.glob("*.json")):
+    # Blocking SQLite query / glob+read loop; offload the whole read so the
+    # event loop (status + WS heartbeat) stays responsive.
+    def _load_skills() -> list:
+        # Try SQLite first
+        storage = _get_memory_storage()
+        if storage is not None:
             try:
-                skills.append(json.loads(f.read_text()))
+                ids = storage.list_skills()
+                skills = []
+                for sid in ids:
+                    s = storage.load_skill(sid)
+                    if s:
+                        skills.append(s)
+                return skills
             except Exception:
                 pass
-    return skills
+        # Fallback to JSON
+        skills_dir = _get_loki_dir() / "memory" / "skills"
+        skills = []
+        if skills_dir.exists():
+            for f in sorted(skills_dir.glob("*.json")):
+                try:
+                    skills.append(json.loads(f.read_text()))
+                except Exception:
+                    pass
+        return skills
+    return await asyncio.to_thread(_load_skills)
 @app.get("/api/memory/skills/{skill_id}", dependencies=[Depends(auth.require_scope("read"))])
@@ -4346,15 +4358,16 @@ async def get_memory_file(
         st = target.stat()
     except Exception:
         raise HTTPException(status_code=500, detail="stat failed")
-    truncated = False
+    truncated = st.st_size > _MEMORY_FILE_MAX_BYTES
+    def _read_memory_blob() -> bytes:
+        # Up to a 2 MiB blocking read; offloaded so the single-worker event
+        # loop (and /api/status + WS heartbeat) stays responsive.
+        with open(target, "rb") as fh:
+            return fh.read(_MEMORY_FILE_MAX_BYTES) if truncated else fh.read()
     try:
-        if st.st_size > _MEMORY_FILE_MAX_BYTES:
-            with open(target, "rb") as fh:
-                raw = fh.read(_MEMORY_FILE_MAX_BYTES)
-            truncated = True
-        else:
-            with open(target, "rb") as fh:
-                raw = fh.read()
+        raw = await asyncio.to_thread(_read_memory_blob)
         # Decode as UTF-8 with replacement so we never 500 on a stray byte.
         content = raw.decode("utf-8", errors="replace")
     except HTTPException:
@@ -4432,44 +4445,49 @@ async def search_memory(
 @app.get("/api/memory/stats")
 async def get_memory_stats():
     """Get memory system statistics (counts, size, backend info)."""
-    storage = _get_memory_storage()
-    if storage is not None:
-        try:
-            return storage.get_stats()
-        except Exception:
-            pass
+    # SQLite stats query or a directory-walk over many JSON files; both block,
+    # so offload off the event loop.
+    def _compute_stats() -> dict:
+        storage = _get_memory_storage()
+        if storage is not None:
+            try:
+                return storage.get_stats()
+            except Exception:
+                pass
-    # Fallback: compute stats from JSON files
-    memory_dir = _get_loki_dir() / "memory"
-    ep_count = 0
-    ep_dir = memory_dir / "episodic"
-    if ep_dir.exists():
-        for d in ep_dir.iterdir():
-            if d.is_dir():
-                ep_count += len(list(d.glob("*.json")))
-            elif d.suffix == ".json":
-                ep_count += 1
-    pat_count = 0
-    patterns_file = memory_dir / "semantic" / "patterns.json"
-    if patterns_file.exists():
-        try:
-            data = json.loads(patterns_file.read_text())
-            pat_count = len(data) if isinstance(data, list) else len(data.get("patterns", []))
-        except Exception:
-            pass
+        # Fallback: compute stats from JSON files
+        memory_dir = _get_loki_dir() / "memory"
+        ep_count = 0
+        ep_dir = memory_dir / "episodic"
+        if ep_dir.exists():
+            for d in ep_dir.iterdir():
+                if d.is_dir():
+                    ep_count += len(list(d.glob("*.json")))
+                elif d.suffix == ".json":
+                    ep_count += 1
+        pat_count = 0
+        patterns_file = memory_dir / "semantic" / "patterns.json"
+        if patterns_file.exists():
+            try:
+                data = json.loads(patterns_file.read_text())
+                pat_count = len(data) if isinstance(data, list) else len(data.get("patterns", []))
+            except Exception:
+                pass
-    skill_count = 0
-    skills_dir = memory_dir / "skills"
-    if skills_dir.exists():
-        skill_count = len(list(skills_dir.glob("*.json")))
+        skill_count = 0
+        skills_dir = memory_dir / "skills"
+        if skills_dir.exists():
+            skill_count = len(list(skills_dir.glob("*.json")))
-    return {
-        "backend": "json",
-        "episode_count": ep_count,
-        "pattern_count": pat_count,
-        "skill_count": skill_count,
-    }
+        return {
+            "backend": "json",
+            "episode_count": ep_count,
+            "pattern_count": pat_count,
+            "skill_count": skill_count,
+        }
+    return await asyncio.to_thread(_compute_stats)
 # Learning/metrics endpoints
@@ -4515,10 +4533,10 @@ async def get_learning_metrics(
     source: Optional[str] = None,
 ):
     """Get learning metrics from events, metrics files, and learning signals."""
-    events = _read_events(timeRange)
+    events = await asyncio.to_thread(_read_events, timeRange)
     # Also read from learning signals directory
-    all_signals = _read_learning_signals(limit=10000)
+    all_signals = await asyncio.to_thread(_read_learning_signals, limit=10000)
     # Filter by type and source
     if signalType:
@@ -4595,7 +4613,7 @@ async def get_learning_trends(
     source: Optional[str] = None,
 ):
     """Get learning trend data."""
-    events = _read_events(timeRange)
+    events = await asyncio.to_thread(_read_events, timeRange)
     # Group by hour for trend data
     by_hour: dict = {}
     for e in events:
@@ -4617,14 +4635,14 @@ async def get_learning_signals(
     offset: int = Query(default=0, ge=0),
 ):
     """Get raw learning signals from both events.jsonl and learning signals directory."""
-    events = _read_events(timeRange)
+    events = await asyncio.to_thread(_read_events, timeRange)
     if signalType:
         events = [e for e in events if e.get("type") == signalType]
     if source:
         events = [e for e in events if e.get("data", {}).get("source") == source]
     # Also read from learning signals directory
-    file_signals = _read_learning_signals(signal_type=signalType, limit=10000)
+    file_signals = await asyncio.to_thread(_read_learning_signals, signal_type=signalType, limit=10000)
     if source:
         file_signals = [s for s in file_signals if s.get("source") == source]
@@ -4648,10 +4666,10 @@ async def get_learning_aggregation():
             pass
     # Supplement with live data from learning signals directory
-    success_signals = _read_learning_signals(signal_type="success_pattern", limit=500)
-    tool_signals = _read_learning_signals(signal_type="tool_efficiency", limit=500)
-    error_signals = _read_learning_signals(signal_type="error_pattern", limit=500)
-    pref_signals = _read_learning_signals(signal_type="user_preference", limit=500)
+    success_signals = await asyncio.to_thread(_read_learning_signals, signal_type="success_pattern", limit=500)
+    tool_signals = await asyncio.to_thread(_read_learning_signals, signal_type="tool_efficiency", limit=500)
+    error_signals = await asyncio.to_thread(_read_learning_signals, signal_type="error_pattern", limit=500)
+    pref_signals = await asyncio.to_thread(_read_learning_signals, signal_type="user_preference", limit=500)
     # Merge success patterns from signals if aggregation file had none
     if not result.get("success_patterns") and success_signals:
@@ -4725,6 +4743,14 @@ async def trigger_aggregation():
     if not _read_limiter.check("learning_aggregate"):
         raise HTTPException(status_code=429, detail="Rate limit exceeded")
+    # Reads up to 10 MB of events.jsonl, parses every line, then writes the
+    # aggregation.json metrics file. All blocking, all on local state +
+    # filesystem (no shared in-memory state), so offload the whole computation
+    # to a thread to keep the event loop (status + WS heartbeat) responsive.
+    return await asyncio.to_thread(_compute_learning_aggregation)
+def _compute_learning_aggregation() -> dict:
     events_file = _get_loki_dir() / "events.jsonl"
     preferences: dict = {}
     error_patterns: dict = {}
@@ -4820,10 +4846,10 @@ async def trigger_aggregation():
 @app.get("/api/learning/preferences", dependencies=[Depends(auth.require_scope("read"))])
 async def get_learning_preferences(limit: int = Query(default=50, ge=1, le=1000)):
     """Get aggregated user preferences from events and learning signals directory."""
-    events = _read_events("30d")
+    events = await asyncio.to_thread(_read_events, "30d")
     prefs = [e for e in events if e.get("type") == "user_preference"]
     # Also read from learning signals directory
-    file_prefs = _read_learning_signals(signal_type="user_preference", limit=limit)
+    file_prefs = await asyncio.to_thread(_read_learning_signals, signal_type="user_preference", limit=limit)
     combined = prefs + file_prefs
     combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
     return combined[:limit]
@@ -4832,10 +4858,10 @@ async def get_learning_preferences(limit: int = Query(default=50, ge=1, le=1000)
 @app.get("/api/learning/errors", dependencies=[Depends(auth.require_scope("read"))])
 async def get_learning_errors(limit: int = Query(default=50, ge=1, le=1000)):
     """Get aggregated error patterns from events and learning signals directory."""
-    events = _read_events("30d")
+    events = await asyncio.to_thread(_read_events, "30d")
     errors = [e for e in events if e.get("type") == "error_pattern"]
     # Also read from learning signals directory
-    file_errors = _read_learning_signals(signal_type="error_pattern", limit=limit)
+    file_errors = await asyncio.to_thread(_read_learning_signals, signal_type="error_pattern", limit=limit)
     combined = errors + file_errors
     combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
     return combined[:limit]
@@ -4844,10 +4870,10 @@ async def get_learning_errors(limit: int = Query(default=50, ge=1, le=1000)):
 @app.get("/api/learning/success", dependencies=[Depends(auth.require_scope("read"))])
 async def get_learning_success(limit: int = Query(default=50, ge=1, le=1000)):
     """Get aggregated success patterns from events and learning signals directory."""
-    events = _read_events("30d")
+    events = await asyncio.to_thread(_read_events, "30d")
     successes = [e for e in events if e.get("type") == "success_pattern"]
     # Also read from learning signals directory
-    file_successes = _read_learning_signals(signal_type="success_pattern", limit=limit)
+    file_successes = await asyncio.to_thread(_read_learning_signals, signal_type="success_pattern", limit=limit)
     combined = successes + file_successes
     combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
     return combined[:limit]
@@ -4856,10 +4882,10 @@ async def get_learning_success(limit: int = Query(default=50, ge=1, le=1000)):
 @app.get("/api/learning/tools", dependencies=[Depends(auth.require_scope("read"))])
 async def get_tool_efficiency(limit: int = Query(default=50, ge=1, le=1000)):
     """Get tool efficiency rankings from events and learning signals directory."""
-    events = _read_events("30d")
+    events = await asyncio.to_thread(_read_events, "30d")
     tools = [e for e in events if e.get("type") == "tool_efficiency"]
     # Also read from learning signals directory
-    file_tools = _read_learning_signals(signal_type="tool_efficiency", limit=limit)
+    file_tools = await asyncio.to_thread(_read_learning_signals, signal_type="tool_efficiency", limit=limit)
     combined = tools + file_tools
     combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
     return combined[:limit]
@@ -5203,7 +5229,16 @@ def _calculate_model_cost(model: str, input_tokens: int, output_tokens: int) ->
 @app.get("/api/cost")
 async def get_cost():
-    """Get cost visibility data from .loki/metrics/efficiency/ and budget.json."""
+    """Get cost visibility data from .loki/metrics/efficiency/ and budget.json.
+    The computation globs + reads every per-iteration efficiency JSON file
+    (a blocking multi-file read loop building only local aggregates), so it is
+    offloaded to a thread to keep the event loop responsive.
+    """
+    return await asyncio.to_thread(_compute_cost_snapshot)
+def _compute_cost_snapshot() -> dict:
     loki_dir = _get_loki_dir()
     efficiency_dir = loki_dir / "metrics" / "efficiency"
     budget_file = loki_dir / "metrics" / "budget.json"
@@ -5470,7 +5505,15 @@ async def get_cost_timeline():
     classifies into ok/warn/exceeded so the UI can warn at 80% before the cap.
     Cost is never fabricated: when nothing was recorded, cost_recorded is False
     and totals are honestly null rather than a misleading $0.00.
+    Globs + reads every efficiency iteration file and every proof.json (a
+    blocking multi-file read loop building only local state), so it is offloaded
+    to a thread to keep the event loop responsive.
     """
+    return await asyncio.to_thread(_compute_cost_timeline)
+def _compute_cost_timeline() -> dict:
     loki_dir = _get_loki_dir()
     efficiency_dir = loki_dir / "metrics" / "efficiency"
@@ -5729,51 +5772,59 @@ async def get_council_state():
 @app.get("/api/council/verdicts")
 async def get_council_verdicts(limit: int = Query(default=20, ge=1, le=1000)):
-    """Get council vote history (decision log)."""
-    state_file = _get_loki_dir() / "council" / "state.json"
-    verdicts = []
-    if state_file.exists():
-        try:
-            state = json.loads(state_file.read_text())
-            verdicts = state.get("verdicts", [])
-        except Exception:
-            pass
+    """Get council vote history (decision log).
-    # Also read individual vote files for detail
-    votes_dir = _get_loki_dir() / "council" / "votes"
-    detailed_verdicts = []
-    if votes_dir.exists():
-        for vote_dir in sorted(votes_dir.iterdir(), reverse=True):
-            if vote_dir.is_dir():
-                verdict_detail = {"iteration": vote_dir.name}
-                # Read evidence
-                evidence_file = vote_dir / "evidence.md"
-                if evidence_file.exists():
-                    try:
-                        verdict_detail["evidence_preview"] = evidence_file.read_text()[:500]
-                    except Exception:
-                        verdict_detail["evidence_preview"] = ""
-                # Read member votes
-                members = []
-                for member_file in sorted(vote_dir.glob("member-*.txt")):
-                    try:
-                        content = member_file.read_text().strip()
-                        members.append({
-                            "member": member_file.stem,
-                            "content": content
-                        })
-                    except Exception:
-                        pass
-                verdict_detail["members"] = members
-                # Read contrarian
-                contrarian_file = vote_dir / "contrarian.txt"
-                if contrarian_file.exists():
-                    verdict_detail["contrarian"] = contrarian_file.read_text().strip()
-                detailed_verdicts.append(verdict_detail)
-                if len(detailed_verdicts) >= limit:
-                    break
+    Walks every vote directory and reads its evidence/member/contrarian files
+    (a blocking multi-file read loop building only local state), so it is
+    offloaded to a thread to keep the event loop responsive.
+    """
+    def _collect_verdicts() -> dict:
+        state_file = _get_loki_dir() / "council" / "state.json"
+        verdicts = []
+        if state_file.exists():
+            try:
+                state = json.loads(state_file.read_text())
+                verdicts = state.get("verdicts", [])
+            except Exception:
+                pass
-    return {"verdicts": verdicts, "details": detailed_verdicts}
+        # Also read individual vote files for detail
+        votes_dir = _get_loki_dir() / "council" / "votes"
+        detailed_verdicts = []
+        if votes_dir.exists():
+            for vote_dir in sorted(votes_dir.iterdir(), reverse=True):
+                if vote_dir.is_dir():
+                    verdict_detail = {"iteration": vote_dir.name}
+                    # Read evidence
+                    evidence_file = vote_dir / "evidence.md"
+                    if evidence_file.exists():
+                        try:
+                            verdict_detail["evidence_preview"] = evidence_file.read_text()[:500]
+                        except Exception:
+                            verdict_detail["evidence_preview"] = ""
+                    # Read member votes
+                    members = []
+                    for member_file in sorted(vote_dir.glob("member-*.txt")):
+                        try:
+                            content = member_file.read_text().strip()
+                            members.append({
+                                "member": member_file.stem,
+                                "content": content
+                            })
+                        except Exception:
+                            pass
+                    verdict_detail["members"] = members
+                    # Read contrarian
+                    contrarian_file = vote_dir / "contrarian.txt"
+                    if contrarian_file.exists():
+                        verdict_detail["contrarian"] = contrarian_file.read_text().strip()
+                    detailed_verdicts.append(verdict_detail)
+                    if len(detailed_verdicts) >= limit:
+                        break
+        return {"verdicts": verdicts, "details": detailed_verdicts}
+    return await asyncio.to_thread(_collect_verdicts)
 @app.get("/api/council/convergence")
@@ -5848,35 +5899,41 @@ async def get_council_transcripts(
     if not transcripts_dir.exists():
         response: dict = {"transcripts": [], "total": 0, "latest_id": None}
         if type_prefix:
-            response["hook_events"] = _read_events(type_prefix=type_prefix)
+            response["hook_events"] = await asyncio.to_thread(_read_events, type_prefix=type_prefix)
         return response
-    records = []
-    for f in sorted(transcripts_dir.glob("iter-*.json"), reverse=True):
-        try:
-            rec = json.loads(f.read_text())
-        except Exception:
-            logger.warning("Skipping corrupt council transcript file: %s", f.name)
-            continue
-        if not isinstance(rec, dict):
-            logger.warning("Skipping non-object council transcript file: %s", f.name)
-            continue
-        if not isinstance(rec.get("iteration_id"), str):
-            logger.warning("Skipping transcript missing iteration_id field: %s", f.name)
-            continue
-        if since_dt is not None:
-            ts_str = rec.get("timestamp", "")
+    def _collect_transcript_records() -> list:
+        # Globs + reads up to `limit` (<=200) JSON transcript files; a blocking
+        # multi-file read loop offloaded so the event loop stays responsive.
+        out: list = []
+        for f in sorted(transcripts_dir.glob("iter-*.json"), reverse=True):
             try:
-                ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
-            except (ValueError, AttributeError):
+                rec = json.loads(f.read_text())
+            except Exception:
+                logger.warning("Skipping corrupt council transcript file: %s", f.name)
                 continue
-            if ts <= since_dt:
+            if not isinstance(rec, dict):
+                logger.warning("Skipping non-object council transcript file: %s", f.name)
                 continue
-        if iter_min is not None and rec.get("iteration", 0) < iter_min:
-            continue
-        records.append(rec)
-        if len(records) >= limit:
-            break
+            if not isinstance(rec.get("iteration_id"), str):
+                logger.warning("Skipping transcript missing iteration_id field: %s", f.name)
+                continue
+            if since_dt is not None:
+                ts_str = rec.get("timestamp", "")
+                try:
+                    ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+                except (ValueError, AttributeError):
+                    continue
+                if ts <= since_dt:
+                    continue
+            if iter_min is not None and rec.get("iteration", 0) < iter_min:
+                continue
+            out.append(rec)
+            if len(out) >= limit:
+                break
+        return out
+    records = await asyncio.to_thread(_collect_transcript_records)
     response = {
         "transcripts": records,
@@ -5885,7 +5942,7 @@ async def get_council_transcripts(
     }
     # v7.5.22 Phase D: opt-in hook-event passthrough via _read_events filter.
     if type_prefix:
-        response["hook_events"] = _read_events(type_prefix=type_prefix)
+        response["hook_events"] = await asyncio.to_thread(_read_events, type_prefix=type_prefix)
     return response
@@ -6106,7 +6163,16 @@ def _sanitize_checkpoint_id(checkpoint_id: str) -> str:
 @app.get("/api/checkpoints")
 async def list_checkpoints(limit: int = Query(default=20, ge=1, le=200)):
-    """List recent checkpoints from index.jsonl, enriched with metadata when available."""
+    """List recent checkpoints from index.jsonl, enriched with metadata when available.
+    Reads index.jsonl plus a metadata.json and a recursive rglob() file count
+    per checkpoint (a blocking multi-file walk building only local state), so
+    it is offloaded to a thread to keep the event loop responsive.
+    """
+    return await asyncio.to_thread(_collect_checkpoints, limit)
+def _collect_checkpoints(limit: int) -> list:
     loki_dir = _get_loki_dir()
     index_file = loki_dir / "state" / "checkpoints" / "index.jsonl"
     checkpoints_dir = loki_dir / "state" / "checkpoints"
@@ -6557,17 +6623,18 @@ async def get_logs(lines: int = 100, token: Optional[dict] = Depends(auth.get_cu
                 file_mtime = datetime.fromtimestamp(log_file.stat().st_mtime, tz=timezone.utc).strftime(
                     "%Y-%m-%dT%H:%M:%S"
                 )
-                # Read only the tail to avoid loading huge files into memory
-                tail_lines = []
-                try:
-                    with open(log_file, "rb") as lf:
-                        # Seek from end to find enough lines
+                # Read only the tail to avoid loading huge files into memory.
+                # The up-to-1MB blocking read is offloaded to a thread so the
+                # single-worker event loop (status + WS heartbeat) stays free.
+                def _read_log_tail(lf_path=log_file, n=lines) -> list[str]:
+                    with open(lf_path, "rb") as lf:
                         lf.seek(0, 2)
                         file_size = lf.tell()
-                        # Read at most 1MB from the end (plenty for any reasonable lines count)
                         read_size = min(file_size, 1024 * 1024)
                         lf.seek(max(0, file_size - read_size))
-                        tail_lines = lf.read().decode("utf-8", errors="replace").strip().split("\n")[-lines:]
+                        return lf.read().decode("utf-8", errors="replace").strip().split("\n")[-n:]
+                try:
+                    tail_lines = await asyncio.to_thread(_read_log_tail)
                 except (OSError, UnicodeDecodeError):
                     tail_lines = []
                 for raw_line in tail_lines:
@@ -7599,18 +7666,397 @@ def _reconcile_app_runner_liveness(state):
     return state
+# =============================================================================
+# Docker-compose app-runner discovery
+#
+# When the autonomous agent brings up a docker-compose stack itself (rather than
+# via autonomy/app-runner.sh), no .loki/app-runner/state.json is written, so the
+# status endpoint reports "not_initialized" / "stopped" even though the app is
+# genuinely running. The discovery helper below inspects the live compose stack
+# for the project directory and synthesizes an equivalent status so the dashboard
+# App Runner panel surfaces the running app and its URL.
+#
+# Safety contract (all mandatory):
+#   - Every docker subprocess.run has an explicit timeout; total work is bounded.
+#   - On ANY error (TimeoutExpired/OSError/SubprocessError/parse failure) the
+#     helper returns None and the caller falls back to its prior behavior. The
+#     handler never raises and never blocks the event loop (it is offloaded via
+#     asyncio.to_thread / run_in_threadpool).
+#   - A short TTL cache prevents the 3s/5s dashboard pollers from spawning
+#     repeated docker invocations.
+#   - A URL is never fabricated for a non-running or non-published container.
+# =============================================================================
+# Common host ports a web service typically publishes, in precedence order.
+# Mirrors autonomy/app-runner.sh _identify_compose_web_service (COMMON list).
+# Kept as strings because they are matched against the string host ports that
+# _compose_published_ports returns.
+_COMPOSE_COMMON_WEB_PORTS = ["3000", "8000", "8080", "5000", "4200", "5173", "80"]
+# Per-docker-call timeout (seconds). Discovery issues up to two docker calls in
+# sequence (compose ps, then compose config), so a cold probe is bounded by
+# roughly twice this value; keep it tight.
+_COMPOSE_DISCOVERY_CMD_TIMEOUT = 3
+# TTL (seconds) for the discovery result cache, keyed by resolved project dir.
+# The dashboard polls every 3-5s; a 2.5s TTL collapses a burst of concurrent
+# pollers onto a single docker probe without making the status feel stale.
+_COMPOSE_DISCOVERY_TTL_SECONDS = 2.5
+# Cache: {project_dir_str: (expiry_deadline, result_or_None)}. The deadline is a
+# time.monotonic() reading (not a wall-clock epoch), so it must only be compared
+# against time.monotonic(). A cached None is a valid "nothing discovered"
+# result. Module-level so it survives across requests. Guarded by a lock because
+# to_thread runs the sync helper on worker threads that can overlap.
+_compose_discovery_cache: dict[str, tuple[float, Optional[dict]]] = {}
+_compose_discovery_lock = threading.Lock()
+def _parse_docker_json(raw):
+    """Decode docker ``--format json`` output into a list of dict rows.
+    Depending on the docker/compose version the output is either one JSON
+    document (an array of objects, or a single object) or newline-delimited
+    JSON with one object per line. The whole text is decoded first; when that
+    fails, or decodes to something that is neither a list nor a dict, every
+    non-blank line is decoded on its own and the dict results are collected.
+    Non-dict array elements and undecodable lines are dropped silently.
+    Returns a (possibly empty) list of dicts; empty/None input yields [].
+    """
+    text = (raw or "").strip()
+    if not text:
+        return []
+    try:
+        whole = json.loads(text)
+    except (ValueError, TypeError):
+        # Not a single JSON document; the per-line pass below handles NDJSON.
+        whole = None
+    if isinstance(whole, list):
+        return [entry for entry in whole if isinstance(entry, dict)]
+    if isinstance(whole, dict):
+        return [whole]
+    rows = []
+    for candidate in filter(None, (ln.strip() for ln in text.splitlines())):
+        try:
+            decoded = json.loads(candidate)
+        except (ValueError, TypeError):
+            continue
+        if isinstance(decoded, dict):
+            rows.append(decoded)
+    return rows
+def _run_docker_json(args, cwd=None):
+    """Invoke ``docker <args...>`` and return its stdout parsed as JSON rows.
+    args is the argument list AFTER `docker` (e.g. ["compose", "ps", ...]); it
+    is passed as a list argv, never through a shell. cwd, when truthy, is the
+    working directory for the command.
+    Returns None when the command cannot be started, times out (bounded by
+    _COMPOSE_DISCOVERY_CMD_TIMEOUT), or exits non-zero, so the caller can fail
+    open. On a zero exit the result is whatever _parse_docker_json makes of
+    stdout, which is a list that may be empty if no JSON objects were found.
+    """
+    command = ["docker", *args]
+    try:
+        completed = subprocess.run(
+            command,
+            capture_output=True,
+            text=True,
+            timeout=_COMPOSE_DISCOVERY_CMD_TIMEOUT,
+            cwd=str(cwd) if cwd else None,
+        )
+    except (OSError, subprocess.SubprocessError):
+        # Covers a missing docker binary and subprocess.TimeoutExpired alike.
+        return None
+    return _parse_docker_json(completed.stdout) if completed.returncode == 0 else None
+def _compose_published_ports(container):
+    """Host ports actually published by a compose container (compose ps row).
+    `docker compose ps --format json` exposes published ports under the
+    "Publishers" list, each like {"PublishedPort": 3000, "TargetPort": 3000,
+    "Protocol": "tcp", "URL": "0.0.0.0"}. A PublishedPort of 0 means the port is
+    exposed but not published to the host, so it is filtered out.
+    Returns a list of unique host port strings in first-seen order (the same
+    port is commonly listed once per bind address, e.g. IPv4 and IPv6).
+    Entries that are not dicts, or whose PublishedPort is missing, boolean,
+    non-numeric, non-finite, or outside 1..65535 are skipped. Never raises.
+    """
+    publishers = container.get("Publishers") if isinstance(container, dict) else None
+    if not isinstance(publishers, list):
+        return []
+    host_ports = []
+    for entry in publishers:
+        if not isinstance(entry, dict):
+            continue
+        raw_port = entry.get("PublishedPort")
+        # bool is an int subclass; int(True) would fabricate port 1.
+        if isinstance(raw_port, bool):
+            continue
+        try:
+            number = int(raw_port)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: int(float("inf")), reachable via JSON like 1e999.
+            continue
+        if not 0 < number <= 65535:
+            continue
+        port_text = str(number)
+        if port_text not in host_ports:
+            host_ports.append(port_text)
+    return host_ports
+def _compose_service_labels(svc):
+    """Return a compose-config service's labels as a dict. Never raises.
+    A dict under "labels" is returned as-is. A list is treated as "key=value"
+    strings split on the first "="; items that are not strings or contain no
+    "=" are ignored, and a repeated key keeps its last value. A missing/empty
+    "labels" entry, or any other type, yields an empty dict.
+    """
+    raw_labels = svc.get("labels") or {}
+    if isinstance(raw_labels, dict):
+        return raw_labels
+    if not isinstance(raw_labels, list):
+        return {}
+    pairs = (
+        entry.partition("=")
+        for entry in raw_labels
+        if isinstance(entry, str) and "=" in entry
+    )
+    return {key: value for key, _sep, value in pairs}
+def _identify_compose_web_service(config_services, running_by_service):
+    """Choose the primary web service and the host port to surface for it.
+    Precedence (mirrors autonomy/app-runner.sh:431-481):
+      (1) a service whose declared labels set loki.primary=true
+      (2) a service named "web", then "app"
+      (3) a service publishing a common web port, tried in the order given by
+          _COMPOSE_COMMON_WEB_PORTS
+      (4) the alphabetically first service with any published port
+    Labels come from `docker compose config`; ports come only from
+    running_by_service, so a service that matches by label or name but has no
+    published port there is passed over.
+    config_services: dict {service_name: service_config_dict} (may be empty
+        or None).
+    running_by_service: dict {service_name: [published_port_str, ...]}.
+    Returns (service_name, port_str), or (None, None) when nothing qualifies.
+    """
+    nothing = (None, None)
+    if not running_by_service:
+        return nothing
+    # (1) explicit opt-in via the loki.primary label in the compose config.
+    for svc_name, svc_cfg in (config_services or {}).items():
+        if not isinstance(svc_cfg, dict):
+            continue
+        flag = str(_compose_service_labels(svc_cfg).get("loki.primary", "")).lower()
+        if flag != "true":
+            continue
+        published = running_by_service.get(svc_name)
+        if published:
+            return (svc_name, published[0])
+    # (2) conventional service names.
+    for conventional in ("web", "app"):
+        published = running_by_service.get(conventional)
+        if published:
+            return (conventional, published[0])
+    # (3) a well-known web port; the port list order outranks service order.
+    for common_port in _COMPOSE_COMMON_WEB_PORTS:
+        for svc_name, published in running_by_service.items():
+            if common_port in published:
+                return (svc_name, common_port)
+    # (4) anything with a published port; sorted so the pick is deterministic.
+    for svc_name in sorted(running_by_service):
+        published = running_by_service[svc_name]
+        if published:
+            return (svc_name, published[0])
+    return nothing
+def _container_health_state(container):
+    """Classify a compose container into 'running' | 'starting' | None.
+    Reads the State and Health fields of a `docker compose ps` row:
+      - State running + Health healthy or empty (no healthcheck) -> 'running'
+      - State running + any other Health (unhealthy/starting) -> 'starting'
+        (still surface the URL: e.g. a Next.js app whose home renders but
+        whose '/' healthcheck fails is reachable and should show as starting,
+        not hidden)
+      - State created/restarting -> 'starting'
+      - State exited/dead/paused/removing, missing, or unrecognized -> None
+        (no live URL to surface; never fabricate one)
+    A missing field and an explicit JSON null are both treated as empty, so a
+    running container with "Health": null counts as having no healthcheck.
+    Matching is case-insensitive and ignores surrounding whitespace.
+    Returns the status string or None. Never raises.
+    """
+    if not isinstance(container, dict):
+        return None
+    # `or ""` (rather than a .get default) so a null value does not turn into
+    # the literal string "none" and get mistaken for a failing healthcheck.
+    state = str(container.get("State") or "").strip().lower()
+    health = str(container.get("Health") or "").strip().lower()
+    if state == "running":
+        return "running" if health in ("", "healthy") else "starting"
+    if state in ("created", "restarting"):
+        return "starting"
+    return None
+def _discover_compose_app_runner_state():
+    """Discover a running docker-compose stack for the active project, or None.
+    Returns a synthesized app-runner state dict (source=="discovered") when the
+    project directory hosts a compose file AND a primary web service is running
+    with a published host port. Returns None in every other case (no compose
+    file, docker absent, nothing running, no published web port, only
+    dead/exited containers, or any error). Synchronous; the caller offloads it
+    onto a worker thread. Never raises.
+    Results (including None) are cached per resolved project dir for
+    _COMPOSE_DISCOVERY_TTL_SECONDS. The probe is single-flight: the cache lock
+    is held while docker is queried, so callers that arrive during a probe wait
+    for it and then reuse its result instead of each spawning their own docker
+    processes. The wait is bounded by the per-call docker timeouts.
+    """
+    try:
+        project_dir = _get_loki_dir().parent.resolve()
+    except Exception:
+        return None
+    cache_key = str(project_dir)
+    with _compose_discovery_lock:
+        cached = _compose_discovery_cache.get(cache_key)
+        # Read the clock only after the lock is acquired: a caller that waited
+        # behind an in-flight probe must judge freshness as of now.
+        if cached is not None and cached[0] > time.monotonic():
+            return cached[1]
+        # The uncached helper catches everything itself, so the lock is always
+        # released and the entry below is always written.
+        result = _discover_compose_app_runner_state_uncached(project_dir)
+        _compose_discovery_cache[cache_key] = (
+            time.monotonic() + _COMPOSE_DISCOVERY_TTL_SECONDS,
+            result,
+        )
+    return result
+def _discover_compose_app_runner_state_uncached(project_dir):
+    """Uncached body of _discover_compose_app_runner_state. Never raises.
+    project_dir: pathlib.Path of the project root (parent of the .loki dir).
+    Runs `docker compose ps` and, if something publishes a host port,
+    `docker compose config` from that directory, then picks the primary web
+    service and classifies its health.
+    Returns the synthesized state dict (status/url/port/method/last_health plus
+    provenance fields) or None when there is no compose file, docker fails, no
+    container publishes a host port, or the primary container is not live.
+    """
+    try:
+        # Step A: a compose file must exist in the project dir, else this is a
+        # single-process app and discovery does not apply.
+        compose_names = (
+            "docker-compose.yml", "docker-compose.yaml",
+            "compose.yml", "compose.yaml",
+        )
+        if not any((project_dir / n).is_file() for n in compose_names):
+            return None
+        # Step C: containers for THIS project's compose stack, with the runtime
+        # published ports. Project matching (step B) is implicit: compose is
+        # run from project_dir, so it resolves that directory's project.
+        ps_rows = _run_docker_json(
+            ["compose", "ps", "--format", "json"], cwd=project_dir
+        )
+        if not ps_rows:
+            # None: docker absent / timeout / error -> fail open.
+            # Empty: no containers for this compose project (not up).
+            return None
+        # Map published services to their host ports, and remember the first
+        # publishing container per service for health classification.
+        running_by_service = {}
+        container_by_service = {}
+        for c in ps_rows:
+            service = c.get("Service") or c.get("Name")
+            if not service:
+                continue
+            ports = _compose_published_ports(c)
+            if not ports:
+                continue
+            known_ports = running_by_service.setdefault(service, [])
+            for p in ports:
+                if p not in known_ports:
+                    known_ports.append(p)
+            container_by_service.setdefault(service, c)
+        if not running_by_service:
+            # Stack is up but nothing publishes a host port: no surfaceable URL.
+            return None
+        # Step D: declared service config (names/labels) for precedence. Best
+        # effort: if config is unavailable we still proceed with ps data alone.
+        config_rows = _run_docker_json(
+            ["compose", "config", "--format", "json"], cwd=project_dir
+        )
+        cfg = config_rows[0] if config_rows else {}
+        svcs = cfg.get("services")
+        config_services = svcs if isinstance(svcs, dict) else {}
+        primary_service, port = _identify_compose_web_service(
+            config_services, running_by_service
+        )
+        if not primary_service or not port:
+            return None
+        # Step E health classification, from the primary's container.
+        primary_container = container_by_service.get(primary_service)
+        if not isinstance(primary_container, dict):
+            return None
+        health_status = _container_health_state(primary_container)
+        if health_status is None:
+            # exited/dead/paused/unknown -> do not fabricate a URL.
+            return None
+        # Step B (best effort): record the compose project name for the panel.
+        # Prefer what docker reports for the container, then the resolved
+        # top-level name from `compose config`, and only then derive it from
+        # the directory name the way Compose v2 normalizes it (lowercase; keep
+        # a-z, 0-9, "_" and "-"; drop leading "_"/"-").
+        declared_name = cfg.get("name")
+        dir_derived_name = "".join(
+            ch for ch in project_dir.name.lower()
+            if ch.isascii() and (ch.isalnum() or ch in "_-")
+        ).lstrip("_-")
+        compose_project = (
+            primary_container.get("Project")
+            or (declared_name if isinstance(declared_name, str) else "")
+            or dir_derived_name
+        )
+        # `or ""` so an explicit null Health reads as "no healthcheck" rather
+        # than the string "none", which would wrongly report ok=False.
+        health_text = str(primary_container.get("Health") or "").lower()
+        health_ok = health_text in ("", "healthy")
+        # Step F: synthesize the state dict using the SAME field names the UI and
+        # app-runner.sh state.json use (status/url/port/method/last_health), plus
+        # discovery-provenance fields the panel safely ignores.
+        return {
+            "status": health_status,
+            "url": "http://localhost:{}".format(port),
+            "port": int(port),
+            "method": "docker compose (detected)",
+            "primary_service": primary_service,
+            "compose_project": compose_project,
+            "source": "discovered",
+            "externally_managed": True,
+            "last_health": {"ok": health_ok},
+        }
+    except Exception:
+        # Fail open on anything unexpected; never break the status endpoint.
+        return None
 @app.get("/api/app-runner/status")
 async def get_app_runner_status():
-    """Get app runner current status (with dead-run liveness reconciliation)."""
+    """Get app runner current status (with dead-run liveness reconciliation).
+    Resolution order:
+      1. state.json present AND reconciles to running/starting -> return it (an
+         app-runner.sh-managed run is authoritative).
+      2. state.json missing OR reconciles to stopped/stale -> attempt
+         docker-compose discovery for stacks the autonomous agent launched
+         itself; if a running stack is found, return the synthesized state
+         (bypassing pid-based liveness reconciliation, which is meaningless for
+         externally-launched containers).
+      3. otherwise return the existing (possibly reconciled / not_initialized)
+         result.
+    A state.json that exists but cannot be read or parsed short-circuits to
+    {"status": "error"}; discovery is not attempted in that case.
+    Discovery runs on a worker thread so its bounded docker calls never block
+    the event loop.
+    """
     loki_dir = _get_loki_dir()
     state_file = loki_dir / "app-runner" / "state.json"
     if not state_file.exists():
+        # No app-runner.sh state at all: a discovered compose stack is the only
+        # possible source of a live status.
+        discovered = await asyncio.to_thread(_discover_compose_app_runner_state)
+        if discovered is not None:
+            return discovered
         return {"status": "not_initialized"}
     try:
         state = json.loads(state_file.read_text())
     except (json.JSONDecodeError, OSError):
         return {"status": "error"}
-    return _reconcile_app_runner_liveness(state)
+    # NOTE(review): _reconcile_app_runner_liveness is defined outside this
+    # chunk; the isinstance guard below suggests it is not assumed to always
+    # return a dict - confirm.
+    reconciled = _reconcile_app_runner_liveness(state)
+    if isinstance(reconciled, dict) and reconciled.get("status") in ("running", "starting"):
+        # An app-runner.sh-managed run that is still live is authoritative.
+        return reconciled
+    # The recorded run is not live (stopped/stale/other): the agent may have
+    # brought up a compose stack outside app-runner.sh. Prefer a live
+    # discovered stack, and fall back to the reconciled state otherwise.
+    discovered = await asyncio.to_thread(_discover_compose_app_runner_state)
+    if discovered is not None:
+        return discovered
+    return reconciled
 def _get_log_redactor():
@@ -7655,8 +8101,12 @@ async def get_app_runner_logs(lines: int = Query(default=100, ge=1, le=1000)):
         return {"lines": []}
     try:
         redact = _get_log_redactor()
-        all_lines = _safe_read_text(log_file).splitlines()
-        return {"lines": [redact(ln) for ln in all_lines[-lines:]], "redacted": True}
+        # Reading + redacting the app log is blocking (the log can be large);
+        # offload so the event loop (status + WS heartbeat) is not stalled.
+        def _read_redacted(p=log_file, n=lines):
+            return [redact(ln) for ln in _safe_read_text(p).splitlines()[-n:]]
+        out_lines = await asyncio.to_thread(_read_redacted)
+        return {"lines": out_lines, "redacted": True}
     except OSError:
         return {"lines": []}
@@ -7691,8 +8141,10 @@ async def get_app_runner_errors(lines: int = Query(default=50, ge=1, le=500)):
     if log_file.exists():
         try:
             redact = _get_log_redactor()
-            all_lines = _safe_read_text(log_file).splitlines()
-            out_lines = [redact(ln) for ln in all_lines[-lines:]]
+            # Offload the blocking log read + redaction off the event loop.
+            def _read_redacted(p=log_file, n=lines):
+                return [redact(ln) for ln in _safe_read_text(p).splitlines()[-n:]]
+            out_lines = await asyncio.to_thread(_read_redacted)
         except OSError:
             out_lines = []
@@ -8410,7 +8862,11 @@ async def get_managed_events(
     """
     try:
         path = _managed_events_path()
-        records = _tail_ndjson(path, limit=limit, since_iso=since, event_type=type)
+        # Tails an ndjson file (rotated at 10MB) via a blocking readlines();
+        # offload so the event loop stays responsive.
+        records = await asyncio.to_thread(
+            _tail_ndjson, path, limit, since, type
+        )
         return {
             "events": records,
             "count": len(records),
@@ -8433,11 +8889,13 @@ async def get_managed_status():
     snapshot = _managed_flags_snapshot()
     # last_fallback_ts is best-effort from the local events file.
     try:
-        events = _tail_ndjson(
+        # Blocking ndjson tail read; offload off the event loop.
+        events = await asyncio.to_thread(
+            _tail_ndjson,
             _managed_events_path(),
-            limit=500,
-            since_iso=None,
-            event_type="managed_agents_fallback",
+            500,
+            None,
+            "managed_agents_fallback",
         )
         snapshot["last_fallback_ts"] = _last_fallback_ts(events)
     except Exception: