loki-mode 7.56.0 → 7.58.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ import json
11
11
  import logging
12
12
  import os
13
13
  import subprocess
14
+ import threading
14
15
  import time
15
16
  from collections import defaultdict
16
17
  from dataclasses import asdict
@@ -3864,34 +3865,40 @@ async def get_memory_summary():
3864
3865
  @app.get("/api/memory/episodes")
3865
3866
  async def list_episodes(limit: int = Query(default=50, ge=1, le=1000)):
3866
3867
  """List episodic memory entries."""
3867
- # Try SQLite backend first
3868
- storage = _get_memory_storage()
3869
- if storage is not None:
3870
- try:
3871
- ids = storage.list_episodes(limit=limit)
3872
- episodes = []
3873
- for eid in ids:
3874
- ep = storage.load_episode(eid)
3875
- if ep:
3876
- episodes.append(ep)
3877
- return episodes
3878
- except Exception:
3879
- pass
3880
-
3881
- # Fallback to JSON files -- use heapq to avoid sorting all files
3882
- import heapq
3883
- ep_dir = _get_loki_dir() / "memory" / "episodic"
3884
- episodes = []
3885
- if ep_dir.exists():
3886
- all_files = ep_dir.glob("*.json")
3887
- # nlargest by filename (timestamps sort lexicographically) avoids full sort
3888
- files = heapq.nlargest(limit, all_files, key=lambda f: f.name)
3889
- for f in files:
3868
+ # Both backends below are blocking (SQLite queries / a glob+read loop over
3869
+ # many JSON files) and only build a local list, so offload the whole read
3870
+ # off the event loop to keep status + WS heartbeat responsive.
3871
+ def _load_episodes() -> list:
3872
+ # Try SQLite backend first
3873
+ storage = _get_memory_storage()
3874
+ if storage is not None:
3890
3875
  try:
3891
- episodes.append(json.loads(f.read_text()))
3876
+ ids = storage.list_episodes(limit=limit)
3877
+ episodes = []
3878
+ for eid in ids:
3879
+ ep = storage.load_episode(eid)
3880
+ if ep:
3881
+ episodes.append(ep)
3882
+ return episodes
3892
3883
  except Exception:
3893
3884
  pass
3894
- return episodes
3885
+
3886
+ # Fallback to JSON files -- use heapq to avoid sorting all files
3887
+ import heapq
3888
+ ep_dir = _get_loki_dir() / "memory" / "episodic"
3889
+ episodes = []
3890
+ if ep_dir.exists():
3891
+ all_files = ep_dir.glob("*.json")
3892
+ # nlargest by filename (timestamps sort lexicographically) avoids full sort
3893
+ files = heapq.nlargest(limit, all_files, key=lambda f: f.name)
3894
+ for f in files:
3895
+ try:
3896
+ episodes.append(json.loads(f.read_text()))
3897
+ except Exception:
3898
+ pass
3899
+ return episodes
3900
+
3901
+ return await asyncio.to_thread(_load_episodes)
3895
3902
 
3896
3903
 
3897
3904
  @app.get("/api/memory/episodes/{episode_id}", dependencies=[Depends(auth.require_scope("read"))])
@@ -3968,30 +3975,35 @@ async def get_pattern(pattern_id: str):
3968
3975
  @app.get("/api/memory/skills")
3969
3976
  async def list_skills():
3970
3977
  """List procedural skills."""
3971
- # Try SQLite first
3972
- storage = _get_memory_storage()
3973
- if storage is not None:
3974
- try:
3975
- ids = storage.list_skills()
3976
- skills = []
3977
- for sid in ids:
3978
- s = storage.load_skill(sid)
3979
- if s:
3980
- skills.append(s)
3981
- return skills
3982
- except Exception:
3983
- pass
3984
-
3985
- # Fallback to JSON
3986
- skills_dir = _get_loki_dir() / "memory" / "skills"
3987
- skills = []
3988
- if skills_dir.exists():
3989
- for f in sorted(skills_dir.glob("*.json")):
3978
+ # Blocking SQLite query / glob+read loop; offload the whole read so the
3979
+ # event loop (status + WS heartbeat) stays responsive.
3980
+ def _load_skills() -> list:
3981
+ # Try SQLite first
3982
+ storage = _get_memory_storage()
3983
+ if storage is not None:
3990
3984
  try:
3991
- skills.append(json.loads(f.read_text()))
3985
+ ids = storage.list_skills()
3986
+ skills = []
3987
+ for sid in ids:
3988
+ s = storage.load_skill(sid)
3989
+ if s:
3990
+ skills.append(s)
3991
+ return skills
3992
3992
  except Exception:
3993
3993
  pass
3994
- return skills
3994
+
3995
+ # Fallback to JSON
3996
+ skills_dir = _get_loki_dir() / "memory" / "skills"
3997
+ skills = []
3998
+ if skills_dir.exists():
3999
+ for f in sorted(skills_dir.glob("*.json")):
4000
+ try:
4001
+ skills.append(json.loads(f.read_text()))
4002
+ except Exception:
4003
+ pass
4004
+ return skills
4005
+
4006
+ return await asyncio.to_thread(_load_skills)
3995
4007
 
3996
4008
 
3997
4009
  @app.get("/api/memory/skills/{skill_id}", dependencies=[Depends(auth.require_scope("read"))])
@@ -4346,15 +4358,16 @@ async def get_memory_file(
4346
4358
  st = target.stat()
4347
4359
  except Exception:
4348
4360
  raise HTTPException(status_code=500, detail="stat failed")
4349
- truncated = False
4361
+ truncated = st.st_size > _MEMORY_FILE_MAX_BYTES
4362
+
4363
+ def _read_memory_blob() -> bytes:
4364
+ # Up to a 2 MiB blocking read; offloaded so the single-worker event
4365
+ # loop (and /api/status + WS heartbeat) stays responsive.
4366
+ with open(target, "rb") as fh:
4367
+ return fh.read(_MEMORY_FILE_MAX_BYTES) if truncated else fh.read()
4368
+
4350
4369
  try:
4351
- if st.st_size > _MEMORY_FILE_MAX_BYTES:
4352
- with open(target, "rb") as fh:
4353
- raw = fh.read(_MEMORY_FILE_MAX_BYTES)
4354
- truncated = True
4355
- else:
4356
- with open(target, "rb") as fh:
4357
- raw = fh.read()
4370
+ raw = await asyncio.to_thread(_read_memory_blob)
4358
4371
  # Decode as UTF-8 with replacement so we never 500 on a stray byte.
4359
4372
  content = raw.decode("utf-8", errors="replace")
4360
4373
  except HTTPException:
@@ -4432,44 +4445,49 @@ async def search_memory(
4432
4445
  @app.get("/api/memory/stats")
4433
4446
  async def get_memory_stats():
4434
4447
  """Get memory system statistics (counts, size, backend info)."""
4435
- storage = _get_memory_storage()
4436
- if storage is not None:
4437
- try:
4438
- return storage.get_stats()
4439
- except Exception:
4440
- pass
4448
+ # SQLite stats query or a directory-walk over many JSON files; both block,
4449
+ # so offload off the event loop.
4450
+ def _compute_stats() -> dict:
4451
+ storage = _get_memory_storage()
4452
+ if storage is not None:
4453
+ try:
4454
+ return storage.get_stats()
4455
+ except Exception:
4456
+ pass
4441
4457
 
4442
- # Fallback: compute stats from JSON files
4443
- memory_dir = _get_loki_dir() / "memory"
4444
- ep_count = 0
4445
- ep_dir = memory_dir / "episodic"
4446
- if ep_dir.exists():
4447
- for d in ep_dir.iterdir():
4448
- if d.is_dir():
4449
- ep_count += len(list(d.glob("*.json")))
4450
- elif d.suffix == ".json":
4451
- ep_count += 1
4452
-
4453
- pat_count = 0
4454
- patterns_file = memory_dir / "semantic" / "patterns.json"
4455
- if patterns_file.exists():
4456
- try:
4457
- data = json.loads(patterns_file.read_text())
4458
- pat_count = len(data) if isinstance(data, list) else len(data.get("patterns", []))
4459
- except Exception:
4460
- pass
4458
+ # Fallback: compute stats from JSON files
4459
+ memory_dir = _get_loki_dir() / "memory"
4460
+ ep_count = 0
4461
+ ep_dir = memory_dir / "episodic"
4462
+ if ep_dir.exists():
4463
+ for d in ep_dir.iterdir():
4464
+ if d.is_dir():
4465
+ ep_count += len(list(d.glob("*.json")))
4466
+ elif d.suffix == ".json":
4467
+ ep_count += 1
4468
+
4469
+ pat_count = 0
4470
+ patterns_file = memory_dir / "semantic" / "patterns.json"
4471
+ if patterns_file.exists():
4472
+ try:
4473
+ data = json.loads(patterns_file.read_text())
4474
+ pat_count = len(data) if isinstance(data, list) else len(data.get("patterns", []))
4475
+ except Exception:
4476
+ pass
4461
4477
 
4462
- skill_count = 0
4463
- skills_dir = memory_dir / "skills"
4464
- if skills_dir.exists():
4465
- skill_count = len(list(skills_dir.glob("*.json")))
4478
+ skill_count = 0
4479
+ skills_dir = memory_dir / "skills"
4480
+ if skills_dir.exists():
4481
+ skill_count = len(list(skills_dir.glob("*.json")))
4466
4482
 
4467
- return {
4468
- "backend": "json",
4469
- "episode_count": ep_count,
4470
- "pattern_count": pat_count,
4471
- "skill_count": skill_count,
4472
- }
4483
+ return {
4484
+ "backend": "json",
4485
+ "episode_count": ep_count,
4486
+ "pattern_count": pat_count,
4487
+ "skill_count": skill_count,
4488
+ }
4489
+
4490
+ return await asyncio.to_thread(_compute_stats)
4473
4491
 
4474
4492
 
4475
4493
  # Learning/metrics endpoints
@@ -4515,10 +4533,10 @@ async def get_learning_metrics(
4515
4533
  source: Optional[str] = None,
4516
4534
  ):
4517
4535
  """Get learning metrics from events, metrics files, and learning signals."""
4518
- events = _read_events(timeRange)
4536
+ events = await asyncio.to_thread(_read_events, timeRange)
4519
4537
 
4520
4538
  # Also read from learning signals directory
4521
- all_signals = _read_learning_signals(limit=10000)
4539
+ all_signals = await asyncio.to_thread(_read_learning_signals, limit=10000)
4522
4540
 
4523
4541
  # Filter by type and source
4524
4542
  if signalType:
@@ -4595,7 +4613,7 @@ async def get_learning_trends(
4595
4613
  source: Optional[str] = None,
4596
4614
  ):
4597
4615
  """Get learning trend data."""
4598
- events = _read_events(timeRange)
4616
+ events = await asyncio.to_thread(_read_events, timeRange)
4599
4617
  # Group by hour for trend data
4600
4618
  by_hour: dict = {}
4601
4619
  for e in events:
@@ -4617,14 +4635,14 @@ async def get_learning_signals(
4617
4635
  offset: int = Query(default=0, ge=0),
4618
4636
  ):
4619
4637
  """Get raw learning signals from both events.jsonl and learning signals directory."""
4620
- events = _read_events(timeRange)
4638
+ events = await asyncio.to_thread(_read_events, timeRange)
4621
4639
  if signalType:
4622
4640
  events = [e for e in events if e.get("type") == signalType]
4623
4641
  if source:
4624
4642
  events = [e for e in events if e.get("data", {}).get("source") == source]
4625
4643
 
4626
4644
  # Also read from learning signals directory
4627
- file_signals = _read_learning_signals(signal_type=signalType, limit=10000)
4645
+ file_signals = await asyncio.to_thread(_read_learning_signals, signal_type=signalType, limit=10000)
4628
4646
  if source:
4629
4647
  file_signals = [s for s in file_signals if s.get("source") == source]
4630
4648
 
@@ -4648,10 +4666,10 @@ async def get_learning_aggregation():
4648
4666
  pass
4649
4667
 
4650
4668
  # Supplement with live data from learning signals directory
4651
- success_signals = _read_learning_signals(signal_type="success_pattern", limit=500)
4652
- tool_signals = _read_learning_signals(signal_type="tool_efficiency", limit=500)
4653
- error_signals = _read_learning_signals(signal_type="error_pattern", limit=500)
4654
- pref_signals = _read_learning_signals(signal_type="user_preference", limit=500)
4669
+ success_signals = await asyncio.to_thread(_read_learning_signals, signal_type="success_pattern", limit=500)
4670
+ tool_signals = await asyncio.to_thread(_read_learning_signals, signal_type="tool_efficiency", limit=500)
4671
+ error_signals = await asyncio.to_thread(_read_learning_signals, signal_type="error_pattern", limit=500)
4672
+ pref_signals = await asyncio.to_thread(_read_learning_signals, signal_type="user_preference", limit=500)
4655
4673
 
4656
4674
  # Merge success patterns from signals if aggregation file had none
4657
4675
  if not result.get("success_patterns") and success_signals:
@@ -4725,6 +4743,14 @@ async def trigger_aggregation():
4725
4743
  if not _read_limiter.check("learning_aggregate"):
4726
4744
  raise HTTPException(status_code=429, detail="Rate limit exceeded")
4727
4745
 
4746
+ # Reads up to 10 MB of events.jsonl, parses every line, then writes the
4747
+ # aggregation.json metrics file. All blocking, all on local state +
4748
+ # filesystem (no shared in-memory state), so offload the whole computation
4749
+ # to a thread to keep the event loop (status + WS heartbeat) responsive.
4750
+ return await asyncio.to_thread(_compute_learning_aggregation)
4751
+
4752
+
4753
+ def _compute_learning_aggregation() -> dict:
4728
4754
  events_file = _get_loki_dir() / "events.jsonl"
4729
4755
  preferences: dict = {}
4730
4756
  error_patterns: dict = {}
@@ -4820,10 +4846,10 @@ async def trigger_aggregation():
4820
4846
  @app.get("/api/learning/preferences", dependencies=[Depends(auth.require_scope("read"))])
4821
4847
  async def get_learning_preferences(limit: int = Query(default=50, ge=1, le=1000)):
4822
4848
  """Get aggregated user preferences from events and learning signals directory."""
4823
- events = _read_events("30d")
4849
+ events = await asyncio.to_thread(_read_events, "30d")
4824
4850
  prefs = [e for e in events if e.get("type") == "user_preference"]
4825
4851
  # Also read from learning signals directory
4826
- file_prefs = _read_learning_signals(signal_type="user_preference", limit=limit)
4852
+ file_prefs = await asyncio.to_thread(_read_learning_signals, signal_type="user_preference", limit=limit)
4827
4853
  combined = prefs + file_prefs
4828
4854
  combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
4829
4855
  return combined[:limit]
@@ -4832,10 +4858,10 @@ async def get_learning_preferences(limit: int = Query(default=50, ge=1, le=1000)
4832
4858
  @app.get("/api/learning/errors", dependencies=[Depends(auth.require_scope("read"))])
4833
4859
  async def get_learning_errors(limit: int = Query(default=50, ge=1, le=1000)):
4834
4860
  """Get aggregated error patterns from events and learning signals directory."""
4835
- events = _read_events("30d")
4861
+ events = await asyncio.to_thread(_read_events, "30d")
4836
4862
  errors = [e for e in events if e.get("type") == "error_pattern"]
4837
4863
  # Also read from learning signals directory
4838
- file_errors = _read_learning_signals(signal_type="error_pattern", limit=limit)
4864
+ file_errors = await asyncio.to_thread(_read_learning_signals, signal_type="error_pattern", limit=limit)
4839
4865
  combined = errors + file_errors
4840
4866
  combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
4841
4867
  return combined[:limit]
@@ -4844,10 +4870,10 @@ async def get_learning_errors(limit: int = Query(default=50, ge=1, le=1000)):
4844
4870
  @app.get("/api/learning/success", dependencies=[Depends(auth.require_scope("read"))])
4845
4871
  async def get_learning_success(limit: int = Query(default=50, ge=1, le=1000)):
4846
4872
  """Get aggregated success patterns from events and learning signals directory."""
4847
- events = _read_events("30d")
4873
+ events = await asyncio.to_thread(_read_events, "30d")
4848
4874
  successes = [e for e in events if e.get("type") == "success_pattern"]
4849
4875
  # Also read from learning signals directory
4850
- file_successes = _read_learning_signals(signal_type="success_pattern", limit=limit)
4876
+ file_successes = await asyncio.to_thread(_read_learning_signals, signal_type="success_pattern", limit=limit)
4851
4877
  combined = successes + file_successes
4852
4878
  combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
4853
4879
  return combined[:limit]
@@ -4856,10 +4882,10 @@ async def get_learning_success(limit: int = Query(default=50, ge=1, le=1000)):
4856
4882
  @app.get("/api/learning/tools", dependencies=[Depends(auth.require_scope("read"))])
4857
4883
  async def get_tool_efficiency(limit: int = Query(default=50, ge=1, le=1000)):
4858
4884
  """Get tool efficiency rankings from events and learning signals directory."""
4859
- events = _read_events("30d")
4885
+ events = await asyncio.to_thread(_read_events, "30d")
4860
4886
  tools = [e for e in events if e.get("type") == "tool_efficiency"]
4861
4887
  # Also read from learning signals directory
4862
- file_tools = _read_learning_signals(signal_type="tool_efficiency", limit=limit)
4888
+ file_tools = await asyncio.to_thread(_read_learning_signals, signal_type="tool_efficiency", limit=limit)
4863
4889
  combined = tools + file_tools
4864
4890
  combined.sort(key=lambda s: s.get("timestamp", ""), reverse=True)
4865
4891
  return combined[:limit]
@@ -5203,7 +5229,16 @@ def _calculate_model_cost(model: str, input_tokens: int, output_tokens: int) ->
5203
5229
 
5204
5230
  @app.get("/api/cost")
5205
5231
  async def get_cost():
5206
- """Get cost visibility data from .loki/metrics/efficiency/ and budget.json."""
5232
+ """Get cost visibility data from .loki/metrics/efficiency/ and budget.json.
5233
+
5234
+ The computation globs + reads every per-iteration efficiency JSON file
5235
+ (a blocking multi-file read loop building only local aggregates), so it is
5236
+ offloaded to a thread to keep the event loop responsive.
5237
+ """
5238
+ return await asyncio.to_thread(_compute_cost_snapshot)
5239
+
5240
+
5241
+ def _compute_cost_snapshot() -> dict:
5207
5242
  loki_dir = _get_loki_dir()
5208
5243
  efficiency_dir = loki_dir / "metrics" / "efficiency"
5209
5244
  budget_file = loki_dir / "metrics" / "budget.json"
@@ -5470,7 +5505,15 @@ async def get_cost_timeline():
5470
5505
  classifies into ok/warn/exceeded so the UI can warn at 80% before the cap.
5471
5506
  Cost is never fabricated: when nothing was recorded, cost_recorded is False
5472
5507
  and totals are honestly null rather than a misleading $0.00.
5508
+
5509
+ Globs + reads every efficiency iteration file and every proof.json (a
5510
+ blocking multi-file read loop building only local state), so it is offloaded
5511
+ to a thread to keep the event loop responsive.
5473
5512
  """
5513
+ return await asyncio.to_thread(_compute_cost_timeline)
5514
+
5515
+
5516
+ def _compute_cost_timeline() -> dict:
5474
5517
  loki_dir = _get_loki_dir()
5475
5518
  efficiency_dir = loki_dir / "metrics" / "efficiency"
5476
5519
 
@@ -5729,51 +5772,59 @@ async def get_council_state():
5729
5772
 
5730
5773
  @app.get("/api/council/verdicts")
5731
5774
  async def get_council_verdicts(limit: int = Query(default=20, ge=1, le=1000)):
5732
- """Get council vote history (decision log)."""
5733
- state_file = _get_loki_dir() / "council" / "state.json"
5734
- verdicts = []
5735
- if state_file.exists():
5736
- try:
5737
- state = json.loads(state_file.read_text())
5738
- verdicts = state.get("verdicts", [])
5739
- except Exception:
5740
- pass
5775
+ """Get council vote history (decision log).
5741
5776
 
5742
- # Also read individual vote files for detail
5743
- votes_dir = _get_loki_dir() / "council" / "votes"
5744
- detailed_verdicts = []
5745
- if votes_dir.exists():
5746
- for vote_dir in sorted(votes_dir.iterdir(), reverse=True):
5747
- if vote_dir.is_dir():
5748
- verdict_detail = {"iteration": vote_dir.name}
5749
- # Read evidence
5750
- evidence_file = vote_dir / "evidence.md"
5751
- if evidence_file.exists():
5752
- try:
5753
- verdict_detail["evidence_preview"] = evidence_file.read_text()[:500]
5754
- except Exception:
5755
- verdict_detail["evidence_preview"] = ""
5756
- # Read member votes
5757
- members = []
5758
- for member_file in sorted(vote_dir.glob("member-*.txt")):
5759
- try:
5760
- content = member_file.read_text().strip()
5761
- members.append({
5762
- "member": member_file.stem,
5763
- "content": content
5764
- })
5765
- except Exception:
5766
- pass
5767
- verdict_detail["members"] = members
5768
- # Read contrarian
5769
- contrarian_file = vote_dir / "contrarian.txt"
5770
- if contrarian_file.exists():
5771
- verdict_detail["contrarian"] = contrarian_file.read_text().strip()
5772
- detailed_verdicts.append(verdict_detail)
5773
- if len(detailed_verdicts) >= limit:
5774
- break
5777
+ Walks every vote directory and reads its evidence/member/contrarian files
5778
+ (a blocking multi-file read loop building only local state), so it is
5779
+ offloaded to a thread to keep the event loop responsive.
5780
+ """
5781
+ def _collect_verdicts() -> dict:
5782
+ state_file = _get_loki_dir() / "council" / "state.json"
5783
+ verdicts = []
5784
+ if state_file.exists():
5785
+ try:
5786
+ state = json.loads(state_file.read_text())
5787
+ verdicts = state.get("verdicts", [])
5788
+ except Exception:
5789
+ pass
5775
5790
 
5776
- return {"verdicts": verdicts, "details": detailed_verdicts}
5791
+ # Also read individual vote files for detail
5792
+ votes_dir = _get_loki_dir() / "council" / "votes"
5793
+ detailed_verdicts = []
5794
+ if votes_dir.exists():
5795
+ for vote_dir in sorted(votes_dir.iterdir(), reverse=True):
5796
+ if vote_dir.is_dir():
5797
+ verdict_detail = {"iteration": vote_dir.name}
5798
+ # Read evidence
5799
+ evidence_file = vote_dir / "evidence.md"
5800
+ if evidence_file.exists():
5801
+ try:
5802
+ verdict_detail["evidence_preview"] = evidence_file.read_text()[:500]
5803
+ except Exception:
5804
+ verdict_detail["evidence_preview"] = ""
5805
+ # Read member votes
5806
+ members = []
5807
+ for member_file in sorted(vote_dir.glob("member-*.txt")):
5808
+ try:
5809
+ content = member_file.read_text().strip()
5810
+ members.append({
5811
+ "member": member_file.stem,
5812
+ "content": content
5813
+ })
5814
+ except Exception:
5815
+ pass
5816
+ verdict_detail["members"] = members
5817
+ # Read contrarian
5818
+ contrarian_file = vote_dir / "contrarian.txt"
5819
+ if contrarian_file.exists():
5820
+ verdict_detail["contrarian"] = contrarian_file.read_text().strip()
5821
+ detailed_verdicts.append(verdict_detail)
5822
+ if len(detailed_verdicts) >= limit:
5823
+ break
5824
+
5825
+ return {"verdicts": verdicts, "details": detailed_verdicts}
5826
+
5827
+ return await asyncio.to_thread(_collect_verdicts)
5777
5828
 
5778
5829
 
5779
5830
  @app.get("/api/council/convergence")
@@ -5848,35 +5899,41 @@ async def get_council_transcripts(
5848
5899
  if not transcripts_dir.exists():
5849
5900
  response: dict = {"transcripts": [], "total": 0, "latest_id": None}
5850
5901
  if type_prefix:
5851
- response["hook_events"] = _read_events(type_prefix=type_prefix)
5902
+ response["hook_events"] = await asyncio.to_thread(_read_events, type_prefix=type_prefix)
5852
5903
  return response
5853
5904
 
5854
- records = []
5855
- for f in sorted(transcripts_dir.glob("iter-*.json"), reverse=True):
5856
- try:
5857
- rec = json.loads(f.read_text())
5858
- except Exception:
5859
- logger.warning("Skipping corrupt council transcript file: %s", f.name)
5860
- continue
5861
- if not isinstance(rec, dict):
5862
- logger.warning("Skipping non-object council transcript file: %s", f.name)
5863
- continue
5864
- if not isinstance(rec.get("iteration_id"), str):
5865
- logger.warning("Skipping transcript missing iteration_id field: %s", f.name)
5866
- continue
5867
- if since_dt is not None:
5868
- ts_str = rec.get("timestamp", "")
5905
+ def _collect_transcript_records() -> list:
5906
+ # Globs + reads up to `limit` (<=200) JSON transcript files; a blocking
5907
+ # multi-file read loop offloaded so the event loop stays responsive.
5908
+ out: list = []
5909
+ for f in sorted(transcripts_dir.glob("iter-*.json"), reverse=True):
5869
5910
  try:
5870
- ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
5871
- except (ValueError, AttributeError):
5911
+ rec = json.loads(f.read_text())
5912
+ except Exception:
5913
+ logger.warning("Skipping corrupt council transcript file: %s", f.name)
5872
5914
  continue
5873
- if ts <= since_dt:
5915
+ if not isinstance(rec, dict):
5916
+ logger.warning("Skipping non-object council transcript file: %s", f.name)
5874
5917
  continue
5875
- if iter_min is not None and rec.get("iteration", 0) < iter_min:
5876
- continue
5877
- records.append(rec)
5878
- if len(records) >= limit:
5879
- break
5918
+ if not isinstance(rec.get("iteration_id"), str):
5919
+ logger.warning("Skipping transcript missing iteration_id field: %s", f.name)
5920
+ continue
5921
+ if since_dt is not None:
5922
+ ts_str = rec.get("timestamp", "")
5923
+ try:
5924
+ ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
5925
+ except (ValueError, AttributeError):
5926
+ continue
5927
+ if ts <= since_dt:
5928
+ continue
5929
+ if iter_min is not None and rec.get("iteration", 0) < iter_min:
5930
+ continue
5931
+ out.append(rec)
5932
+ if len(out) >= limit:
5933
+ break
5934
+ return out
5935
+
5936
+ records = await asyncio.to_thread(_collect_transcript_records)
5880
5937
 
5881
5938
  response = {
5882
5939
  "transcripts": records,
@@ -5885,7 +5942,7 @@ async def get_council_transcripts(
5885
5942
  }
5886
5943
  # v7.5.22 Phase D: opt-in hook-event passthrough via _read_events filter.
5887
5944
  if type_prefix:
5888
- response["hook_events"] = _read_events(type_prefix=type_prefix)
5945
+ response["hook_events"] = await asyncio.to_thread(_read_events, type_prefix=type_prefix)
5889
5946
  return response
5890
5947
 
5891
5948
 
@@ -6106,7 +6163,16 @@ def _sanitize_checkpoint_id(checkpoint_id: str) -> str:
6106
6163
 
6107
6164
  @app.get("/api/checkpoints")
6108
6165
  async def list_checkpoints(limit: int = Query(default=20, ge=1, le=200)):
6109
- """List recent checkpoints from index.jsonl, enriched with metadata when available."""
6166
+ """List recent checkpoints from index.jsonl, enriched with metadata when available.
6167
+
6168
+ Reads index.jsonl plus a metadata.json and a recursive rglob() file count
6169
+ per checkpoint (a blocking multi-file walk building only local state), so
6170
+ it is offloaded to a thread to keep the event loop responsive.
6171
+ """
6172
+ return await asyncio.to_thread(_collect_checkpoints, limit)
6173
+
6174
+
6175
+ def _collect_checkpoints(limit: int) -> list:
6110
6176
  loki_dir = _get_loki_dir()
6111
6177
  index_file = loki_dir / "state" / "checkpoints" / "index.jsonl"
6112
6178
  checkpoints_dir = loki_dir / "state" / "checkpoints"
@@ -6557,17 +6623,18 @@ async def get_logs(lines: int = 100, token: Optional[dict] = Depends(auth.get_cu
6557
6623
  file_mtime = datetime.fromtimestamp(log_file.stat().st_mtime, tz=timezone.utc).strftime(
6558
6624
  "%Y-%m-%dT%H:%M:%S"
6559
6625
  )
6560
- # Read only the tail to avoid loading huge files into memory
6561
- tail_lines = []
6562
- try:
6563
- with open(log_file, "rb") as lf:
6564
- # Seek from end to find enough lines
6626
+ # Read only the tail to avoid loading huge files into memory.
6627
+ # The up-to-1MB blocking read is offloaded to a thread so the
6628
+ # single-worker event loop (status + WS heartbeat) stays free.
6629
+ def _read_log_tail(lf_path=log_file, n=lines) -> list[str]:
6630
+ with open(lf_path, "rb") as lf:
6565
6631
  lf.seek(0, 2)
6566
6632
  file_size = lf.tell()
6567
- # Read at most 1MB from the end (plenty for any reasonable lines count)
6568
6633
  read_size = min(file_size, 1024 * 1024)
6569
6634
  lf.seek(max(0, file_size - read_size))
6570
- tail_lines = lf.read().decode("utf-8", errors="replace").strip().split("\n")[-lines:]
6635
+ return lf.read().decode("utf-8", errors="replace").strip().split("\n")[-n:]
6636
+ try:
6637
+ tail_lines = await asyncio.to_thread(_read_log_tail)
6571
6638
  except (OSError, UnicodeDecodeError):
6572
6639
  tail_lines = []
6573
6640
  for raw_line in tail_lines:
@@ -7599,18 +7666,397 @@ def _reconcile_app_runner_liveness(state):
7599
7666
  return state
7600
7667
 
7601
7668
 
7669
+ # =============================================================================
7670
+ # Docker-compose app-runner discovery
7671
+ #
7672
+ # When the autonomous agent brings up a docker-compose stack itself (rather than
7673
+ # via autonomy/app-runner.sh), no .loki/app-runner/state.json is written, so the
7674
+ # status endpoint reports "not_initialized" / "stopped" even though the app is
7675
+ # genuinely running. The discovery helper below inspects the live compose stack
7676
+ # for the project directory and synthesizes an equivalent status so the dashboard
7677
+ # App Runner panel surfaces the running app and its URL.
7678
+ #
7679
+ # Safety contract (all mandatory):
7680
+ # - Every docker subprocess.run has an explicit timeout; total work is bounded.
7681
+ # - On ANY error (TimeoutExpired/OSError/SubprocessError/parse failure) the
7682
+ # helper returns None and the caller falls back to its prior behavior. The
7683
+ # handler never raises and never blocks the event loop (it is offloaded via
7684
+ # asyncio.to_thread / run_in_threadpool).
7685
+ # - A short TTL cache prevents the 3s/5s dashboard pollers from spawning
7686
+ # repeated docker invocations.
7687
+ # - A URL is never fabricated for a non-running or non-published container.
7688
+ # =============================================================================
7689
+
7690
# Host ports a web service conventionally publishes, listed in precedence
# order (earlier entries win when several services publish a common port).
# Described upstream as mirroring the COMMON list used by
# autonomy/app-runner.sh _identify_compose_web_service.
_COMPOSE_COMMON_WEB_PORTS = [
    "3000",
    "8000",
    "8080",
    "5000",
    "4200",
    "5173",
    "80",
]

# Timeout, in seconds, applied to every individual docker invocation made by
# discovery. Discovery issues its calls one after another (compose ps, then
# compose config), so the worst case is roughly this value times the number
# of calls.
_COMPOSE_DISCOVERY_CMD_TIMEOUT = 3

# Lifetime, in seconds, of a cached discovery result. The dashboard is
# described as polling every 3-5s; a 2.5s TTL lets pollers that arrive within
# that window reuse one docker probe without the status feeling stale.
_COMPOSE_DISCOVERY_TTL_SECONDS = 2.5

# Discovery cache, keyed by the resolved project directory (as a string):
#     {project_dir_str: (expiry_on_monotonic_clock, result_dict_or_None)}
# It lives at module level so it persists across requests. All access goes
# through the lock below because the synchronous discovery helper runs on
# worker threads (asyncio.to_thread) that may overlap.
_compose_discovery_cache: dict[str, tuple[float, Optional[dict]]] = {}
_compose_discovery_lock = threading.Lock()
7708
+
7709
+
7710
def _parse_docker_json(raw):
    """Turn `docker ... --format json` output into a list of dicts.

    Two output shapes are accepted:

    * a single JSON document: a list (non-dict elements are dropped) or one
      object (wrapped in a one-element list);
    * newline-delimited JSON: every non-empty line is parsed on its own and
      kept only if it decodes to an object; lines that fail to parse are
      skipped.

    The whole-document parse is attempted first; the per-line parse is the
    fallback when that fails or yields neither a list nor a dict.

    Args:
        raw: Captured stdout text. ``None`` and empty/whitespace-only input
            are treated as "no rows".

    Returns:
        A list of dicts, possibly empty.
    """
    text = (raw or "").strip()
    if not text:
        return []

    # Attempt 1: the payload is one JSON document.
    try:
        document = json.loads(text)
    except (ValueError, TypeError):
        document = None
    if isinstance(document, list):
        return [entry for entry in document if isinstance(entry, dict)]
    if isinstance(document, dict):
        return [document]

    # Attempt 2: one JSON object per line; anything else is ignored.
    rows = []
    for candidate in (piece.strip() for piece in text.splitlines()):
        if not candidate:
            continue
        try:
            decoded = json.loads(candidate)
        except (ValueError, TypeError):
            continue
        if isinstance(decoded, dict):
            rows.append(decoded)
    return rows
7742
+
7743
+
7744
def _run_docker_json(args, cwd=None):
    """Run ``docker <args>`` and return its parsed JSON rows, or None on failure.

    The command is executed as an argv list (no shell) with a per-call timeout
    of ``_COMPOSE_DISCOVERY_CMD_TIMEOUT`` seconds, and its stdout is handed to
    ``_parse_docker_json``.

    Args:
        args: Argument list placed AFTER ``docker``
            (e.g. ``["compose", "ps", "--format", "json"]``).
        cwd: Optional working directory for the command; converted with
            ``str()`` when truthy, otherwise the current directory is used.

    Returns:
        A list of dicts (possibly empty) on success. ``None`` when the command
        could not be run or did not succeed: missing docker binary or other
        OS error, timeout, invalid arguments, undecodable output, or a
        non-zero exit status. Callers treat ``None`` as "fail open".
    """
    try:
        proc = subprocess.run(
            ["docker", *args],
            capture_output=True,
            text=True,
            timeout=_COMPOSE_DISCOVERY_CMD_TIMEOUT,
            cwd=str(cwd) if cwd else None,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # ValueError covers invalid argv (e.g. an embedded NUL byte) and
        # UnicodeDecodeError raised while decoding output under text=True;
        # both must fail open like the other error paths.
        return None
    if proc.returncode != 0:
        return None
    return _parse_docker_json(proc.stdout)
7765
+
7766
+
7767
def _compose_published_ports(container):
    """Extract the host ports published by one `docker compose ps` row.

    The row's "Publishers" value is expected to be a list of dicts such as
    {"PublishedPort": 3000, "TargetPort": 3000, "Protocol": "tcp",
    "URL": "0.0.0.0"}. Only "PublishedPort" is read. Entries that are not
    dicts, whose port cannot be converted with ``int()``, or whose port is
    not positive (0 is described upstream as "exposed but not published to
    the host") are skipped.

    Args:
        container: One parsed `docker compose ps --format json` row (a dict).

    Returns:
        Host ports as strings, in the order they appear. Duplicates are not
        removed. An empty list when "Publishers" is missing or not a list.
    """
    publishers = container.get("Publishers")
    if not isinstance(publishers, list):
        return []

    host_ports = []
    for entry in publishers:
        if not isinstance(entry, dict):
            continue
        try:
            number = int(entry.get("PublishedPort"))
        except (TypeError, ValueError):
            continue
        if number <= 0:
            continue
        host_ports.append(str(number))
    return host_ports
7791
+
7792
+
7793
def _compose_service_labels(svc):
    """Return a compose-config service's labels as a dict.

    Labels may be declared either as a mapping or as a list of "key=value"
    strings. A mapping is returned as-is (the same object). A list is
    converted: items that are not strings or contain no "=" are ignored, the
    split happens on the first "=" only, and a later duplicate key overrides
    an earlier one. A missing/empty value or any other type yields ``{}``.

    Args:
        svc: One service's configuration dict from `docker compose config`.

    Returns:
        A dict of label name to label value.
    """
    declared = svc.get("labels") or {}
    if isinstance(declared, dict):
        return declared
    if not isinstance(declared, list):
        return {}

    pairs = (
        entry.split("=", 1)
        for entry in declared
        if isinstance(entry, str) and "=" in entry
    )
    return {key: value for key, value in pairs}
7806
+
7807
+
7808
def _identify_compose_web_service(config_services, running_by_service):
    """Choose the primary web service and the host port to surface for it.

    Candidates are tried in this order; the first match wins:
      (1) a service whose declared labels contain loki.primary=true
          (compared case-insensitively) and that has published ports
      (2) a service named "web", then one named "app"
      (3) any service publishing one of _COMPOSE_COMMON_WEB_PORTS, tried in
          that list's order; the matched common port is returned
      (4) the alphabetically first service with any published port
    For (1), (2) and (4) the service's first published port is returned.
    The ordering is described upstream as mirroring autonomy/app-runner.sh.

    Args:
        config_services: {service_name: service_config_dict} taken from
            `docker compose config`; may be empty or None. Non-dict values
            are ignored.
        running_by_service: {service_name: [published_port_str, ...]} built
            from `docker compose ps` for containers with published ports.

    Returns:
        ``(service_name, port_str)``, or ``(None, None)`` when
        ``running_by_service`` is empty or no candidate has a port.
    """
    nothing = (None, None)
    if not running_by_service:
        return nothing

    # (1) Explicit opt-in through the loki.primary label in compose config.
    for svc_name, svc_cfg in (config_services or {}).items():
        if not isinstance(svc_cfg, dict):
            continue
        flag = str(_compose_service_labels(svc_cfg).get("loki.primary", ""))
        if flag.lower() != "true":
            continue
        published = running_by_service.get(svc_name)
        if published:
            return (svc_name, published[0])

    # (2) Conventional service names.
    for conventional in ("web", "app"):
        published = running_by_service.get(conventional)
        if published:
            return (conventional, published[0])

    # (3) A well-known web port; the outer loop fixes port precedence.
    for common_port in _COMPOSE_COMMON_WEB_PORTS:
        for svc_name, published in running_by_service.items():
            if common_port in published:
                return (svc_name, common_port)

    # (4) Anything with a published port; sorted so the pick is deterministic.
    for svc_name in sorted(running_by_service):
        published = running_by_service[svc_name]
        if published:
            return (svc_name, published[0])

    return nothing
7857
+
7858
+
7859
def _container_health_state(container):
    """Classify a `docker compose ps` row as 'running', 'starting', or None.

    Reads the row's "State" and "Health" fields (case-insensitively, with
    surrounding whitespace ignored):

    - State running, and Health healthy / empty / missing / null / "none"
      (i.e. no healthcheck is reported)            -> 'running'
    - State running with any other Health value
      (e.g. unhealthy, starting)                   -> 'starting'; the URL is
      still surfaced because the app may be reachable even though its
      healthcheck is not passing yet
    - State created or restarting                  -> 'starting'
    - anything else (exited, dead, paused, removing, unknown, missing)
                                                   -> None, so no URL is
      fabricated for a container that is not live

    A null field is normalised to "" before comparison; converting it with
    ``str()`` directly would produce the text "none" and make a container
    without a healthcheck look like one whose healthcheck is failing.

    Args:
        container: One parsed `docker compose ps --format json` row (a dict).

    Returns:
        'running', 'starting', or None.
    """
    state = str(container.get("State") or "").strip().lower()
    health = str(container.get("Health") or "").strip().lower()

    if state == "running":
        # "" and "none" both mean that no healthcheck result is available.
        if health in ("", "none", "healthy"):
            return "running"
        return "starting"
    if state in ("created", "restarting"):
        return "starting"
    # exited/dead/paused/removing and any unrecognised state: nothing live.
    return None
7884
+
7885
+
7886
def _discover_compose_app_runner_state():
    """Cached front door for docker-compose app discovery.

    Resolves the project directory (the parent of the .loki directory) and
    returns the cached discovery result for it while that entry is still
    fresh. Otherwise it runs _discover_compose_app_runner_state_uncached and
    stores the outcome, including a None outcome, for
    _COMPOSE_DISCOVERY_TTL_SECONDS.

    The lock protects only reads and writes of the cache dict; the probe
    itself runs outside the lock, so callers that miss at the same moment
    each run their own probe. Expiry is tracked on the monotonic clock.

    Synchronous; the caller is expected to run it on a worker thread.

    Returns:
        The synthesized app-runner state dict, or None when nothing was
        discovered or the project directory could not be resolved.
    """
    try:
        project_root = _get_loki_dir().parent.resolve()
    except Exception:
        return None
    key = str(project_root)

    checked_at = time.monotonic()
    with _compose_discovery_lock:
        entry = _compose_discovery_cache.get(key)
        if entry is not None:
            expires_at, remembered = entry
            if expires_at > checked_at:
                return remembered

    fresh = _discover_compose_app_runner_state_uncached(project_root)

    with _compose_discovery_lock:
        expires_at = time.monotonic() + _COMPOSE_DISCOVERY_TTL_SECONDS
        _compose_discovery_cache[key] = (expires_at, fresh)
    return fresh
7916
+
7917
+
7918
def _discover_compose_app_runner_state_uncached(project_dir):
    """Probe docker compose for a live web app in ``project_dir`` (no caching).

    Steps:
      A. Require a compose file (docker-compose.yml/.yaml or
         compose.yml/.yaml) directly in ``project_dir``.
      C. Run `docker compose ps --format json` from ``project_dir`` and map
         each service to the host ports its containers publish. Running the
         command from the project directory is what selects the compose
         project.
      D. Best effort: run `docker compose config --format json` to obtain the
         declared services (names/labels) used for primary selection.
      E. Classify the primary service's container via
         _container_health_state.
      F. Build a state dict with the same field names the UI and
         app-runner.sh state.json use, plus discovery-provenance fields.

    Args:
        project_dir: ``pathlib.Path`` of the project root.

    Returns:
        The synthesized state dict, or None when there is no compose file,
        docker is unavailable or fails, no container publishes a host port,
        no primary service can be picked, the primary is not live, or any
        unexpected error occurs. Never raises.
    """
    try:
        # Step A: without a compose file this is a single-process app and
        # discovery does not apply.
        compose_names = (
            "docker-compose.yml", "docker-compose.yaml",
            "compose.yml", "compose.yaml",
        )
        if not any((project_dir / n).is_file() for n in compose_names):
            return None

        # Step C: containers of THIS project's stack with their runtime
        # published ports.
        ps_rows = _run_docker_json(
            ["compose", "ps", "--format", "json"], cwd=project_dir
        )
        if ps_rows is None:
            # docker absent / timeout / error -> fail open.
            return None
        if not ps_rows:
            # No containers for this compose project (not up).
            return None

        # service -> de-duplicated host ports (order preserved), and
        # service -> first container row seen that publishes a port; that
        # row later supplies State/Health/Project for the primary.
        running_by_service = {}
        container_by_service = {}
        for c in ps_rows:
            service = c.get("Service") or c.get("Name")
            if not service:
                continue
            ports = _compose_published_ports(c)
            if ports:
                known = running_by_service.setdefault(service, [])
                for p in ports:
                    if p not in known:
                        known.append(p)
                container_by_service.setdefault(service, c)
        if not running_by_service:
            # Stack is up but nothing publishes a host port: no URL to show.
            return None

        # Step D: declared service config. If it is unavailable, selection
        # proceeds with the ps data alone.
        config_rows = _run_docker_json(
            ["compose", "config", "--format", "json"], cwd=project_dir
        )
        config_services = {}
        if config_rows:
            svcs = config_rows[0].get("services")
            if isinstance(svcs, dict):
                config_services = svcs

        primary_service, port = _identify_compose_web_service(
            config_services, running_by_service
        )
        if not primary_service or not port:
            return None

        # Step E: health classification from the primary's container row.
        primary_container = container_by_service.get(primary_service)
        if not isinstance(primary_container, dict):
            return None
        health_status = _container_health_state(primary_container)
        if health_status is None:
            # exited/dead/paused/unknown -> do not fabricate a URL.
            return None

        # Step B (best effort): compose project name for the panel. When ps
        # does not report it, derive it from the directory name the way
        # Compose normalises project names: lowercase, keep only
        # [a-z0-9_-], and drop leading '-' / '_'.
        compose_project = primary_container.get("Project")
        if not compose_project:
            allowed = "abcdefghijklmnopqrstuvwxyz0123456789_-"
            compose_project = "".join(
                ch for ch in project_dir.name.lower() if ch in allowed
            ).lstrip("_-")

        # A null Health must read as "no healthcheck", not as the text
        # "none" that str(None) would produce; "none" itself is also a
        # no-healthcheck status.
        health_text = str(primary_container.get("Health") or "").strip().lower()
        health_ok = health_text in ("", "none", "healthy")

        # Step F: synthesized state.
        return {
            "status": health_status,
            "url": "http://localhost:{}".format(port),
            "port": int(port),
            "method": "docker compose (detected)",
            "primary_service": primary_service,
            "compose_project": compose_project,
            "source": "discovered",
            "externally_managed": True,
            "last_health": {"ok": health_ok},
        }
    except Exception:
        # Fail open on anything unexpected; never break the status endpoint.
        return None
8016
+
8017
+
7602
8018
@app.get("/api/app-runner/status")
async def get_app_runner_status():
    """Report the app runner's status, falling back to compose discovery.

    Resolution order:
      1. state.json exists, parses, and reconciles (via
         _reconcile_app_runner_liveness) to "running" or "starting": that
         state is returned as authoritative.
      2. state.json is missing, or reconciles to any other status:
         docker-compose discovery is attempted for stacks started outside
         app-runner.sh. A discovered state is returned as-is, without
         pid-based liveness reconciliation.
      3. Nothing discovered: the reconciled state is returned, or
         {"status": "not_initialized"} when state.json does not exist.
    An unreadable or malformed state.json yields {"status": "error"} without
    attempting discovery.

    Discovery is run through asyncio.to_thread so its docker calls do not
    block the event loop.
    """
    state_file = _get_loki_dir() / "app-runner" / "state.json"

    if state_file.exists():
        try:
            state = json.loads(state_file.read_text())
        except (json.JSONDecodeError, OSError):
            return {"status": "error"}

        fallback = _reconcile_app_runner_liveness(state)
        still_live = isinstance(fallback, dict) and fallback.get("status") in (
            "running",
            "starting",
        )
        if still_live:
            # A live app-runner.sh-managed run wins over discovery.
            return fallback
    else:
        fallback = {"status": "not_initialized"}

    # No live managed run: the agent may have brought up a compose stack
    # outside app-runner.sh, so prefer a live discovered stack if there is one.
    discovered = await asyncio.to_thread(_discover_compose_app_runner_state)
    return fallback if discovered is None else discovered
7614
8060
 
7615
8061
 
7616
8062
  def _get_log_redactor():
@@ -7655,8 +8101,12 @@ async def get_app_runner_logs(lines: int = Query(default=100, ge=1, le=1000)):
7655
8101
  return {"lines": []}
7656
8102
  try:
7657
8103
  redact = _get_log_redactor()
7658
- all_lines = _safe_read_text(log_file).splitlines()
7659
- return {"lines": [redact(ln) for ln in all_lines[-lines:]], "redacted": True}
8104
+ # Reading + redacting the app log is blocking (the log can be large);
8105
+ # offload so the event loop (status + WS heartbeat) is not stalled.
8106
+ def _read_redacted(p=log_file, n=lines):
8107
+ return [redact(ln) for ln in _safe_read_text(p).splitlines()[-n:]]
8108
+ out_lines = await asyncio.to_thread(_read_redacted)
8109
+ return {"lines": out_lines, "redacted": True}
7660
8110
  except OSError:
7661
8111
  return {"lines": []}
7662
8112
 
@@ -7691,8 +8141,10 @@ async def get_app_runner_errors(lines: int = Query(default=50, ge=1, le=500)):
7691
8141
  if log_file.exists():
7692
8142
  try:
7693
8143
  redact = _get_log_redactor()
7694
- all_lines = _safe_read_text(log_file).splitlines()
7695
- out_lines = [redact(ln) for ln in all_lines[-lines:]]
8144
+ # Offload the blocking log read + redaction off the event loop.
8145
+ def _read_redacted(p=log_file, n=lines):
8146
+ return [redact(ln) for ln in _safe_read_text(p).splitlines()[-n:]]
8147
+ out_lines = await asyncio.to_thread(_read_redacted)
7696
8148
  except OSError:
7697
8149
  out_lines = []
7698
8150
 
@@ -8410,7 +8862,11 @@ async def get_managed_events(
8410
8862
  """
8411
8863
  try:
8412
8864
  path = _managed_events_path()
8413
- records = _tail_ndjson(path, limit=limit, since_iso=since, event_type=type)
8865
+ # Tails an ndjson file (rotated at 10MB) via a blocking readlines();
8866
+ # offload so the event loop stays responsive.
8867
+ records = await asyncio.to_thread(
8868
+ _tail_ndjson, path, limit, since, type
8869
+ )
8414
8870
  return {
8415
8871
  "events": records,
8416
8872
  "count": len(records),
@@ -8433,11 +8889,13 @@ async def get_managed_status():
8433
8889
  snapshot = _managed_flags_snapshot()
8434
8890
  # last_fallback_ts is best-effort from the local events file.
8435
8891
  try:
8436
- events = _tail_ndjson(
8892
+ # Blocking ndjson tail read; offload off the event loop.
8893
+ events = await asyncio.to_thread(
8894
+ _tail_ndjson,
8437
8895
  _managed_events_path(),
8438
- limit=500,
8439
- since_iso=None,
8440
- event_type="managed_agents_fallback",
8896
+ 500,
8897
+ None,
8898
+ "managed_agents_fallback",
8441
8899
  )
8442
8900
  snapshot["last_fallback_ts"] = _last_fallback_ts(events)
8443
8901
  except Exception: