arkaos 3.77.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91):
  1. package/VERSION +1 -1
  2. package/config/agent-allowlists/laravel.yaml +1 -0
  3. package/config/agent-allowlists/node.yaml +1 -0
  4. package/config/agent-allowlists/nuxt.yaml +1 -0
  5. package/config/agent-allowlists/python.yaml +1 -0
  6. package/core/agents/__pycache__/registry_gen.cpython-313.pyc +0 -0
  7. package/core/agents/__pycache__/schema.cpython-313.pyc +0 -0
  8. package/core/agents/registry_gen.py +6 -1
  9. package/core/agents/schema.py +4 -0
  10. package/core/cognition/__pycache__/reorganizer.cpython-313.pyc +0 -0
  11. package/core/cognition/reorganizer.py +37 -7
  12. package/core/governance/__pycache__/design_system_lint.cpython-313.pyc +0 -0
  13. package/core/governance/__pycache__/design_system_lint_cli.cpython-313.pyc +0 -0
  14. package/core/knowledge/__pycache__/agent_match.cpython-313.pyc +0 -0
  15. package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
  16. package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
  17. package/core/knowledge/__pycache__/sources.cpython-313.pyc +0 -0
  18. package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
  19. package/core/knowledge/agent_match.py +114 -0
  20. package/core/knowledge/chunker.py +45 -0
  21. package/core/knowledge/ingest.py +156 -78
  22. package/core/knowledge/sources.py +138 -0
  23. package/core/knowledge/vector_store.py +52 -0
  24. package/core/squads/__pycache__/loader.cpython-313.pyc +0 -0
  25. package/core/squads/loader.py +25 -0
  26. package/core/sync/__pycache__/agent_provisioner.cpython-313.pyc +0 -0
  27. package/core/sync/agent_provisioner.py +19 -8
  28. package/dashboard/app/components/KnowledgeSourcesList.vue +40 -13
  29. package/dashboard/app/pages/cognition.vue +9 -4
  30. package/dashboard/app/pages/knowledge/[id].vue +669 -0
  31. package/dashboard/app/pages/knowledge/index.vue +1281 -0
  32. package/dashboard/app/types/index.d.ts +1 -1
  33. package/departments/brand/agents/brand-director.yaml +2 -0
  34. package/departments/brand/agents/creative-director.md +4 -0
  35. package/departments/brand/agents/motion-designer.md +5 -1
  36. package/departments/brand/agents/ux-designer.yaml +26 -1
  37. package/departments/brand/agents/ux-researcher.yaml +73 -0
  38. package/departments/brand/agents/ux-strategist.yaml +72 -0
  39. package/departments/brand/agents/visual-designer.md +4 -0
  40. package/departments/brand/agents/visual-designer.yaml +11 -0
  41. package/departments/brand/references/uiux-knowledge-and-tools.md +136 -0
  42. package/departments/dev/agents/ai-engineering/ai-engineering-lead.yaml +76 -0
  43. package/departments/dev/agents/architect.yaml +9 -3
  44. package/departments/dev/agents/backend-core/laravel-eng.yaml +76 -0
  45. package/departments/dev/agents/backend-core/node-ts-eng.yaml +76 -0
  46. package/departments/dev/agents/backend-core/python-eng.yaml +76 -0
  47. package/departments/dev/agents/backend-dev.yaml +10 -4
  48. package/departments/dev/agents/data-platform/etl-eng.yaml +74 -0
  49. package/departments/dev/agents/dba.yaml +7 -3
  50. package/departments/dev/agents/frontend-dev.md +41 -11
  51. package/departments/dev/agents/frontend-dev.yaml +6 -0
  52. package/departments/dev/references/backend-knowledge-and-tools.md +70 -0
  53. package/departments/ecom/agents/retention-manager.yaml +13 -1
  54. package/departments/leadership/agents/culture-coach.yaml +20 -0
  55. package/departments/leadership/agents/hr-specialist.yaml +18 -0
  56. package/departments/leadership/agents/leadership-director.yaml +10 -0
  57. package/departments/org/agents/chief-of-staff.yaml +76 -0
  58. package/departments/org/agents/coo.yaml +11 -0
  59. package/departments/org/agents/okr-steward.yaml +71 -0
  60. package/departments/org/agents/org-designer.yaml +23 -0
  61. package/departments/org/skills/okr-cadence/SKILL.md +34 -0
  62. package/departments/org/skills/principles-audit/SKILL.md +36 -0
  63. package/departments/pm/agents/pm-director.yaml +21 -8
  64. package/departments/pm/agents/product-owner.yaml +24 -2
  65. package/departments/pm/agents/scrum-master.yaml +21 -0
  66. package/departments/pm/agents/strategic-pm.yaml +72 -0
  67. package/departments/pm/skills/discovery-plan/SKILL.md +7 -1
  68. package/departments/quality/agents/cqo.yaml +8 -0
  69. package/departments/saas/agents/cs-manager.yaml +19 -2
  70. package/departments/saas/agents/growth-engineer.yaml +14 -1
  71. package/departments/saas/agents/metrics-analyst.yaml +17 -1
  72. package/departments/saas/agents/revops-lead.yaml +73 -0
  73. package/departments/saas/skills/leaky-bucket/SKILL.md +28 -0
  74. package/departments/saas/skills/voc-loop/SKILL.md +29 -0
  75. package/departments/sales/agents/sales-director.yaml +9 -0
  76. package/departments/sales/agents/sdr.yaml +72 -0
  77. package/departments/strategy/agents/decision-quality.yaml +72 -0
  78. package/departments/strategy/agents/strategy-director.yaml +13 -0
  79. package/departments/strategy/skills/premortem/SKILL.md +33 -0
  80. package/installer/claude-plugins.js +32 -3
  81. package/installer/doctor.js +15 -0
  82. package/installer/frontend-tooling.js +150 -0
  83. package/installer/index.js +28 -0
  84. package/installer/keys.js +1 -0
  85. package/installer/update.js +35 -0
  86. package/knowledge/agents-registry-v2.json +1218 -78
  87. package/package.json +1 -1
  88. package/pyproject.toml +1 -1
  89. package/scripts/__pycache__/dashboard-api.cpython-313.pyc +0 -0
  90. package/scripts/dashboard-api.py +376 -13
  91. package/dashboard/app/pages/knowledge.vue +0 -918
@@ -6,12 +6,14 @@ the vector store. Reports progress via callback for real-time UI updates.
6
6
 
7
7
  import os
8
8
  import re
9
+ import subprocess
9
10
  import tempfile
10
11
  from dataclasses import dataclass, field
11
12
  from pathlib import Path
12
13
  from typing import Callable, Optional
13
14
 
14
15
  from core.knowledge.chunker import chunk_markdown
16
+ from core.knowledge.sources import source_id
15
17
  from core.knowledge.vector_store import VectorStore
16
18
 
17
19
 
@@ -25,6 +27,11 @@ class IngestResult:
25
27
  title: str = ""
26
28
  error: str = ""
27
29
  success: bool = True
30
+ duration: int = 0
31
+ language: str = ""
32
+ media_path: str = ""
33
+ thumbnail_path: str = ""
34
+ transcript: str = ""
28
35
 
29
36
 
30
37
  ProgressCallback = Callable[[int, str], None] # (percent, message)
@@ -38,15 +45,20 @@ def detect_source_type(source: str) -> str:
38
45
  if any(domain in source_lower for domain in ["youtube.com", "youtu.be"]):
39
46
  return "youtube"
40
47
 
41
- # Web URLs
48
+ # Video: a URL or file path ending in a video container extension.
49
+ # Checked *before* the generic web fallback so a non-youtube CDN clip
50
+ # (https://.../clip.mp4) resolves to "video", per PR1 spec Task 2.3.
51
+ ext = Path(source.split("?", 1)[0]).suffix.lower()
52
+ if ext in IngestEngine.VIDEO_EXTS:
53
+ return "video"
54
+
55
+ # Web URLs (no recognised media extension)
42
56
  if source_lower.startswith(("http://", "https://")):
43
57
  return "web"
44
58
 
45
- # File extensions
46
- ext = Path(source).suffix.lower()
47
59
  if ext == ".pdf":
48
60
  return "pdf"
49
- if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"):
61
+ if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac"):
50
62
  return "audio"
51
63
  if ext in (".md", ".txt", ".rst"):
52
64
  return "markdown"
@@ -57,11 +69,19 @@ def detect_source_type(source: str) -> str:
57
69
  class IngestEngine:
58
70
  """Processes content from various sources into the vector store."""
59
71
 
60
- def __init__(self, store: VectorStore, media_dir: str | Path = "") -> None:
72
+ VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi")
73
+
74
def __init__(self, store: VectorStore, media_dir: str | Path = "", registry=None) -> None:
    """Bind the vector store and optional registry; ensure the media dir exists.

    Args:
        store: Vector store that receives the indexed chunks.
        media_dir: Directory for downloaded/derived media. An empty value
            falls back to ``~/.arkaos/media``.
        registry: Optional source registry; ``None`` disables registration.
    """
    self._store = store
    self._registry = registry
    if media_dir:
        self._media_dir = Path(media_dir)
    else:
        self._media_dir = Path.home() / ".arkaos" / "media"
    # Create the directory eagerly so processors can write into it directly.
    self._media_dir.mkdir(parents=True, exist_ok=True)
64
79
 
80
@staticmethod
def detect_source_type(source: str) -> str:
    """Class-level alias that forwards to the module-level ``detect_source_type``."""
    # Inside a method body the bare name resolves to the module-level
    # function, not to this static method, so this is not recursive.
    detected = detect_source_type(source)
    return detected
84
+
65
85
  def ingest(
66
86
  self,
67
87
  source: str,
@@ -87,6 +107,7 @@ class IngestEngine:
87
107
  "youtube": self._process_youtube,
88
108
  "pdf": self._process_pdf,
89
109
  "audio": self._process_audio,
110
+ "video": self._process_video,
90
111
  "web": self._process_web,
91
112
  "markdown": self._process_markdown,
92
113
  }
@@ -96,11 +117,13 @@ class IngestEngine:
96
117
  return IngestResult(source=source, source_type=source_type, error=f"Unsupported type: {source_type}", success=False)
97
118
 
98
119
  try:
99
- text, title = processor(source, progress)
120
+ text, title, extra = self._invoke_processor(processor, source, progress)
100
121
  except Exception as e:
122
+ self._register_failure(source, source_type, str(e))
101
123
  return IngestResult(source=source, source_type=source_type, error=str(e), success=False)
102
124
 
103
125
  if not text or len(text.strip()) < 50:
126
+ self._register_failure(source, source_type, "Extracted text too short")
104
127
  return IngestResult(source=source, source_type=source_type, error="Extracted text too short", success=False)
105
128
 
106
129
  # Chunk and index
@@ -110,7 +133,9 @@ class IngestEngine:
110
133
 
111
134
  if total_chunks == 0:
112
135
  progress(100, "No chunks to index")
113
- return IngestResult(source=source, source_type=source_type, text_length=len(text), chunks_created=0, title=title, success=True)
136
+ empty = self._make_result(source, source_type, text, title, 0, extra)
137
+ self._register_success(empty)
138
+ return empty
114
139
 
115
140
  # Index in batches with granular progress (85→99%)
116
141
  texts = [c.text for c in chunks]
@@ -149,100 +174,153 @@ class IngestEngine:
149
174
  except Exception:
150
175
  pass
151
176
 
177
+ result = self._make_result(source, source_type, text, title, count, extra)
178
+ self._register_success(result)
179
+ return result
180
+
181
@staticmethod
def _invoke_processor(
    processor: Callable, source: str, progress: ProgressCallback
) -> tuple[str, str, dict]:
    """Run ``processor`` and coerce its return into ``(text, title, extra)``.

    Processors return either ``(text, title)`` or ``(text, title, extra)``.
    A missing or falsy ``extra`` becomes an empty dict.
    """
    produced = processor(source, progress)
    extra = {}
    if len(produced) == 3:
        extra = produced[2] or {}
    return produced[0], produced[1], extra
190
+
191
@staticmethod
def _make_result(
    source: str, source_type: str, text: str, title: str,
    count: int, extra: dict,
) -> IngestResult:
    """Assemble a successful IngestResult including media metadata.

    ``text`` is stored as the transcript and its length as ``text_length``.
    Media fields come from ``extra``; a key that is missing *or* explicitly
    ``None`` falls back to ``0`` / ``""``, and path values are coerced to
    ``str`` so a processor may hand back ``Path`` objects.
    """
    return IngestResult(
        source=source, source_type=source_type, text_length=len(text),
        chunks_created=count, title=title, success=True, transcript=text,
        # `or` (not a .get default) so an explicit None cannot reach int().
        duration=int(extra.get("duration") or 0),
        language=extra.get("language") or "",
        media_path=str(extra.get("media_path") or ""),
        thumbnail_path=str(extra.get("thumbnail_path") or ""),
    )
160
205
 
161
- def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
162
- """Download YouTube video and transcribe audio.
206
def _register_success(self, result: IngestResult) -> None:
    """Record a successful ingest in the source registry, when one is attached.

    Does nothing if no registry was supplied. Otherwise upserts the row
    keyed by ``result.source`` with status ``"ready"``.
    """
    registry = self._registry
    if registry is None:
        return
    fields = {
        "type": result.source_type,
        "title": result.title,
        "duration": result.duration,
        "language": result.language,
        "thumbnail_path": result.thumbnail_path,
        "media_path": result.media_path,
        "transcript": result.transcript,
        "chunk_count": result.chunks_created,
        "status": "ready",
    }
    registry.upsert(result.source, **fields)
217
+
218
def _register_failure(self, source: str, stype: str, error: str) -> None:
    """Record a failed ingest in the source registry, when one is attached.

    Does nothing if no registry was supplied. Otherwise upserts the row for
    ``source`` with status ``"failed"`` and the given error message.
    """
    registry = self._registry
    if registry is not None:
        registry.upsert(source, type=stype, status="failed", error=error)
163
223
 
164
- 5 distinct phases with clear progress:
165
- Phase 1: Fetch video info (0-5%)
166
- Phase 2: Download video (5-25%)
167
- Phase 3: Extract audio (25-35%)
168
- Phase 4: Transcribe audio (35-65%)
169
- Phase 5: Return text for chunking/indexing (handled by caller, 75-100%)
224
def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str, dict]:
    """Fetch, download and transcribe a YouTube video.

    Steps, in order: fetch metadata, download the video file, extract a WAV
    track from it, transcribe that track.

    Returns:
        ``(text, title, extra)`` where ``extra`` is built by
        ``_youtube_extra`` from the metadata and the downloaded video path.

    Raises:
        RuntimeError: yt-dlp is not installed, or the transcript is empty or
            shorter than 20 characters after stripping whitespace.
    """
    # Probe the dependency up front so the user gets an actionable message
    # before any network work starts.
    try:
        import yt_dlp  # noqa: F401
    except ImportError:
        raise RuntimeError("yt-dlp not installed. Run: pip install yt-dlp")

    progress(2, "Phase 1/4 — Fetching video info...")
    metadata = self._youtube_info(url, progress)
    video_title = metadata.get("title", "YouTube Video")

    progress(8, "Phase 2/4 — Downloading video...")
    saved_video = self._download_video(url, progress)

    progress(40, "Phase 3/4 — Extracting audio from video...")
    wav_track = self._extract_audio(saved_video)

    progress(50, "Phase 4/4 — Transcribing audio (this may take a while)...")
    transcript = self._transcribe_audio(str(wav_track))
    has_usable_text = bool(transcript) and len(transcript.strip()) >= 20
    if not has_usable_text:
        raise RuntimeError("Transcription produced no usable text")
    word_total = len(transcript.split())
    progress(70, f"Phase 4/4 — Transcribed: {word_total} words")

    return transcript, video_title, self._youtube_extra(metadata, saved_video)
254
+
255
@staticmethod
def _youtube_extra(info: dict, video_path: Path) -> dict:
    """Derive the extra-metadata dict from yt-dlp ``info`` and the saved video.

    Missing or falsy ``duration`` / ``language`` / ``thumbnail`` entries
    collapse to ``0`` / ``""``; ``media_path`` is ``str(video_path)``.
    """
    seconds = info.get("duration") or 0
    extra = {"duration": int(seconds)}
    extra["language"] = info.get("language") or ""
    extra["thumbnail_path"] = info.get("thumbnail") or ""
    extra["media_path"] = str(video_path)
    return extra
264
+
265
def _youtube_info(self, url: str, progress: ProgressCallback) -> dict:
    """Return yt-dlp's metadata dict for ``url`` without downloading media.

    Reports the found title at 5% progress.

    Raises:
        RuntimeError: any failure in the lookup or in the progress report,
            with the underlying message truncated to 200 characters.
    """
    import yt_dlp

    options = {"quiet": True, "no_warnings": True}
    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            metadata = downloader.extract_info(url, download=False)
            progress(5, f"Phase 1/4 — Found: {metadata.get('title')}")
            return metadata
    except Exception as exc:
        raise RuntimeError(f"YouTube access failed: {str(exc)[:200]}")
186
275
 
187
- # === Phase 2: Download video + extract audio ===
188
- progress(8, f"Phase 2/4 Downloading video...")
189
- audio_path = str(self._media_dir / "yt_audio.wav")
276
def _download_video(self, url: str, progress: ProgressCallback) -> Path:
    """Download the best video+audio for ``url`` into the media directory.

    The output file is named after the stable source id of ``url``. Separate
    video and audio streams are merged into an mp4 container.

    Returns:
        Path of the downloaded video. Normally ``<id>.mp4``; when no merge
        took place (the ``/best`` fallback picked a single pre-muxed file)
        yt-dlp keeps the native container, so the existing ``<id>.<ext>``
        with a known video extension is returned instead. If nothing is
        found, ``<id>.mp4`` is returned as before.
    """
    import yt_dlp

    stable_id = source_id(url)
    out = self._media_dir / stable_id
    ydl_opts = {
        "format": "bestvideo*+bestaudio/best",
        # Only applies when streams are actually merged.
        "merge_output_format": "mp4",
        "outtmpl": str(out) + ".%(ext)s",
        "quiet": True,
        "no_warnings": True,
        "progress_hooks": [self._dl_hook(progress)],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.extract_info(url, download=True)

    merged = self._media_dir / f"{stable_id}.mp4"
    if merged.exists():
        return merged
    # No merge happened: look for the file under its native container
    # extension rather than returning a path that does not exist.
    for candidate in sorted(self._media_dir.glob(f"{stable_id}.*")):
        if candidate.suffix.lower() in self.VIDEO_EXTS:
            return candidate
    return merged
292
+
293
@staticmethod
def _dl_hook(progress: ProgressCallback) -> Callable:
    """Build a yt-dlp progress hook mapping download % to 8-38%.

    Only ``"downloading"`` status events are reported. yt-dlp may supply
    ``total_bytes`` / ``downloaded_bytes`` as ``None`` or omit them, so the
    total falls back to ``total_bytes_estimate`` and an unknown total
    reports the start of the range (8%) instead of raising or overshooting.
    """
    def hook(d: dict) -> None:
        if d.get("status") != "downloading":
            return
        done = d.get("downloaded_bytes") or 0
        total = d.get("total_bytes") or d.get("total_bytes_estimate") or 0
        # Clamp: estimates can be smaller than the bytes already received.
        ratio = min(done / total, 1.0) if total > 0 else 0.0
        percent_text = (d.get("_percent_str") or "").strip()
        progress(8 + int(ratio * 30),
                 f"Phase 2/4 — Downloading... {percent_text}")
    return hook
303
+
304
def _extract_audio(self, video_path: Path) -> Path:
    """Extract a 16kHz mono WAV track from a video via ffmpeg.

    The WAV is written next to the video, with the same stem and a ``.wav``
    suffix; an existing file is overwritten (``-y``).

    Returns:
        Path of the written WAV file.

    Raises:
        RuntimeError: ffmpeg is not installed, or it exited non-zero. The
            message carries the tail of ffmpeg's stderr so the failure
            reason is visible to whoever reads the error.
    """
    audio_path = video_path.with_suffix(".wav")
    command = [
        "ffmpeg", "-y", "-i", str(video_path), "-vn",
        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        str(audio_path),
    ]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except FileNotFoundError as exc:
        raise RuntimeError(
            "ffmpeg not found. Install ffmpeg to extract audio from video"
        ) from exc
    except subprocess.CalledProcessError as exc:
        # ffmpeg prints its banner first; the actual error is at the end.
        detail = (exc.stderr or b"").decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg audio extraction failed (exit {exc.returncode}): {detail[-300:]}"
        ) from exc
    return audio_path
208
314
 
209
- # === Phase 3: Extract audio (FFmpeg post-processing) ===
210
- progress(28, "Phase 3/4 Extracting audio from video...")
211
-
212
- # Verify audio file exists
213
- if not os.path.exists(audio_path):
214
- # Try to find the downloaded file with different extension
215
- for ext in ["wav", "m4a", "webm", "mp3", "opus"]:
216
- alt = str(self._media_dir / f"yt_audio.{ext}")
217
- if os.path.exists(alt):
218
- audio_path = alt
219
- break
220
- else:
221
- raise RuntimeError("Audio extraction failed — no output file found")
222
-
223
- audio_size_mb = os.path.getsize(audio_path) / (1024 * 1024)
224
- progress(35, f"Phase 3/4 — Audio extracted ({audio_size_mb:.1f} MB)")
225
-
226
- # === Phase 4: Transcribe audio ===
227
- progress(38, "Phase 4/4 — Transcribing audio (this may take a while)...")
228
- text = self._transcribe_audio(audio_path)
229
-
230
- if not text or len(text.strip()) < 20:
231
- raise RuntimeError("Transcription produced no usable text")
232
-
233
- word_count = len(text.split())
234
- progress(70, f"Phase 4/4 — Transcribed: {word_count} words")
235
-
236
- # Rename audio to include title for easy identification
237
- safe_title = "".join(c if c.isalnum() or c in " -_" else "" for c in title)[:50].strip()
238
- final_audio = self._media_dir / f"{safe_title}.wav"
239
- try:
240
- import shutil
241
- shutil.move(audio_path, str(final_audio))
242
- except Exception:
243
- final_audio = Path(audio_path)
244
-
245
- return text, title
315
def _process_video(self, path: str, progress: ProgressCallback) -> tuple[str, str, dict]:
    """Ingest a video given as a local file path or a direct http(s) URL.

    Source detection classifies any source ending in a video extension as
    "video", including remote clips, so a URL is downloaded into the media
    directory first; a local file is used in place.

    Returns:
        ``(text, title, extra)`` with ``extra["media_path"]`` pointing at
        the video file. The title is the file stem with ``-``/``_`` turned
        into spaces.

    Raises:
        FileNotFoundError: the local path does not exist.
        RuntimeError: a remote download produced no video file.
    """
    is_remote = path.lower().startswith(("http://", "https://"))
    if is_remote:
        progress(8, "Downloading video...")
        filepath = self._download_video(path, progress)
        if not filepath.exists():
            # The downloader's returned path assumes an mp4 container; fall
            # back to whatever video file was written under the same stem.
            found = [
                p for p in sorted(filepath.parent.glob(f"{filepath.stem}.*"))
                if p.suffix.lower() in self.VIDEO_EXTS
            ]
            if not found:
                raise RuntimeError("Video download failed — no output file found")
            filepath = found[0]
        # Title comes from the URL's file name, ignoring any query string.
        stem = Path(path.split("?", 1)[0]).stem
    else:
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"Video not found: {path}")
        stem = filepath.stem

    # The download hook reports up to 38%, so do not step backwards after it.
    progress(40 if is_remote else 30, "Transcribing video...")
    text = self._transcribe_audio(str(filepath))
    title = stem.replace("-", " ").replace("_", " ")
    return text, title, {"media_path": str(filepath)}
246
324
 
247
325
  def _process_pdf(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
248
326
  """Extract text from PDF."""
@@ -0,0 +1,138 @@
1
+ """Source registry for the knowledge base.
2
+
3
+ Stores rich per-source metadata (title, duration, media path, transcript,
4
+ thumbnail, status) in a dedicated ``sources`` table living inside the same
5
+ ``knowledge.db`` as the vector store. This module is purely additive: it
6
+ never touches the ``chunks`` table owned by :class:`VectorStore`.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import sqlite3
12
+ import threading
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ _SCHEMA = """
17
+ CREATE TABLE IF NOT EXISTS sources (
18
+ id TEXT PRIMARY KEY,
19
+ source TEXT NOT NULL,
20
+ type TEXT DEFAULT '',
21
+ title TEXT DEFAULT '',
22
+ duration INTEGER DEFAULT 0,
23
+ language TEXT DEFAULT '',
24
+ thumbnail_path TEXT DEFAULT '',
25
+ media_path TEXT DEFAULT '',
26
+ transcript TEXT DEFAULT '',
27
+ chunk_count INTEGER DEFAULT 0,
28
+ status TEXT DEFAULT 'pending',
29
+ error TEXT DEFAULT '',
30
+ created_at REAL DEFAULT (unixepoch('now')),
31
+ updated_at REAL DEFAULT (unixepoch('now'))
32
+ )
33
+ """
34
+
35
+ _COLUMNS = (
36
+ "id", "source", "type", "title", "duration", "language",
37
+ "thumbnail_path", "media_path", "transcript", "chunk_count",
38
+ "status", "error", "created_at", "updated_at",
39
+ )
40
+
41
+
42
+ def source_id(source: str) -> str:
43
+ """Return a stable id for a source string: ``src-`` + sha1[:12]."""
44
+ digest = hashlib.sha1(source.encode("utf-8")).hexdigest()
45
+ return f"src-{digest[:12]}"
46
+
47
+
48
+ class SourceRegistry:
49
+ """SQLite-backed registry of knowledge sources and their metadata."""
50
+
51
+ def __init__(self, db_path: str | Path = "") -> None:
52
+ """Open (or create) the sources table in the knowledge database."""
53
+ self._db_path = str(db_path) if db_path else self._default_path()
54
+ self._lock = threading.Lock()
55
+ self._conn = sqlite3.connect(self._db_path, check_same_thread=False)
56
+ self._conn.execute("PRAGMA journal_mode=WAL")
57
+ self._conn.execute(_SCHEMA)
58
+ self._conn.commit()
59
+
60
+ @staticmethod
61
+ def _default_path() -> str:
62
+ home = Path.home() / ".arkaos"
63
+ home.mkdir(parents=True, exist_ok=True)
64
+ return str(home / "knowledge.db")
65
+
66
+ def upsert(
67
+ self,
68
+ source: str,
69
+ *,
70
+ type: str = "",
71
+ title: str = "",
72
+ duration: int = 0,
73
+ language: str = "",
74
+ thumbnail_path: str = "",
75
+ media_path: str = "",
76
+ transcript: str = "",
77
+ chunk_count: int = 0,
78
+ status: str = "ready",
79
+ error: str = "",
80
+ ) -> str:
81
+ """Insert or replace a source row by id; return its stable id."""
82
+ sid = source_id(source)
83
+ params = (
84
+ sid, source, type, title, duration, language, thumbnail_path,
85
+ media_path, transcript, chunk_count, status, error, sid,
86
+ )
87
+ with self._lock:
88
+ self._conn.execute(self._upsert_sql(), params)
89
+ self._conn.commit()
90
+ return sid
91
+
92
+ @staticmethod
93
+ def _upsert_sql() -> str:
94
+ """SQL that preserves created_at on update via a COALESCE subquery."""
95
+ return (
96
+ "INSERT OR REPLACE INTO sources "
97
+ "(id, source, type, title, duration, language, thumbnail_path, "
98
+ "media_path, transcript, chunk_count, status, error, "
99
+ "created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, "
100
+ "?, ?, COALESCE((SELECT created_at FROM sources WHERE id = ?), "
101
+ "unixepoch('now')), unixepoch('now'))"
102
+ )
103
+
104
+ def get(self, source_id_: str) -> Optional[dict]:
105
+ """Return the full row for a source id as a dict, or None."""
106
+ row = self._conn.execute(
107
+ "SELECT * FROM sources WHERE id = ?", (source_id_,)
108
+ ).fetchone()
109
+ return self._row_to_dict(row) if row else None
110
+
111
+ def get_by_source(self, source: str) -> Optional[dict]:
112
+ """Return the row matching a raw source string, or None."""
113
+ return self.get(source_id(source))
114
+
115
+ def list(self) -> list[dict]:
116
+ """Return all source rows, newest updated first."""
117
+ rows = self._conn.execute(
118
+ "SELECT * FROM sources ORDER BY updated_at DESC"
119
+ ).fetchall()
120
+ return [self._row_to_dict(r) for r in rows]
121
+
122
+ def delete(self, source_id_: str) -> bool:
123
+ """Delete a source row; return True if a row was removed."""
124
+ with self._lock:
125
+ cur = self._conn.execute(
126
+ "DELETE FROM sources WHERE id = ?", (source_id_,)
127
+ )
128
+ self._conn.commit()
129
+ return cur.rowcount > 0
130
+
131
+ @staticmethod
132
+ def _row_to_dict(row: tuple) -> dict:
133
+ """Map a SELECT * tuple to a column-keyed dict."""
134
+ return dict(zip(_COLUMNS, row))
135
+
136
+ def close(self) -> None:
137
+ """Close the database connection."""
138
+ self._conn.close()
@@ -296,6 +296,58 @@ class VectorStore:
296
296
  ).fetchall()
297
297
  return [{"source": r["source"], "chunks": int(r["chunks"])} for r in rows]
298
298
 
299
def distinct_sources(self) -> list[str]:
    """Return the distinct non-empty source strings, most chunks first.

    Read-only helper. Rows whose ``source`` is NULL or ``''`` are excluded;
    the remaining sources are ordered by descending chunk count.
    """
    query = (
        "SELECT source, COUNT(*) AS chunks FROM chunks "
        "WHERE source IS NOT NULL AND source != '' "
        "GROUP BY source ORDER BY chunks DESC"
    )
    cursor = self._db.execute(query)
    return [record["source"] for record in cursor.fetchall()]
313
+
314
def chunks_for_source(self, source: str) -> list[dict]:
    """Return all chunks for a source as text/heading/metadata dicts.

    Rows are ordered by ``id`` ascending so callers that re-join the text
    read the chunks back in that order.

    ``metadata`` is the decoded JSON value of the column. An empty, NULL or
    undecodable cell yields ``{}`` so that one corrupt row cannot make the
    whole source unreadable.
    """
    rows = self._db.execute(
        "SELECT text, heading, metadata FROM chunks "
        "WHERE source = ? ORDER BY id",
        (source,),
    ).fetchall()

    chunks = []
    for r in rows:
        raw = r["metadata"]
        try:
            metadata = json.loads(raw) if raw else {}
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            metadata = {}
        chunks.append(
            {"text": r["text"], "heading": r["heading"], "metadata": metadata}
        )
    return chunks
334
+
335
def transcript_for_source(self, source: str) -> str:
    """Rebuild a source's transcript from its indexed chunks.

    Read-only. Collects the chunk texts in the order returned by
    :meth:`chunks_for_source` and hands them to
    :func:`~core.knowledge.chunker.stitch_chunks`.

    NOTE(review): the seam de-duplication and the ``""`` result for a
    source without chunks both depend on ``stitch_chunks``, which is not
    visible here — confirm against the chunker module.
    """
    from core.knowledge.chunker import stitch_chunks

    texts = [piece["text"] for piece in self.chunks_for_source(source)]
    return stitch_chunks(texts)
350
+
299
351
  def clear(self) -> None:
300
352
  """Remove all data."""
301
353
  if self._vec_available:
@@ -38,3 +38,28 @@ def load_all_squads(base_dir: str | Path) -> list[Squad]:
38
38
  warnings.warn(f"Failed to load squad: {squad_file}: {e}")
39
39
 
40
40
  return squads
41
+
42
+
43
def load_matrix_squads(squads_dir: str | Path) -> list[Squad]:
    """Load cross-department matrix squads (missions + transversal).

    Only YAML files exactly one directory below ``squads_dir`` are
    considered (e.g. ``squads/missions/*.yaml``,
    ``squads/transversal/*.yaml``), in sorted path order. Files at other
    depths are ignored, so stray YAML elsewhere under the tree is never
    treated as a squad.

    A file that fails to load is reported with a warning and skipped; the
    remaining squads are still returned.
    """
    import warnings

    root = Path(squads_dir)
    loaded: list[Squad] = []
    for definition in sorted(root.glob("*/*.yaml")):
        try:
            squad = load_squad(definition)
        except Exception as exc:
            warnings.warn(f"Failed to load matrix squad: {definition}: {exc}")
            continue
        loaded.append(squad)
    return loaded
@@ -139,12 +139,23 @@ def _find_agent_file(core: Path, name: str, suffix: str) -> Path | None:
139
139
  departments_root = (core / "departments").resolve()
140
140
  if not departments_root.exists():
141
141
  return None
142
- for dept in departments_root.iterdir():
143
- candidate = (dept / "agents" / f"{name}{suffix}").resolve()
144
- try:
145
- candidate.relative_to(departments_root)
146
- except ValueError:
147
- continue
148
- if candidate.exists():
149
- return candidate
142
+ # Top-level agents/<name> first (fast, deterministic), then sub-squad
143
+ # subdirectories (e.g. agents/backend-core/<name>.yaml). `name` is already
144
+ # validated above, so the glob cannot traverse outside the agents tree.
145
+ direct = (departments_root.glob(f"*/agents/{name}{suffix}"))
146
+ nested = (departments_root.glob(f"*/agents/**/{name}{suffix}"))
147
+ for candidate in [*sorted(direct), *sorted(nested)]:
148
+ resolved = _safe_resolve(candidate, departments_root)
149
+ if resolved is not None:
150
+ return resolved
150
151
  return None
152
+
153
+
154
+ def _safe_resolve(candidate: Path, root: Path) -> Path | None:
155
+ """Resolve a candidate path, returning it only if it exists inside root."""
156
+ resolved = candidate.resolve()
157
+ try:
158
+ resolved.relative_to(root)
159
+ except ValueError:
160
+ return None
161
+ return resolved if resolved.exists() else None