arkaos 3.78.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/VERSION +1 -1
- package/config/agent-allowlists/laravel.yaml +1 -0
- package/config/agent-allowlists/node.yaml +1 -0
- package/config/agent-allowlists/nuxt.yaml +1 -0
- package/config/agent-allowlists/python.yaml +1 -0
- package/core/agents/__pycache__/registry_gen.cpython-313.pyc +0 -0
- package/core/agents/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/agents/registry_gen.py +6 -1
- package/core/agents/schema.py +4 -0
- package/core/cognition/__pycache__/reorganizer.cpython-313.pyc +0 -0
- package/core/cognition/reorganizer.py +37 -7
- package/core/governance/__pycache__/design_system_lint.cpython-313.pyc +0 -0
- package/core/governance/__pycache__/design_system_lint_cli.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/agent_match.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/sources.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
- package/core/knowledge/agent_match.py +114 -0
- package/core/knowledge/chunker.py +45 -0
- package/core/knowledge/ingest.py +156 -78
- package/core/knowledge/sources.py +138 -0
- package/core/knowledge/vector_store.py +52 -0
- package/core/squads/__pycache__/loader.cpython-313.pyc +0 -0
- package/core/squads/loader.py +25 -0
- package/core/sync/__pycache__/agent_provisioner.cpython-313.pyc +0 -0
- package/core/sync/agent_provisioner.py +19 -8
- package/dashboard/app/components/KnowledgeSourcesList.vue +40 -13
- package/dashboard/app/pages/cognition.vue +9 -4
- package/dashboard/app/pages/knowledge/[id].vue +669 -0
- package/dashboard/app/pages/knowledge/index.vue +1281 -0
- package/dashboard/app/types/index.d.ts +1 -1
- package/departments/brand/agents/ux-designer.yaml +15 -1
- package/departments/brand/agents/ux-researcher.yaml +73 -0
- package/departments/brand/agents/ux-strategist.yaml +72 -0
- package/departments/dev/agents/ai-engineering/ai-engineering-lead.yaml +76 -0
- package/departments/dev/agents/architect.yaml +9 -3
- package/departments/dev/agents/backend-core/laravel-eng.yaml +76 -0
- package/departments/dev/agents/backend-core/node-ts-eng.yaml +76 -0
- package/departments/dev/agents/backend-core/python-eng.yaml +76 -0
- package/departments/dev/agents/backend-dev.yaml +10 -4
- package/departments/dev/agents/data-platform/etl-eng.yaml +74 -0
- package/departments/dev/agents/dba.yaml +7 -3
- package/departments/dev/references/backend-knowledge-and-tools.md +70 -0
- package/departments/ecom/agents/retention-manager.yaml +13 -1
- package/departments/leadership/agents/culture-coach.yaml +20 -0
- package/departments/leadership/agents/hr-specialist.yaml +18 -0
- package/departments/leadership/agents/leadership-director.yaml +10 -0
- package/departments/org/agents/chief-of-staff.yaml +76 -0
- package/departments/org/agents/coo.yaml +11 -0
- package/departments/org/agents/okr-steward.yaml +71 -0
- package/departments/org/agents/org-designer.yaml +23 -0
- package/departments/org/skills/okr-cadence/SKILL.md +34 -0
- package/departments/org/skills/principles-audit/SKILL.md +36 -0
- package/departments/pm/agents/pm-director.yaml +21 -8
- package/departments/pm/agents/product-owner.yaml +24 -2
- package/departments/pm/agents/scrum-master.yaml +21 -0
- package/departments/pm/agents/strategic-pm.yaml +72 -0
- package/departments/pm/skills/discovery-plan/SKILL.md +7 -1
- package/departments/quality/agents/cqo.yaml +8 -0
- package/departments/saas/agents/cs-manager.yaml +19 -2
- package/departments/saas/agents/growth-engineer.yaml +14 -1
- package/departments/saas/agents/metrics-analyst.yaml +17 -1
- package/departments/saas/agents/revops-lead.yaml +73 -0
- package/departments/saas/skills/leaky-bucket/SKILL.md +28 -0
- package/departments/saas/skills/voc-loop/SKILL.md +29 -0
- package/departments/sales/agents/sales-director.yaml +9 -0
- package/departments/sales/agents/sdr.yaml +72 -0
- package/departments/strategy/agents/decision-quality.yaml +72 -0
- package/departments/strategy/agents/strategy-director.yaml +13 -0
- package/departments/strategy/skills/premortem/SKILL.md +33 -0
- package/knowledge/agents-registry-v2.json +1218 -78
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/scripts/__pycache__/dashboard-api.cpython-313.pyc +0 -0
- package/scripts/dashboard-api.py +376 -13
- package/dashboard/app/pages/knowledge.vue +0 -918
package/core/knowledge/ingest.py
CHANGED
|
@@ -6,12 +6,14 @@ the vector store. Reports progress via callback for real-time UI updates.
|
|
|
6
6
|
|
|
7
7
|
import os
|
|
8
8
|
import re
|
|
9
|
+
import subprocess
|
|
9
10
|
import tempfile
|
|
10
11
|
from dataclasses import dataclass, field
|
|
11
12
|
from pathlib import Path
|
|
12
13
|
from typing import Callable, Optional
|
|
13
14
|
|
|
14
15
|
from core.knowledge.chunker import chunk_markdown
|
|
16
|
+
from core.knowledge.sources import source_id
|
|
15
17
|
from core.knowledge.vector_store import VectorStore
|
|
16
18
|
|
|
17
19
|
|
|
@@ -25,6 +27,11 @@ class IngestResult:
|
|
|
25
27
|
title: str = ""
|
|
26
28
|
error: str = ""
|
|
27
29
|
success: bool = True
|
|
30
|
+
duration: int = 0
|
|
31
|
+
language: str = ""
|
|
32
|
+
media_path: str = ""
|
|
33
|
+
thumbnail_path: str = ""
|
|
34
|
+
transcript: str = ""
|
|
28
35
|
|
|
29
36
|
|
|
30
37
|
ProgressCallback = Callable[[int, str], None] # (percent, message)
|
|
@@ -38,15 +45,20 @@ def detect_source_type(source: str) -> str:
|
|
|
38
45
|
if any(domain in source_lower for domain in ["youtube.com", "youtu.be"]):
|
|
39
46
|
return "youtube"
|
|
40
47
|
|
|
41
|
-
#
|
|
48
|
+
# Video: a URL or file path ending in a video container extension.
|
|
49
|
+
# Checked *before* the generic web fallback so a non-youtube CDN clip
|
|
50
|
+
# (https://.../clip.mp4) resolves to "video", per PR1 spec Task 2.3.
|
|
51
|
+
ext = Path(source.split("?", 1)[0]).suffix.lower()
|
|
52
|
+
if ext in IngestEngine.VIDEO_EXTS:
|
|
53
|
+
return "video"
|
|
54
|
+
|
|
55
|
+
# Web URLs (no recognised media extension)
|
|
42
56
|
if source_lower.startswith(("http://", "https://")):
|
|
43
57
|
return "web"
|
|
44
58
|
|
|
45
|
-
# File extensions
|
|
46
|
-
ext = Path(source).suffix.lower()
|
|
47
59
|
if ext == ".pdf":
|
|
48
60
|
return "pdf"
|
|
49
|
-
if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac"
|
|
61
|
+
if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac"):
|
|
50
62
|
return "audio"
|
|
51
63
|
if ext in (".md", ".txt", ".rst"):
|
|
52
64
|
return "markdown"
|
|
@@ -57,11 +69,19 @@ def detect_source_type(source: str) -> str:
|
|
|
57
69
|
class IngestEngine:
|
|
58
70
|
"""Processes content from various sources into the vector store."""
|
|
59
71
|
|
|
60
|
-
|
|
72
|
+
VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi")
|
|
73
|
+
|
|
74
|
+
def __init__(self, store: VectorStore, media_dir: str | Path = "", registry=None) -> None:
|
|
61
75
|
self._store = store
|
|
76
|
+
self._registry = registry
|
|
62
77
|
self._media_dir = Path(media_dir) if media_dir else Path.home() / ".arkaos" / "media"
|
|
63
78
|
self._media_dir.mkdir(parents=True, exist_ok=True)
|
|
64
79
|
|
|
80
|
+
@staticmethod
|
|
81
|
+
def detect_source_type(source: str) -> str:
|
|
82
|
+
"""Detect the source type for a URL or file path (class-level alias)."""
|
|
83
|
+
return detect_source_type(source)
|
|
84
|
+
|
|
65
85
|
def ingest(
|
|
66
86
|
self,
|
|
67
87
|
source: str,
|
|
@@ -87,6 +107,7 @@ class IngestEngine:
|
|
|
87
107
|
"youtube": self._process_youtube,
|
|
88
108
|
"pdf": self._process_pdf,
|
|
89
109
|
"audio": self._process_audio,
|
|
110
|
+
"video": self._process_video,
|
|
90
111
|
"web": self._process_web,
|
|
91
112
|
"markdown": self._process_markdown,
|
|
92
113
|
}
|
|
@@ -96,11 +117,13 @@ class IngestEngine:
|
|
|
96
117
|
return IngestResult(source=source, source_type=source_type, error=f"Unsupported type: {source_type}", success=False)
|
|
97
118
|
|
|
98
119
|
try:
|
|
99
|
-
text, title = processor
|
|
120
|
+
text, title, extra = self._invoke_processor(processor, source, progress)
|
|
100
121
|
except Exception as e:
|
|
122
|
+
self._register_failure(source, source_type, str(e))
|
|
101
123
|
return IngestResult(source=source, source_type=source_type, error=str(e), success=False)
|
|
102
124
|
|
|
103
125
|
if not text or len(text.strip()) < 50:
|
|
126
|
+
self._register_failure(source, source_type, "Extracted text too short")
|
|
104
127
|
return IngestResult(source=source, source_type=source_type, error="Extracted text too short", success=False)
|
|
105
128
|
|
|
106
129
|
# Chunk and index
|
|
@@ -110,7 +133,9 @@ class IngestEngine:
|
|
|
110
133
|
|
|
111
134
|
if total_chunks == 0:
|
|
112
135
|
progress(100, "No chunks to index")
|
|
113
|
-
|
|
136
|
+
empty = self._make_result(source, source_type, text, title, 0, extra)
|
|
137
|
+
self._register_success(empty)
|
|
138
|
+
return empty
|
|
114
139
|
|
|
115
140
|
# Index in batches with granular progress (85→99%)
|
|
116
141
|
texts = [c.text for c in chunks]
|
|
@@ -149,100 +174,153 @@ class IngestEngine:
|
|
|
149
174
|
except Exception:
|
|
150
175
|
pass
|
|
151
176
|
|
|
177
|
+
result = self._make_result(source, source_type, text, title, count, extra)
|
|
178
|
+
self._register_success(result)
|
|
179
|
+
return result
|
|
180
|
+
|
|
181
|
+
@staticmethod
|
|
182
|
+
def _invoke_processor(
|
|
183
|
+
processor: Callable, source: str, progress: ProgressCallback
|
|
184
|
+
) -> tuple[str, str, dict]:
|
|
185
|
+
"""Call a processor, normalizing 2-tuple and 3-tuple returns."""
|
|
186
|
+
out = processor(source, progress)
|
|
187
|
+
if len(out) == 3:
|
|
188
|
+
return out[0], out[1], out[2] or {}
|
|
189
|
+
return out[0], out[1], {}
|
|
190
|
+
|
|
191
|
+
@staticmethod
|
|
192
|
+
def _make_result(
|
|
193
|
+
source: str, source_type: str, text: str, title: str,
|
|
194
|
+
count: int, extra: dict,
|
|
195
|
+
) -> IngestResult:
|
|
196
|
+
"""Assemble a successful IngestResult including media metadata."""
|
|
152
197
|
return IngestResult(
|
|
153
|
-
source=source,
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
198
|
+
source=source, source_type=source_type, text_length=len(text),
|
|
199
|
+
chunks_created=count, title=title, success=True, transcript=text,
|
|
200
|
+
duration=int(extra.get("duration", 0)),
|
|
201
|
+
language=extra.get("language", ""),
|
|
202
|
+
media_path=extra.get("media_path", ""),
|
|
203
|
+
thumbnail_path=extra.get("thumbnail_path", ""),
|
|
159
204
|
)
|
|
160
205
|
|
|
161
|
-
def
|
|
162
|
-
"""
|
|
206
|
+
def _register_success(self, result: IngestResult) -> None:
|
|
207
|
+
"""Persist a successful ingest to the source registry, if present."""
|
|
208
|
+
if self._registry is None:
|
|
209
|
+
return
|
|
210
|
+
self._registry.upsert(
|
|
211
|
+
result.source, type=result.source_type, title=result.title,
|
|
212
|
+
duration=result.duration, language=result.language,
|
|
213
|
+
thumbnail_path=result.thumbnail_path, media_path=result.media_path,
|
|
214
|
+
transcript=result.transcript, chunk_count=result.chunks_created,
|
|
215
|
+
status="ready",
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
def _register_failure(self, source: str, stype: str, error: str) -> None:
|
|
219
|
+
"""Persist a failed ingest to the source registry, if present."""
|
|
220
|
+
if self._registry is None:
|
|
221
|
+
return
|
|
222
|
+
self._registry.upsert(source, type=stype, status="failed", error=error)
|
|
163
223
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
Phase
|
|
168
|
-
Phase
|
|
169
|
-
Phase
|
|
224
|
+
def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str, dict]:
|
|
225
|
+
"""Download a YouTube video (kept as media) and transcribe it.
|
|
226
|
+
|
|
227
|
+
Phase 1: Fetch info (title, duration, language, thumbnail).
|
|
228
|
+
Phase 2: Download best video+audio merged to mp4 (kept as media).
|
|
229
|
+
Phase 3: Extract a WAV audio track for transcription.
|
|
230
|
+
Phase 4: Transcribe. Returns (text, title, extra-metadata).
|
|
170
231
|
"""
|
|
171
232
|
try:
|
|
172
|
-
import yt_dlp
|
|
233
|
+
import yt_dlp # noqa: F401
|
|
173
234
|
except ImportError:
|
|
174
235
|
raise RuntimeError("yt-dlp not installed. Run: pip install yt-dlp")
|
|
175
236
|
|
|
176
|
-
# === Phase 1: Fetch video info ===
|
|
177
237
|
progress(2, "Phase 1/4 — Fetching video info...")
|
|
238
|
+
info = self._youtube_info(url, progress)
|
|
239
|
+
title = info.get("title", "YouTube Video")
|
|
240
|
+
|
|
241
|
+
progress(8, "Phase 2/4 — Downloading video...")
|
|
242
|
+
video_path = self._download_video(url, progress)
|
|
243
|
+
|
|
244
|
+
progress(40, "Phase 3/4 — Extracting audio from video...")
|
|
245
|
+
audio_path = self._extract_audio(video_path)
|
|
246
|
+
|
|
247
|
+
progress(50, "Phase 4/4 — Transcribing audio (this may take a while)...")
|
|
248
|
+
text = self._transcribe_audio(str(audio_path))
|
|
249
|
+
if not text or len(text.strip()) < 20:
|
|
250
|
+
raise RuntimeError("Transcription produced no usable text")
|
|
251
|
+
progress(70, f"Phase 4/4 — Transcribed: {len(text.split())} words")
|
|
252
|
+
|
|
253
|
+
return text, title, self._youtube_extra(info, video_path)
|
|
254
|
+
|
|
255
|
+
@staticmethod
|
|
256
|
+
def _youtube_extra(info: dict, video_path: Path) -> dict:
|
|
257
|
+
"""Build the extra-metadata dict from yt-dlp info + saved video."""
|
|
258
|
+
return {
|
|
259
|
+
"duration": int(info.get("duration") or 0),
|
|
260
|
+
"language": info.get("language") or "",
|
|
261
|
+
"thumbnail_path": info.get("thumbnail") or "",
|
|
262
|
+
"media_path": str(video_path),
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
def _youtube_info(self, url: str, progress: ProgressCallback) -> dict:
|
|
266
|
+
"""Fetch YouTube metadata without downloading."""
|
|
267
|
+
import yt_dlp
|
|
178
268
|
try:
|
|
179
269
|
with yt_dlp.YoutubeDL({"quiet": True, "no_warnings": True}) as ydl:
|
|
180
270
|
info = ydl.extract_info(url, download=False)
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
progress(5, f"Phase 1/4 — Found: {title} ({duration}s)")
|
|
271
|
+
progress(5, f"Phase 1/4 — Found: {info.get('title')}")
|
|
272
|
+
return info
|
|
184
273
|
except Exception as e:
|
|
185
274
|
raise RuntimeError(f"YouTube access failed: {str(e)[:200]}")
|
|
186
275
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
276
|
+
def _download_video(self, url: str, progress: ProgressCallback) -> Path:
|
|
277
|
+
"""Download best video+audio merged to mp4, keyed by stable id."""
|
|
278
|
+
import yt_dlp
|
|
279
|
+
stable_id = source_id(url)
|
|
280
|
+
out = self._media_dir / stable_id
|
|
190
281
|
ydl_opts = {
|
|
191
|
-
"format": "bestaudio/best",
|
|
192
|
-
"
|
|
193
|
-
"
|
|
194
|
-
"key": "FFmpegExtractAudio",
|
|
195
|
-
"preferredcodec": "wav",
|
|
196
|
-
"preferredquality": "16",
|
|
197
|
-
}],
|
|
282
|
+
"format": "bestvideo*+bestaudio/best",
|
|
283
|
+
"merge_output_format": "mp4",
|
|
284
|
+
"outtmpl": str(out) + ".%(ext)s",
|
|
198
285
|
"quiet": True,
|
|
199
286
|
"no_warnings": True,
|
|
200
|
-
"progress_hooks": [
|
|
201
|
-
8 + int((d.get("downloaded_bytes", 0) / max(d.get("total_bytes", 1), 1)) * 17),
|
|
202
|
-
f"Phase 2/4 — Downloading... {d.get('_percent_str', '').strip()}"
|
|
203
|
-
) if d.get("status") == "downloading" else None],
|
|
287
|
+
"progress_hooks": [self._dl_hook(progress)],
|
|
204
288
|
}
|
|
205
|
-
|
|
206
289
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
207
290
|
ydl.extract_info(url, download=True)
|
|
291
|
+
return self._media_dir / f"{stable_id}.mp4"
|
|
292
|
+
|
|
293
|
+
@staticmethod
|
|
294
|
+
def _dl_hook(progress: ProgressCallback) -> Callable:
|
|
295
|
+
"""Build a yt-dlp progress hook mapping download % to 8-38%."""
|
|
296
|
+
def hook(d: dict) -> None:
|
|
297
|
+
if d.get("status") != "downloading":
|
|
298
|
+
return
|
|
299
|
+
ratio = d.get("downloaded_bytes", 0) / max(d.get("total_bytes", 1), 1)
|
|
300
|
+
progress(8 + int(ratio * 30),
|
|
301
|
+
f"Phase 2/4 — Downloading... {d.get('_percent_str', '').strip()}")
|
|
302
|
+
return hook
|
|
303
|
+
|
|
304
|
+
def _extract_audio(self, video_path: Path) -> Path:
|
|
305
|
+
"""Extract a 16kHz mono WAV track from a video via ffmpeg."""
|
|
306
|
+
audio_path = video_path.with_suffix(".wav")
|
|
307
|
+
subprocess.run(
|
|
308
|
+
["ffmpeg", "-y", "-i", str(video_path), "-vn",
|
|
309
|
+
"-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
|
|
310
|
+
str(audio_path)],
|
|
311
|
+
check=True, capture_output=True,
|
|
312
|
+
)
|
|
313
|
+
return audio_path
|
|
208
314
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
audio_path = alt
|
|
219
|
-
break
|
|
220
|
-
else:
|
|
221
|
-
raise RuntimeError("Audio extraction failed — no output file found")
|
|
222
|
-
|
|
223
|
-
audio_size_mb = os.path.getsize(audio_path) / (1024 * 1024)
|
|
224
|
-
progress(35, f"Phase 3/4 — Audio extracted ({audio_size_mb:.1f} MB)")
|
|
225
|
-
|
|
226
|
-
# === Phase 4: Transcribe audio ===
|
|
227
|
-
progress(38, "Phase 4/4 — Transcribing audio (this may take a while)...")
|
|
228
|
-
text = self._transcribe_audio(audio_path)
|
|
229
|
-
|
|
230
|
-
if not text or len(text.strip()) < 20:
|
|
231
|
-
raise RuntimeError("Transcription produced no usable text")
|
|
232
|
-
|
|
233
|
-
word_count = len(text.split())
|
|
234
|
-
progress(70, f"Phase 4/4 — Transcribed: {word_count} words")
|
|
235
|
-
|
|
236
|
-
# Rename audio to include title for easy identification
|
|
237
|
-
safe_title = "".join(c if c.isalnum() or c in " -_" else "" for c in title)[:50].strip()
|
|
238
|
-
final_audio = self._media_dir / f"{safe_title}.wav"
|
|
239
|
-
try:
|
|
240
|
-
import shutil
|
|
241
|
-
shutil.move(audio_path, str(final_audio))
|
|
242
|
-
except Exception:
|
|
243
|
-
final_audio = Path(audio_path)
|
|
244
|
-
|
|
245
|
-
return text, title
|
|
315
|
+
def _process_video(self, path: str, progress: ProgressCallback) -> tuple[str, str, dict]:
|
|
316
|
+
"""Ingest a local video file; the video itself is the media."""
|
|
317
|
+
filepath = Path(path)
|
|
318
|
+
if not filepath.exists():
|
|
319
|
+
raise FileNotFoundError(f"Video not found: {path}")
|
|
320
|
+
progress(30, "Transcribing video...")
|
|
321
|
+
text = self._transcribe_audio(str(filepath))
|
|
322
|
+
title = filepath.stem.replace("-", " ").replace("_", " ")
|
|
323
|
+
return text, title, {"media_path": str(filepath)}
|
|
246
324
|
|
|
247
325
|
def _process_pdf(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
|
|
248
326
|
"""Extract text from PDF."""
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Source registry for the knowledge base.
|
|
2
|
+
|
|
3
|
+
Stores rich per-source metadata (title, duration, media path, transcript,
|
|
4
|
+
thumbnail, status) in a dedicated ``sources`` table living inside the same
|
|
5
|
+
``knowledge.db`` as the vector store. This module is purely additive: it
|
|
6
|
+
never touches the ``chunks`` table owned by :class:`VectorStore`.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import sqlite3
|
|
12
|
+
import threading
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
_SCHEMA = """
|
|
17
|
+
CREATE TABLE IF NOT EXISTS sources (
|
|
18
|
+
id TEXT PRIMARY KEY,
|
|
19
|
+
source TEXT NOT NULL,
|
|
20
|
+
type TEXT DEFAULT '',
|
|
21
|
+
title TEXT DEFAULT '',
|
|
22
|
+
duration INTEGER DEFAULT 0,
|
|
23
|
+
language TEXT DEFAULT '',
|
|
24
|
+
thumbnail_path TEXT DEFAULT '',
|
|
25
|
+
media_path TEXT DEFAULT '',
|
|
26
|
+
transcript TEXT DEFAULT '',
|
|
27
|
+
chunk_count INTEGER DEFAULT 0,
|
|
28
|
+
status TEXT DEFAULT 'pending',
|
|
29
|
+
error TEXT DEFAULT '',
|
|
30
|
+
created_at REAL DEFAULT (unixepoch('now')),
|
|
31
|
+
updated_at REAL DEFAULT (unixepoch('now'))
|
|
32
|
+
)
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
_COLUMNS = (
|
|
36
|
+
"id", "source", "type", "title", "duration", "language",
|
|
37
|
+
"thumbnail_path", "media_path", "transcript", "chunk_count",
|
|
38
|
+
"status", "error", "created_at", "updated_at",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def source_id(source: str) -> str:
|
|
43
|
+
"""Return a stable id for a source string: ``src-`` + sha1[:12]."""
|
|
44
|
+
digest = hashlib.sha1(source.encode("utf-8")).hexdigest()
|
|
45
|
+
return f"src-{digest[:12]}"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SourceRegistry:
|
|
49
|
+
"""SQLite-backed registry of knowledge sources and their metadata."""
|
|
50
|
+
|
|
51
|
+
def __init__(self, db_path: str | Path = "") -> None:
|
|
52
|
+
"""Open (or create) the sources table in the knowledge database."""
|
|
53
|
+
self._db_path = str(db_path) if db_path else self._default_path()
|
|
54
|
+
self._lock = threading.Lock()
|
|
55
|
+
self._conn = sqlite3.connect(self._db_path, check_same_thread=False)
|
|
56
|
+
self._conn.execute("PRAGMA journal_mode=WAL")
|
|
57
|
+
self._conn.execute(_SCHEMA)
|
|
58
|
+
self._conn.commit()
|
|
59
|
+
|
|
60
|
+
@staticmethod
|
|
61
|
+
def _default_path() -> str:
|
|
62
|
+
home = Path.home() / ".arkaos"
|
|
63
|
+
home.mkdir(parents=True, exist_ok=True)
|
|
64
|
+
return str(home / "knowledge.db")
|
|
65
|
+
|
|
66
|
+
def upsert(
|
|
67
|
+
self,
|
|
68
|
+
source: str,
|
|
69
|
+
*,
|
|
70
|
+
type: str = "",
|
|
71
|
+
title: str = "",
|
|
72
|
+
duration: int = 0,
|
|
73
|
+
language: str = "",
|
|
74
|
+
thumbnail_path: str = "",
|
|
75
|
+
media_path: str = "",
|
|
76
|
+
transcript: str = "",
|
|
77
|
+
chunk_count: int = 0,
|
|
78
|
+
status: str = "ready",
|
|
79
|
+
error: str = "",
|
|
80
|
+
) -> str:
|
|
81
|
+
"""Insert or replace a source row by id; return its stable id."""
|
|
82
|
+
sid = source_id(source)
|
|
83
|
+
params = (
|
|
84
|
+
sid, source, type, title, duration, language, thumbnail_path,
|
|
85
|
+
media_path, transcript, chunk_count, status, error, sid,
|
|
86
|
+
)
|
|
87
|
+
with self._lock:
|
|
88
|
+
self._conn.execute(self._upsert_sql(), params)
|
|
89
|
+
self._conn.commit()
|
|
90
|
+
return sid
|
|
91
|
+
|
|
92
|
+
@staticmethod
|
|
93
|
+
def _upsert_sql() -> str:
|
|
94
|
+
"""SQL that preserves created_at on update via a COALESCE subquery."""
|
|
95
|
+
return (
|
|
96
|
+
"INSERT OR REPLACE INTO sources "
|
|
97
|
+
"(id, source, type, title, duration, language, thumbnail_path, "
|
|
98
|
+
"media_path, transcript, chunk_count, status, error, "
|
|
99
|
+
"created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, "
|
|
100
|
+
"?, ?, COALESCE((SELECT created_at FROM sources WHERE id = ?), "
|
|
101
|
+
"unixepoch('now')), unixepoch('now'))"
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
def get(self, source_id_: str) -> Optional[dict]:
|
|
105
|
+
"""Return the full row for a source id as a dict, or None."""
|
|
106
|
+
row = self._conn.execute(
|
|
107
|
+
"SELECT * FROM sources WHERE id = ?", (source_id_,)
|
|
108
|
+
).fetchone()
|
|
109
|
+
return self._row_to_dict(row) if row else None
|
|
110
|
+
|
|
111
|
+
def get_by_source(self, source: str) -> Optional[dict]:
|
|
112
|
+
"""Return the row matching a raw source string, or None."""
|
|
113
|
+
return self.get(source_id(source))
|
|
114
|
+
|
|
115
|
+
def list(self) -> list[dict]:
|
|
116
|
+
"""Return all source rows, newest updated first."""
|
|
117
|
+
rows = self._conn.execute(
|
|
118
|
+
"SELECT * FROM sources ORDER BY updated_at DESC"
|
|
119
|
+
).fetchall()
|
|
120
|
+
return [self._row_to_dict(r) for r in rows]
|
|
121
|
+
|
|
122
|
+
def delete(self, source_id_: str) -> bool:
|
|
123
|
+
"""Delete a source row; return True if a row was removed."""
|
|
124
|
+
with self._lock:
|
|
125
|
+
cur = self._conn.execute(
|
|
126
|
+
"DELETE FROM sources WHERE id = ?", (source_id_,)
|
|
127
|
+
)
|
|
128
|
+
self._conn.commit()
|
|
129
|
+
return cur.rowcount > 0
|
|
130
|
+
|
|
131
|
+
@staticmethod
|
|
132
|
+
def _row_to_dict(row: tuple) -> dict:
|
|
133
|
+
"""Map a SELECT * tuple to a column-keyed dict."""
|
|
134
|
+
return dict(zip(_COLUMNS, row))
|
|
135
|
+
|
|
136
|
+
def close(self) -> None:
|
|
137
|
+
"""Close the database connection."""
|
|
138
|
+
self._conn.close()
|
|
@@ -296,6 +296,58 @@ class VectorStore:
|
|
|
296
296
|
).fetchall()
|
|
297
297
|
return [{"source": r["source"], "chunks": int(r["chunks"])} for r in rows]
|
|
298
298
|
|
|
299
|
+
def distinct_sources(self) -> list[str]:
|
|
300
|
+
"""Return the distinct non-empty source strings, noisiest first.
|
|
301
|
+
|
|
302
|
+
Read-only reverse-lookup helper: the dashboard only has a
|
|
303
|
+
sha1-based source_id and must recover the raw source string to
|
|
304
|
+
serve chunks-only (pre-registry) sources. Reuses the same SELECT
|
|
305
|
+
shape as :meth:`list_sources`.
|
|
306
|
+
"""
|
|
307
|
+
rows = self._db.execute(
|
|
308
|
+
"SELECT source, COUNT(*) AS chunks FROM chunks "
|
|
309
|
+
"WHERE source IS NOT NULL AND source != '' "
|
|
310
|
+
"GROUP BY source ORDER BY chunks DESC"
|
|
311
|
+
).fetchall()
|
|
312
|
+
return [r["source"] for r in rows]
|
|
313
|
+
|
|
314
|
+
def chunks_for_source(self, source: str) -> list[dict]:
|
|
315
|
+
"""Return all chunks for a source as text/heading/metadata dicts.
|
|
316
|
+
|
|
317
|
+
Ordered by ``id`` ASC (insertion / ingest order) so callers that
|
|
318
|
+
re-join the text — e.g. :meth:`transcript_for_source` — read the
|
|
319
|
+
chunks back in their original sequence.
|
|
320
|
+
"""
|
|
321
|
+
rows = self._db.execute(
|
|
322
|
+
"SELECT text, heading, metadata FROM chunks "
|
|
323
|
+
"WHERE source = ? ORDER BY id",
|
|
324
|
+
(source,),
|
|
325
|
+
).fetchall()
|
|
326
|
+
return [
|
|
327
|
+
{
|
|
328
|
+
"text": r["text"],
|
|
329
|
+
"heading": r["heading"],
|
|
330
|
+
"metadata": json.loads(r["metadata"]) if r["metadata"] else {},
|
|
331
|
+
}
|
|
332
|
+
for r in rows
|
|
333
|
+
]
|
|
334
|
+
|
|
335
|
+
def transcript_for_source(self, source: str) -> str:
|
|
336
|
+
"""Reconstruct a source's transcript from its indexed chunks.
|
|
337
|
+
|
|
338
|
+
Read-only. Joins the chunk texts (in ingest order, via
|
|
339
|
+
:meth:`chunks_for_source`) via :func:`~core.knowledge.chunker.stitch_chunks`,
|
|
340
|
+
which dedupes the token-overlap window the chunker keeps between
|
|
341
|
+
consecutive chunks so the seams don't repeat ~50 tokens of text.
|
|
342
|
+
Returns "" when the source has no chunks. Used to surface a transcript
|
|
343
|
+
for legacy sources ingested before the SourceRegistry, which have
|
|
344
|
+
chunks but no stored transcript.
|
|
345
|
+
"""
|
|
346
|
+
from core.knowledge.chunker import stitch_chunks
|
|
347
|
+
|
|
348
|
+
chunks = self.chunks_for_source(source)
|
|
349
|
+
return stitch_chunks([c["text"] for c in chunks])
|
|
350
|
+
|
|
299
351
|
def clear(self) -> None:
|
|
300
352
|
"""Remove all data."""
|
|
301
353
|
if self._vec_available:
|
|
Binary file
|
package/core/squads/loader.py
CHANGED
|
@@ -38,3 +38,28 @@ def load_all_squads(base_dir: str | Path) -> list[Squad]:
|
|
|
38
38
|
warnings.warn(f"Failed to load squad: {squad_file}: {e}")
|
|
39
39
|
|
|
40
40
|
return squads
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load_matrix_squads(squads_dir: str | Path) -> list[Squad]:
|
|
44
|
+
"""Load cross-department matrix squads (missions + transversal).
|
|
45
|
+
|
|
46
|
+
These implement "Autonomy by Missions, not Departments": stream-aligned
|
|
47
|
+
mission squads that own an outcome end-to-end, and transversal
|
|
48
|
+
platform/enabling squads (RevOps, People & Org, Governance). Members are
|
|
49
|
+
borrowed from their home departments — agents keep their department home.
|
|
50
|
+
|
|
51
|
+
Discovers one level of categorised subdirectories (e.g.
|
|
52
|
+
squads/missions/*.yaml, squads/transversal/*.yaml) — not arbitrary depth,
|
|
53
|
+
so stray YAML elsewhere under the tree is never mistaken for a squad.
|
|
54
|
+
"""
|
|
55
|
+
squads_dir = Path(squads_dir)
|
|
56
|
+
squads = []
|
|
57
|
+
|
|
58
|
+
for squad_file in sorted(squads_dir.glob("*/*.yaml")):
|
|
59
|
+
try:
|
|
60
|
+
squads.append(load_squad(squad_file))
|
|
61
|
+
except Exception as e:
|
|
62
|
+
import warnings
|
|
63
|
+
warnings.warn(f"Failed to load matrix squad: {squad_file}: {e}")
|
|
64
|
+
|
|
65
|
+
return squads
|
|
Binary file
|
|
@@ -139,12 +139,23 @@ def _find_agent_file(core: Path, name: str, suffix: str) -> Path | None:
|
|
|
139
139
|
departments_root = (core / "departments").resolve()
|
|
140
140
|
if not departments_root.exists():
|
|
141
141
|
return None
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
142
|
+
# Top-level agents/<name> first (fast, deterministic), then sub-squad
|
|
143
|
+
# subdirectories (e.g. agents/backend-core/<name>.yaml). `name` is already
|
|
144
|
+
# validated above, so the glob cannot traverse outside the agents tree.
|
|
145
|
+
direct = (departments_root.glob(f"*/agents/{name}{suffix}"))
|
|
146
|
+
nested = (departments_root.glob(f"*/agents/**/{name}{suffix}"))
|
|
147
|
+
for candidate in [*sorted(direct), *sorted(nested)]:
|
|
148
|
+
resolved = _safe_resolve(candidate, departments_root)
|
|
149
|
+
if resolved is not None:
|
|
150
|
+
return resolved
|
|
150
151
|
return None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _safe_resolve(candidate: Path, root: Path) -> Path | None:
|
|
155
|
+
"""Resolve a candidate path, returning it only if it exists inside root."""
|
|
156
|
+
resolved = candidate.resolve()
|
|
157
|
+
try:
|
|
158
|
+
resolved.relative_to(root)
|
|
159
|
+
except ValueError:
|
|
160
|
+
return None
|
|
161
|
+
return resolved if resolved.exists() else None
|