arkaos 2.0.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/VERSION +1 -1
  2. package/core/budget/__pycache__/__init__.cpython-313.pyc +0 -0
  3. package/core/budget/__pycache__/manager.cpython-313.pyc +0 -0
  4. package/core/budget/__pycache__/schema.cpython-313.pyc +0 -0
  5. package/core/knowledge/__pycache__/__init__.cpython-313.pyc +0 -0
  6. package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
  7. package/core/knowledge/__pycache__/embedder.cpython-313.pyc +0 -0
  8. package/core/knowledge/__pycache__/indexer.cpython-313.pyc +0 -0
  9. package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
  10. package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
  11. package/core/knowledge/ingest.py +270 -0
  12. package/core/obsidian/__pycache__/__init__.cpython-313.pyc +0 -0
  13. package/core/obsidian/__pycache__/templates.cpython-313.pyc +0 -0
  14. package/core/obsidian/__pycache__/writer.cpython-313.pyc +0 -0
  15. package/core/orchestration/__pycache__/__init__.cpython-313.pyc +0 -0
  16. package/core/orchestration/__pycache__/patterns.cpython-313.pyc +0 -0
  17. package/core/orchestration/__pycache__/protocol.cpython-313.pyc +0 -0
  18. package/core/runtime/__pycache__/subagent.cpython-313.pyc +0 -0
  19. package/core/squads/__pycache__/schema.cpython-313.pyc +0 -0
  20. package/core/synapse/__pycache__/engine.cpython-313.pyc +0 -0
  21. package/core/synapse/__pycache__/layers.cpython-313.pyc +0 -0
  22. package/core/tasks/__pycache__/schema.cpython-313.pyc +0 -0
  23. package/core/workflow/__pycache__/engine.cpython-313.pyc +0 -0
  24. package/core/workflow/__pycache__/schema.cpython-313.pyc +0 -0
  25. package/installer/cli.js +13 -0
  26. package/package.json +1 -1
  27. package/pyproject.toml +12 -1
package/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.3
1
+ 2.1.0
package/core/knowledge/ingest.py ADDED
@@ -0,0 +1,270 @@
1
+ """Knowledge ingest engine — process YouTube, PDF, audio, web, markdown.
2
+
3
+ Downloads, transcribes, extracts text, chunks, embeds, and indexes into
4
+ the vector store. Reports progress via callback for real-time UI updates.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import tempfile
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Callable, Optional
13
+
14
+ from core.knowledge.chunker import chunk_markdown
15
+ from core.knowledge.vector_store import VectorStore
16
+
17
+
18
+ @dataclass
19
+ class IngestResult:
20
+ """Result of an ingest operation."""
21
+ source: str
22
+ source_type: str
23
+ text_length: int = 0
24
+ chunks_created: int = 0
25
+ title: str = ""
26
+ error: str = ""
27
+ success: bool = True
28
+
29
+
30
+ ProgressCallback = Callable[[int, str], None] # (percent, message)
31
+
32
+
33
+ def detect_source_type(source: str) -> str:
34
+ """Auto-detect content type from URL or file extension."""
35
+ source_lower = source.lower()
36
+
37
+ # YouTube URLs
38
+ if any(domain in source_lower for domain in ["youtube.com", "youtu.be"]):
39
+ return "youtube"
40
+
41
+ # Web URLs
42
+ if source_lower.startswith(("http://", "https://")):
43
+ return "web"
44
+
45
+ # File extensions
46
+ ext = Path(source).suffix.lower()
47
+ if ext == ".pdf":
48
+ return "pdf"
49
+ if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"):
50
+ return "audio"
51
+ if ext in (".md", ".txt", ".rst"):
52
+ return "markdown"
53
+
54
+ return "unknown"
55
+
56
+
57
+ class IngestEngine:
58
+ """Processes content from various sources into the vector store."""
59
+
60
+ def __init__(self, store: VectorStore, media_dir: str | Path = "") -> None:
61
+ self._store = store
62
+ self._media_dir = Path(media_dir) if media_dir else Path.home() / ".arkaos" / "media"
63
+ self._media_dir.mkdir(parents=True, exist_ok=True)
64
+
65
+ def ingest(
66
+ self,
67
+ source: str,
68
+ source_type: str = "",
69
+ on_progress: Optional[ProgressCallback] = None,
70
+ metadata: dict | None = None,
71
+ ) -> IngestResult:
72
+ """Ingest content from any supported source.
73
+
74
+ Args:
75
+ source: URL or file path.
76
+ source_type: youtube, pdf, audio, web, markdown. Auto-detected if empty.
77
+ on_progress: Callback(percent, message) for progress updates.
78
+ metadata: Extra metadata to attach to indexed chunks.
79
+ """
80
+ if not source_type:
81
+ source_type = detect_source_type(source)
82
+
83
+ progress = on_progress or (lambda p, m: None)
84
+ progress(0, f"Starting {source_type} ingest...")
85
+
86
+ processors = {
87
+ "youtube": self._process_youtube,
88
+ "pdf": self._process_pdf,
89
+ "audio": self._process_audio,
90
+ "web": self._process_web,
91
+ "markdown": self._process_markdown,
92
+ }
93
+
94
+ processor = processors.get(source_type)
95
+ if not processor:
96
+ return IngestResult(source=source, source_type=source_type, error=f"Unsupported type: {source_type}", success=False)
97
+
98
+ try:
99
+ text, title = processor(source, progress)
100
+ except Exception as e:
101
+ return IngestResult(source=source, source_type=source_type, error=str(e), success=False)
102
+
103
+ if not text or len(text.strip()) < 50:
104
+ return IngestResult(source=source, source_type=source_type, error="Extracted text too short", success=False)
105
+
106
+ # Chunk and index
107
+ progress(75, "Chunking content...")
108
+ chunks = chunk_markdown(text, max_tokens=512, source=source)
109
+
110
+ progress(85, f"Indexing {len(chunks)} chunks...")
111
+ texts = [c.text for c in chunks]
112
+ headings = [c.heading for c in chunks]
113
+ count = self._store.index_chunks(
114
+ texts=texts,
115
+ headings=headings,
116
+ source=source,
117
+ metadata={"type": source_type, "title": title, **(metadata or {})},
118
+ )
119
+
120
+ progress(100, f"Done — {count} chunks indexed")
121
+
122
+ return IngestResult(
123
+ source=source,
124
+ source_type=source_type,
125
+ text_length=len(text),
126
+ chunks_created=count,
127
+ title=title,
128
+ success=True,
129
+ )
130
+
131
+ def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
132
+ """Download YouTube video and transcribe audio."""
133
+ try:
134
+ import yt_dlp
135
+ except ImportError:
136
+ raise RuntimeError("yt-dlp not installed. Run: pip install yt-dlp")
137
+
138
+ progress(5, "Fetching video info...")
139
+
140
+ # Download audio only
141
+ audio_path = str(self._media_dir / "yt_audio.wav")
142
+ ydl_opts = {
143
+ "format": "bestaudio/best",
144
+ "outtmpl": str(self._media_dir / "yt_audio.%(ext)s"),
145
+ "postprocessors": [{
146
+ "key": "FFmpegExtractAudio",
147
+ "preferredcodec": "wav",
148
+ "preferredquality": "16",
149
+ }],
150
+ "quiet": True,
151
+ "no_warnings": True,
152
+ }
153
+
154
+ progress(10, "Downloading audio...")
155
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
156
+ info = ydl.extract_info(url, download=True)
157
+ title = info.get("title", "YouTube Video")
158
+
159
+ progress(35, "Transcribing audio...")
160
+ text = self._transcribe_audio(audio_path)
161
+
162
+ # Cleanup
163
+ try:
164
+ os.remove(audio_path)
165
+ except OSError:
166
+ pass
167
+
168
+ return text, title
169
+
170
+ def _process_pdf(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
171
+ """Extract text from PDF."""
172
+ try:
173
+ import pdfplumber
174
+ except ImportError:
175
+ raise RuntimeError("pdfplumber not installed. Run: pip install pdfplumber")
176
+
177
+ progress(10, "Opening PDF...")
178
+ filepath = Path(path)
179
+ if not filepath.exists():
180
+ raise FileNotFoundError(f"PDF not found: {path}")
181
+
182
+ pages_text = []
183
+ with pdfplumber.open(filepath) as pdf:
184
+ total_pages = len(pdf.pages)
185
+ for i, page in enumerate(pdf.pages):
186
+ text = page.extract_text() or ""
187
+ pages_text.append(text)
188
+ pct = 10 + int((i / total_pages) * 60)
189
+ progress(pct, f"Extracting page {i + 1}/{total_pages}...")
190
+
191
+ title = filepath.stem.replace("-", " ").replace("_", " ")
192
+ return "\n\n".join(pages_text), title
193
+
194
+ def _process_audio(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
195
+ """Transcribe audio file."""
196
+ progress(10, "Loading audio...")
197
+ filepath = Path(path)
198
+ if not filepath.exists():
199
+ raise FileNotFoundError(f"Audio not found: {path}")
200
+
201
+ progress(20, "Transcribing audio...")
202
+ text = self._transcribe_audio(str(filepath))
203
+ title = filepath.stem.replace("-", " ").replace("_", " ")
204
+ return text, title
205
+
206
+ def _process_web(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
207
+ """Scrape web page content."""
208
+ try:
209
+ import requests
210
+ from bs4 import BeautifulSoup
211
+ except ImportError:
212
+ raise RuntimeError("beautifulsoup4 and requests not installed. Run: pip install beautifulsoup4 requests")
213
+
214
+ progress(10, "Fetching page...")
215
+ resp = requests.get(url, timeout=15, headers={
216
+ "User-Agent": "Mozilla/5.0 (ArkaOS Knowledge Indexer)"
217
+ })
218
+ resp.raise_for_status()
219
+
220
+ progress(40, "Parsing content...")
221
+ soup = BeautifulSoup(resp.text, "html.parser")
222
+
223
+ # Remove scripts, styles, nav, footer
224
+ for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
225
+ tag.decompose()
226
+
227
+ # Get title
228
+ title = soup.title.string if soup.title else url
229
+
230
+ # Get main content (article > main > body)
231
+ main = soup.find("article") or soup.find("main") or soup.find("body")
232
+ text = main.get_text(separator="\n\n", strip=True) if main else soup.get_text(separator="\n\n", strip=True)
233
+
234
+ # Clean up whitespace
235
+ text = re.sub(r'\n{3,}', '\n\n', text)
236
+
237
+ return text, title
238
+
239
+ def _process_markdown(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
240
+ """Read markdown/text file directly."""
241
+ progress(10, "Reading file...")
242
+ filepath = Path(path)
243
+ if not filepath.exists():
244
+ raise FileNotFoundError(f"File not found: {path}")
245
+
246
+ text = filepath.read_text(encoding="utf-8")
247
+ title = filepath.stem.replace("-", " ").replace("_", " ")
248
+ return text, title
249
+
250
+ def _transcribe_audio(self, audio_path: str) -> str:
251
+ """Transcribe audio using faster-whisper (or fallback)."""
252
+ try:
253
+ from faster_whisper import WhisperModel
254
+ model = WhisperModel("base", device="cpu", compute_type="int8")
255
+ segments, _ = model.transcribe(audio_path, beam_size=5)
256
+ return " ".join(segment.text for segment in segments)
257
+ except ImportError:
258
+ pass
259
+
260
+ try:
261
+ import whisper
262
+ model = whisper.load_model("base")
263
+ result = model.transcribe(audio_path)
264
+ return result["text"]
265
+ except ImportError:
266
+ raise RuntimeError(
267
+ "No transcription engine available. Install one:\n"
268
+ " pip install faster-whisper (recommended, lighter)\n"
269
+ " pip install openai-whisper (original, heavier)"
270
+ )
package/installer/cli.js CHANGED
@@ -39,6 +39,7 @@ Usage:
39
39
  npx arkaos init Initialize project config (.arkaos.json)
40
40
  npx arkaos update Update to latest version
41
41
  npx arkaos migrate Migrate from v1 to v2
42
+ npx arkaos dashboard Start monitoring dashboard
42
43
  npx arkaos doctor Run health checks
43
44
  npx arkaos uninstall Remove ArkaOS
44
45
 
@@ -98,6 +99,18 @@ async function main() {
98
99
  await migrate();
99
100
  break;
100
101
 
102
+ case "dashboard": {
103
+ const { execSync: execDash } = await import("node:child_process");
104
+ const repoRootDash = dirname(fileURLToPath(import.meta.url)).replace(/\/installer$/, "");
105
+ try {
106
+ execDash(`bash "${repoRootDash}/scripts/start-dashboard.sh"`, {
107
+ stdio: "inherit",
108
+ env: { ...process.env, ARKAOS_ROOT: repoRootDash },
109
+ });
110
+ } catch { process.exit(1); }
111
+ break;
112
+ }
113
+
101
114
  case "index": {
102
115
  const { execSync } = await import("node:child_process");
103
116
  const indexArgs = positionals.slice(1).join(" ");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "arkaos",
3
- "version": "2.0.3",
3
+ "version": "2.1.0",
4
4
  "description": "The Operating System for AI Agent Teams",
5
5
  "type": "module",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "arkaos-core"
3
- version = "2.0.3"
3
+ version = "2.1.0"
4
4
  description = "Core engine for ArkaOS — The Operating System for AI Agent Teams"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -32,6 +32,17 @@ knowledge = [
32
32
  "fastembed>=0.8.0",
33
33
  "sqlite-vss>=0.1.2",
34
34
  ]
35
+ dashboard = [
36
+ "fastapi>=0.115.0",
37
+ "uvicorn>=0.32.0",
38
+ ]
39
+ ingest = [
40
+ "yt-dlp>=2024.0",
41
+ "faster-whisper>=1.0.0",
42
+ "pdfplumber>=0.11.0",
43
+ "beautifulsoup4>=4.12.0",
44
+ "requests>=2.31.0",
45
+ ]
35
46
  dev = [
36
47
  "pytest>=8.0",
37
48
  "pytest-cov>=5.0",