arkaos 2.0.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/VERSION +1 -1
- package/core/budget/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/budget/__pycache__/manager.cpython-313.pyc +0 -0
- package/core/budget/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/embedder.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/indexer.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
- package/core/knowledge/ingest.py +270 -0
- package/core/obsidian/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/obsidian/__pycache__/templates.cpython-313.pyc +0 -0
- package/core/obsidian/__pycache__/writer.cpython-313.pyc +0 -0
- package/core/orchestration/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/orchestration/__pycache__/patterns.cpython-313.pyc +0 -0
- package/core/orchestration/__pycache__/protocol.cpython-313.pyc +0 -0
- package/core/runtime/__pycache__/subagent.cpython-313.pyc +0 -0
- package/core/squads/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/synapse/__pycache__/engine.cpython-313.pyc +0 -0
- package/core/synapse/__pycache__/layers.cpython-313.pyc +0 -0
- package/core/tasks/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/workflow/__pycache__/engine.cpython-313.pyc +0 -0
- package/core/workflow/__pycache__/schema.cpython-313.pyc +0 -0
- package/installer/cli.js +13 -0
- package/package.json +1 -1
- package/pyproject.toml +12 -1
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2.0.3
|
|
1
|
+
2.1.0
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Knowledge ingest engine — process YouTube, PDF, audio, web, markdown.
|
|
2
|
+
|
|
3
|
+
Downloads, transcribes, extracts text, chunks, embeds, and indexes into
|
|
4
|
+
the vector store. Reports progress via callback for real-time UI updates.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import tempfile
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Callable, Optional
|
|
13
|
+
|
|
14
|
+
from core.knowledge.chunker import chunk_markdown
|
|
15
|
+
from core.knowledge.vector_store import VectorStore
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class IngestResult:
    """Outcome of a single ingest run, successful or not.

    Only ``source`` and ``source_type`` are required; the remaining fields
    default to an "empty success" and are filled in by whoever builds the
    result.
    """

    # URL or file path that was ingested.
    source: str
    # Type label the source was processed as (e.g. "pdf", "web").
    source_type: str
    # Character count of the extracted text.
    text_length: int = 0
    # Number of chunks reported as indexed.
    chunks_created: int = 0
    # Human-readable title of the content.
    title: str = ""
    # Failure description; left empty on success.
    error: str = ""
    # False when the ingest did not complete.
    success: bool = True
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
ProgressCallback = Callable[[int, str], None] # (percent, message)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def detect_source_type(source: str) -> str:
    """Auto-detect content type from a URL or file extension.

    YouTube is recognised from the host name (``youtube.com``, ``youtu.be``
    and their subdomains) rather than by a substring search, so a web page
    that merely mentions YouTube in its query string, or a local file whose
    name contains ``youtube.com``, is not misclassified.

    Args:
        source: URL or file path. YouTube links are also accepted without a
            scheme (e.g. ``youtu.be/abc``).

    Returns:
        One of ``"youtube"``, ``"web"``, ``"pdf"``, ``"audio"``,
        ``"markdown"`` or ``"unknown"``.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    source_lower = source.lower()
    is_http = source_lower.startswith(("http://", "https://"))

    # Prefixing "//" makes urlsplit treat the first segment of a scheme-less
    # string as the network location, so "youtu.be/abc" still yields a host.
    candidate = source_lower if is_http else f"//{source_lower}"
    try:
        host = urlsplit(candidate).hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): not YouTube.
        host = ""

    # YouTube URLs
    youtube_domains = ("youtube.com", "youtu.be")
    if any(host == domain or host.endswith(f".{domain}") for domain in youtube_domains):
        return "youtube"

    # Web URLs
    if is_http:
        return "web"

    # File extensions
    ext = Path(source).suffix.lower()
    if ext == ".pdf":
        return "pdf"
    if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"):
        return "audio"
    if ext in (".md", ".txt", ".rst"):
        return "markdown"

    return "unknown"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class IngestEngine:
    """Extract text from a source and index it into the vector store.

    ``ingest`` is the entry point. It dispatches to one ``_process_*`` method
    per source type (YouTube, PDF, audio, web, markdown). Each processor
    returns a ``(text, title)`` pair, which is then chunked and handed to the
    store.
    """

    # Extracted text shorter than this (after stripping) is reported as a failure.
    _MIN_TEXT_CHARS = 50
    # Chunk size limit passed to ``chunk_markdown``.
    _CHUNK_MAX_TOKENS = 512

    def __init__(self, store: VectorStore, media_dir: str | Path = "") -> None:
        """Bind the engine to a store and ensure the media directory exists.

        Args:
            store: Vector store that receives the indexed chunks.
            media_dir: Directory used for downloaded media. Falls back to
                ``~/.arkaos/media`` when empty.
        """
        self._store = store
        self._media_dir = Path(media_dir) if media_dir else Path.home() / ".arkaos" / "media"
        self._media_dir.mkdir(parents=True, exist_ok=True)

    def ingest(
        self,
        source: str,
        source_type: str = "",
        on_progress: Optional[ProgressCallback] = None,
        metadata: dict | None = None,
    ) -> IngestResult:
        """Ingest content from any supported source.

        Extraction failures are reported through the returned result rather
        than raised. Exceptions raised while chunking or indexing are not
        caught here and propagate to the caller.

        Args:
            source: URL or file path.
            source_type: youtube, pdf, audio, web, markdown. Auto-detected if empty.
            on_progress: Callback(percent, message) for progress updates.
            metadata: Extra metadata to attach to indexed chunks. Keys given
                here override the built-in ``type`` and ``title`` entries.

        Returns:
            An ``IngestResult``; ``success`` is False and ``error`` is set when
            the type is unsupported, extraction raised, or too little text
            was extracted.
        """
        if not source_type:
            source_type = detect_source_type(source)

        progress = on_progress or (lambda p, m: None)
        progress(0, f"Starting {source_type} ingest...")

        processors = {
            "youtube": self._process_youtube,
            "pdf": self._process_pdf,
            "audio": self._process_audio,
            "web": self._process_web,
            "markdown": self._process_markdown,
        }

        processor = processors.get(source_type)
        if processor is None:
            return self._failure(source, source_type, f"Unsupported type: {source_type}")

        try:
            text, title = processor(source, progress)
        except Exception as e:
            # Boundary handler: any extraction problem becomes a failed result.
            # Some exceptions carry no message, so fall back to the class name
            # to avoid returning a failure with an empty error string.
            return self._failure(source, source_type, str(e) or type(e).__name__)

        if not text or len(text.strip()) < self._MIN_TEXT_CHARS:
            return self._failure(source, source_type, "Extracted text too short")

        # Chunk and index
        progress(75, "Chunking content...")
        chunks = chunk_markdown(text, max_tokens=self._CHUNK_MAX_TOKENS, source=source)

        progress(85, f"Indexing {len(chunks)} chunks...")
        count = self._store.index_chunks(
            texts=[c.text for c in chunks],
            headings=[c.heading for c in chunks],
            source=source,
            metadata={"type": source_type, "title": title, **(metadata or {})},
        )

        progress(100, f"Done — {count} chunks indexed")

        return IngestResult(
            source=source,
            source_type=source_type,
            text_length=len(text),
            chunks_created=count,
            title=title,
            success=True,
        )

    @staticmethod
    def _failure(source: str, source_type: str, error: str) -> IngestResult:
        """Build the failed ``IngestResult`` for ``source`` with ``error``."""
        return IngestResult(source=source, source_type=source_type, error=error, success=False)

    def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
        """Download a YouTube video's audio track and transcribe it.

        The audio is written to a fresh temporary directory under the media
        directory, so parallel ingests do not overwrite each other's file, and
        the directory is removed even when download or transcription fails.

        Returns:
            ``(transcript, title)``; the title falls back to
            ``"YouTube Video"`` when none is reported.

        Raises:
            RuntimeError: If yt-dlp is not installed or no audio file was
                produced.
        """
        try:
            import yt_dlp
        except ImportError as err:
            raise RuntimeError("yt-dlp not installed. Run: pip install yt-dlp") from err

        progress(5, "Fetching video info...")

        # Cleanup stays best-effort (ignore_cleanup_errors), as before.
        with tempfile.TemporaryDirectory(
            prefix="yt_", dir=self._media_dir, ignore_cleanup_errors=True
        ) as workdir:
            work = Path(workdir)
            audio_path = work / "yt_audio.wav"

            # Download audio only
            ydl_opts = {
                "format": "bestaudio/best",
                "outtmpl": str(work / "yt_audio.%(ext)s"),
                "postprocessors": [{
                    "key": "FFmpegExtractAudio",
                    "preferredcodec": "wav",
                    "preferredquality": "16",
                }],
                "quiet": True,
                "no_warnings": True,
            }

            progress(10, "Downloading audio...")
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)

            # Guard against a missing info dict or a None title.
            title = (info or {}).get("title") or "YouTube Video"

            if not audio_path.exists():
                raise RuntimeError(f"Audio download produced no file for: {url}")

            progress(35, "Transcribing audio...")
            text = self._transcribe_audio(str(audio_path))

        return text, title

    def _process_pdf(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Extract text from a PDF, page by page.

        Returns:
            ``(text, title)`` where pages are joined by blank lines and the
            title is the file stem with ``-`` and ``_`` turned into spaces.

        Raises:
            RuntimeError: If pdfplumber is not installed.
            FileNotFoundError: If ``path`` does not exist.
        """
        try:
            import pdfplumber
        except ImportError as err:
            raise RuntimeError("pdfplumber not installed. Run: pip install pdfplumber") from err

        progress(10, "Opening PDF...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"PDF not found: {path}")

        pages_text = []
        with pdfplumber.open(filepath) as pdf:
            total_pages = len(pdf.pages)
            for i, page in enumerate(pdf.pages):
                # extract_text() may yield nothing for a page; keep an empty slot.
                pages_text.append(page.extract_text() or "")
                # Extraction progress is mapped onto the 10-70% band.
                pct = 10 + int((i / total_pages) * 60)
                progress(pct, f"Extracting page {i + 1}/{total_pages}...")

        title = filepath.stem.replace("-", " ").replace("_", " ")
        return "\n\n".join(pages_text), title

    def _process_audio(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Transcribe a local audio file.

        Returns:
            ``(transcript, title)``; the title is derived from the file stem.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        progress(10, "Loading audio...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"Audio not found: {path}")

        progress(20, "Transcribing audio...")
        text = self._transcribe_audio(str(filepath))
        title = filepath.stem.replace("-", " ").replace("_", " ")
        return text, title

    def _process_web(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
        """Fetch a web page and return its readable text.

        Returns:
            ``(text, title)``; the title falls back to ``url`` when the page
            has no usable ``<title>``.

        Raises:
            RuntimeError: If requests or beautifulsoup4 is not installed.
        """
        try:
            import requests
            from bs4 import BeautifulSoup
        except ImportError as err:
            raise RuntimeError(
                "beautifulsoup4 and requests not installed. Run: pip install beautifulsoup4 requests"
            ) from err

        progress(10, "Fetching page...")
        resp = requests.get(url, timeout=15, headers={
            "User-Agent": "Mozilla/5.0 (ArkaOS Knowledge Indexer)"
        })
        resp.raise_for_status()

        progress(40, "Parsing content...")
        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove scripts, styles, nav, footer
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # Get title. ``.string`` is None for an empty <title> or one with
        # nested markup, so fall back to the URL instead of returning None.
        raw_title = soup.title.string if soup.title else None
        title = (raw_title or "").strip() or url

        # Get main content (article > main > body)
        main = soup.find("article") or soup.find("main") or soup.find("body")
        container = main if main else soup
        text = container.get_text(separator="\n\n", strip=True)

        # Clean up whitespace
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text, title

    def _process_markdown(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Read a markdown/text file directly as UTF-8.

        Returns:
            ``(text, title)``; the title is derived from the file stem.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        progress(10, "Reading file...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {path}")

        text = filepath.read_text(encoding="utf-8")
        title = filepath.stem.replace("-", " ").replace("_", " ")
        return text, title

    def _transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using faster-whisper, else openai-whisper.

        Only the imports are guarded, so an ``ImportError`` raised while a
        model is loading or transcribing surfaces as itself instead of being
        reported as "no engine available".

        Raises:
            RuntimeError: If neither transcription package can be imported.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError:
            WhisperModel = None

        if WhisperModel is not None:
            model = WhisperModel("base", device="cpu", compute_type="int8")
            segments, _ = model.transcribe(audio_path, beam_size=5)
            # Strip each segment so joining does not produce doubled spaces.
            return " ".join(segment.text.strip() for segment in segments)

        try:
            import whisper
        except ImportError as err:
            raise RuntimeError(
                "No transcription engine available. Install one:\n"
                "  pip install faster-whisper  (recommended, lighter)\n"
                "  pip install openai-whisper  (original, heavier)"
            ) from err

        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        return result["text"]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/installer/cli.js
CHANGED
|
@@ -39,6 +39,7 @@ Usage:
|
|
|
39
39
|
npx arkaos init Initialize project config (.arkaos.json)
|
|
40
40
|
npx arkaos update Update to latest version
|
|
41
41
|
npx arkaos migrate Migrate from v1 to v2
|
|
42
|
+
npx arkaos dashboard Start monitoring dashboard
|
|
42
43
|
npx arkaos doctor Run health checks
|
|
43
44
|
npx arkaos uninstall Remove ArkaOS
|
|
44
45
|
|
|
@@ -98,6 +99,18 @@ async function main() {
|
|
|
98
99
|
await migrate();
|
|
99
100
|
break;
|
|
100
101
|
|
|
102
|
+
case "dashboard": {
|
|
103
|
+
const { execSync: execDash } = await import("node:child_process");
|
|
104
|
+
const repoRootDash = dirname(fileURLToPath(import.meta.url)).replace(/\/installer$/, "");
|
|
105
|
+
try {
|
|
106
|
+
execDash(`bash "${repoRootDash}/scripts/start-dashboard.sh"`, {
|
|
107
|
+
stdio: "inherit",
|
|
108
|
+
env: { ...process.env, ARKAOS_ROOT: repoRootDash },
|
|
109
|
+
});
|
|
110
|
+
} catch { process.exit(1); }
|
|
111
|
+
break;
|
|
112
|
+
}
|
|
113
|
+
|
|
101
114
|
case "index": {
|
|
102
115
|
const { execSync } = await import("node:child_process");
|
|
103
116
|
const indexArgs = positionals.slice(1).join(" ");
|
package/package.json
CHANGED
package/pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "arkaos-core"
|
|
3
|
-
version = "2.0.3"
|
|
3
|
+
version = "2.1.0"
|
|
4
4
|
description = "Core engine for ArkaOS — The Operating System for AI Agent Teams"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = {text = "MIT"}
|
|
@@ -32,6 +32,17 @@ knowledge = [
|
|
|
32
32
|
"fastembed>=0.8.0",
|
|
33
33
|
"sqlite-vss>=0.1.2",
|
|
34
34
|
]
|
|
35
|
+
dashboard = [
|
|
36
|
+
"fastapi>=0.115.0",
|
|
37
|
+
"uvicorn>=0.32.0",
|
|
38
|
+
]
|
|
39
|
+
ingest = [
|
|
40
|
+
"yt-dlp>=2024.0",
|
|
41
|
+
"faster-whisper>=1.0.0",
|
|
42
|
+
"pdfplumber>=0.11.0",
|
|
43
|
+
"beautifulsoup4>=4.12.0",
|
|
44
|
+
"requests>=2.31.0",
|
|
45
|
+
]
|
|
35
46
|
dev = [
|
|
36
47
|
"pytest>=8.0",
|
|
37
48
|
"pytest-cov>=5.0",
|