arkaos 2.0.3 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/VERSION +1 -1
  2. package/core/budget/__pycache__/__init__.cpython-313.pyc +0 -0
  3. package/core/budget/__pycache__/manager.cpython-313.pyc +0 -0
  4. package/core/budget/__pycache__/schema.cpython-313.pyc +0 -0
  5. package/core/knowledge/__pycache__/__init__.cpython-313.pyc +0 -0
  6. package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
  7. package/core/knowledge/__pycache__/embedder.cpython-313.pyc +0 -0
  8. package/core/knowledge/__pycache__/indexer.cpython-313.pyc +0 -0
  9. package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
  10. package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
  11. package/core/knowledge/ingest.py +270 -0
  12. package/core/obsidian/__pycache__/__init__.cpython-313.pyc +0 -0
  13. package/core/obsidian/__pycache__/templates.cpython-313.pyc +0 -0
  14. package/core/obsidian/__pycache__/writer.cpython-313.pyc +0 -0
  15. package/core/orchestration/__pycache__/__init__.cpython-313.pyc +0 -0
  16. package/core/orchestration/__pycache__/patterns.cpython-313.pyc +0 -0
  17. package/core/orchestration/__pycache__/protocol.cpython-313.pyc +0 -0
  18. package/core/personas/__init__.py +6 -0
  19. package/core/personas/__pycache__/__init__.cpython-313.pyc +0 -0
  20. package/core/personas/__pycache__/manager.cpython-313.pyc +0 -0
  21. package/core/personas/__pycache__/schema.cpython-313.pyc +0 -0
  22. package/core/personas/manager.py +102 -0
  23. package/core/personas/schema.py +127 -0
  24. package/core/runtime/__pycache__/subagent.cpython-313.pyc +0 -0
  25. package/core/squads/__pycache__/schema.cpython-313.pyc +0 -0
  26. package/core/synapse/__pycache__/engine.cpython-313.pyc +0 -0
  27. package/core/synapse/__pycache__/layers.cpython-313.pyc +0 -0
  28. package/core/tasks/__pycache__/schema.cpython-313.pyc +0 -0
  29. package/core/workflow/__pycache__/engine.cpython-313.pyc +0 -0
  30. package/core/workflow/__pycache__/schema.cpython-313.pyc +0 -0
  31. package/installer/cli.js +13 -0
  32. package/package.json +1 -1
  33. package/pyproject.toml +12 -1
package/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.3
1
+ 2.1.1
@@ -0,0 +1,270 @@
1
+ """Knowledge ingest engine — process YouTube, PDF, audio, web, markdown.
2
+
3
+ Downloads, transcribes, extracts text, chunks, embeds, and indexes into
4
+ the vector store. Reports progress via callback for real-time UI updates.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import tempfile
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Callable, Optional
13
+
14
+ from core.knowledge.chunker import chunk_markdown
15
+ from core.knowledge.vector_store import VectorStore
16
+
17
+
18
@dataclass
class IngestResult:
    """Result of an ingest operation.

    Returned by ``IngestEngine.ingest`` for both success and failure; on
    failure ``success`` is False and ``error`` holds the reason.
    """

    # URL or file path that was ingested.
    source: str
    # Content type: youtube, pdf, audio, web, markdown (or "unknown").
    source_type: str
    # Length in characters of the extracted text.
    text_length: int = 0
    # Number of chunks indexed into the vector store.
    chunks_created: int = 0
    # Human-readable title derived from the content or the file name.
    title: str = ""
    # Failure reason when success is False; empty otherwise.
    error: str = ""
    success: bool = True
28
+
29
+
30
# Progress callback signature: (percent_complete, human-readable message).
ProgressCallback = Callable[[int, str], None]  # (percent, message)


def detect_source_type(source: str) -> str:
    """Auto-detect content type from a URL or file extension.

    Args:
        source: URL or local file path.

    Returns:
        One of "youtube", "web", "pdf", "audio", "markdown", or "unknown".
    """
    source_lower = source.lower()
    is_url = source_lower.startswith(("http://", "https://"))

    # YouTube: only URLs (or scheme-less links that *start* with a YouTube
    # host) count — a plain substring test would misclassify a local file
    # such as "talk-about-youtube.com.md" as a video.
    if is_url:
        if "youtube.com" in source_lower or "youtu.be" in source_lower:
            return "youtube"
        return "web"
    if source_lower.startswith(("youtube.com/", "www.youtube.com/", "youtu.be/")):
        return "youtube"

    # Local files: classify by extension.
    ext = Path(source).suffix.lower()
    if ext == ".pdf":
        return "pdf"
    if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"):
        return "audio"
    if ext in (".md", ".txt", ".rst"):
        return "markdown"

    return "unknown"
55
+
56
+
57
class IngestEngine:
    """Processes content from various sources into the vector store.

    Each ``_process_*`` helper returns ``(text, title)``; ``ingest`` then
    chunks, embeds, and indexes the text, reporting progress via callback.
    """

    def __init__(self, store: VectorStore, media_dir: str | Path = "") -> None:
        """
        Args:
            store: Vector store that receives the indexed chunks.
            media_dir: Scratch directory for downloaded media; defaults to
                ``~/.arkaos/media`` (created if missing).
        """
        self._store = store
        self._media_dir = Path(media_dir) if media_dir else Path.home() / ".arkaos" / "media"
        self._media_dir.mkdir(parents=True, exist_ok=True)

    def ingest(
        self,
        source: str,
        source_type: str = "",
        on_progress: Optional[ProgressCallback] = None,
        metadata: dict | None = None,
    ) -> IngestResult:
        """Ingest content from any supported source.

        Args:
            source: URL or file path.
            source_type: One of youtube/pdf/audio/web/markdown; auto-detected
                from ``source`` when empty.
            on_progress: Optional callback(percent, message) for UI updates.
            metadata: Extra metadata merged into every indexed chunk.

        Returns:
            IngestResult describing the outcome. Processor failures are
            captured in ``error``/``success`` rather than raised.
        """
        if not source_type:
            source_type = detect_source_type(source)

        notify = on_progress or (lambda pct, msg: None)
        notify(0, f"Starting {source_type} ingest...")

        processors = {
            "youtube": self._process_youtube,
            "pdf": self._process_pdf,
            "audio": self._process_audio,
            "web": self._process_web,
            "markdown": self._process_markdown,
        }
        processor = processors.get(source_type)
        if processor is None:
            return IngestResult(source=source, source_type=source_type, error=f"Unsupported type: {source_type}", success=False)

        try:
            text, title = processor(source, notify)
        except Exception as e:  # download/parse/transcription failure
            return IngestResult(source=source, source_type=source_type, error=str(e), success=False)

        # Reject near-empty extractions (e.g. a scanned PDF with no text layer).
        if not text or len(text.strip()) < 50:
            return IngestResult(source=source, source_type=source_type, error="Extracted text too short", success=False)

        # Chunk and index
        notify(75, "Chunking content...")
        chunks = chunk_markdown(text, max_tokens=512, source=source)

        notify(85, f"Indexing {len(chunks)} chunks...")
        count = self._store.index_chunks(
            texts=[c.text for c in chunks],
            headings=[c.heading for c in chunks],
            source=source,
            metadata={"type": source_type, "title": title, **(metadata or {})},
        )

        notify(100, f"Done — {count} chunks indexed")

        return IngestResult(
            source=source,
            source_type=source_type,
            text_length=len(text),
            chunks_created=count,
            title=title,
            success=True,
        )

    @staticmethod
    def _title_from_path(filepath: Path) -> str:
        """Derive a human-readable title from a file name stem."""
        return filepath.stem.replace("-", " ").replace("_", " ")

    def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
        """Download a YouTube video's audio track and transcribe it."""
        try:
            import yt_dlp
        except ImportError:
            raise RuntimeError("yt-dlp not installed. Run: pip install yt-dlp")

        progress(5, "Fetching video info...")

        # Download into a unique scratch directory so concurrent ingests (or
        # a stale file left by an earlier failed run) cannot collide on a
        # single hard-coded "yt_audio.wav" path.
        work_dir = Path(tempfile.mkdtemp(prefix="yt_", dir=self._media_dir))
        audio_path = str(work_dir / "audio.wav")
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": str(work_dir / "audio.%(ext)s"),
            "postprocessors": [{
                # Re-encode to WAV so the transcription engines can read it.
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "16",
            }],
            "quiet": True,
            "no_warnings": True,
        }

        progress(10, "Downloading audio...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
        title = (info or {}).get("title", "YouTube Video")

        progress(35, "Transcribing audio...")
        try:
            text = self._transcribe_audio(audio_path)
        finally:
            # Best-effort cleanup, even when transcription raises.
            try:
                os.remove(audio_path)
            except OSError:
                pass
            try:
                os.rmdir(work_dir)
            except OSError:
                pass

        return text, title

    def _process_pdf(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Extract text from a PDF, page by page."""
        try:
            import pdfplumber
        except ImportError:
            raise RuntimeError("pdfplumber not installed. Run: pip install pdfplumber")

        progress(10, "Opening PDF...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"PDF not found: {path}")

        pages_text = []
        with pdfplumber.open(filepath) as pdf:
            total_pages = len(pdf.pages)
            for i, page in enumerate(pdf.pages):
                # extract_text() returns None for pages without a text layer.
                pages_text.append(page.extract_text() or "")
                # Map page progress onto the 10-70% band of the overall run.
                pct = 10 + int((i / total_pages) * 60)
                progress(pct, f"Extracting page {i + 1}/{total_pages}...")

        return "\n\n".join(pages_text), self._title_from_path(filepath)

    def _process_audio(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Transcribe a local audio file."""
        progress(10, "Loading audio...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"Audio not found: {path}")

        progress(20, "Transcribing audio...")
        text = self._transcribe_audio(str(filepath))
        return text, self._title_from_path(filepath)

    def _process_web(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
        """Fetch a web page and extract its readable text content."""
        try:
            import requests
            from bs4 import BeautifulSoup
        except ImportError:
            raise RuntimeError("beautifulsoup4 and requests not installed. Run: pip install beautifulsoup4 requests")

        progress(10, "Fetching page...")
        resp = requests.get(url, timeout=15, headers={
            "User-Agent": "Mozilla/5.0 (ArkaOS Knowledge Indexer)"
        })
        resp.raise_for_status()

        progress(40, "Parsing content...")
        soup = BeautifulSoup(resp.text, "html.parser")

        # Drop boilerplate elements before extracting text.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # <title> may be missing, or present but empty (soup.title.string is
        # None then) — fall back to the URL so the title is always a str.
        title = (soup.title.string or "").strip() if soup.title else ""
        if not title:
            title = url

        # Prefer the main content container when one exists (article > main > body).
        main = soup.find("article") or soup.find("main") or soup.find("body")
        text = main.get_text(separator="\n\n", strip=True) if main else soup.get_text(separator="\n\n", strip=True)

        # Collapse runs of blank lines left behind by decomposed elements.
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text, title

    def _process_markdown(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Read a markdown or plain-text file directly."""
        progress(10, "Reading file...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {path}")

        text = filepath.read_text(encoding="utf-8")
        return text, self._title_from_path(filepath)

    def _transcribe_audio(self, audio_path: str) -> str:
        """Transcribe an audio file to text.

        Tries faster-whisper first, then falls back to openai-whisper.
        Raises RuntimeError when neither engine is installed; transcription
        errors from an installed engine propagate to the caller.
        """
        try:
            from faster_whisper import WhisperModel
            model = WhisperModel("base", device="cpu", compute_type="int8")
            segments, _ = model.transcribe(audio_path, beam_size=5)
            return " ".join(segment.text for segment in segments)
        except ImportError:
            pass

        try:
            import whisper
            model = whisper.load_model("base")
            result = model.transcribe(audio_path)
            return result["text"]
        except ImportError:
            raise RuntimeError(
                "No transcription engine available. Install one:\n"
                " pip install faster-whisper (recommended, lighter)\n"
                " pip install openai-whisper (original, heavier)"
            )
@@ -0,0 +1,6 @@
1
+ """Persona system — create, store, and clone personas as agents."""
2
+
3
+ from core.personas.schema import Persona
4
+ from core.personas.manager import PersonaManager
5
+
6
+ __all__ = ["Persona", "PersonaManager"]
@@ -0,0 +1,102 @@
1
+ """Persona manager — CRUD operations and cloning to agents."""
2
+
3
+ import json
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import yaml
9
+
10
+ from core.personas.schema import Persona
11
+
12
+
13
class PersonaManager:
    """Manages persona lifecycle: create, store, list, clone to agent.

    Personas are held in memory and, when ``storage_path`` is given,
    persisted to a single JSON file after every mutation.
    """

    def __init__(self, storage_path: str | Path = "") -> None:
        """
        Args:
            storage_path: JSON file used for persistence. An empty string
                disables persistence entirely (in-memory only).
        """
        self._personas: dict[str, Persona] = {}
        self._storage_path = Path(storage_path) if storage_path else None
        if self._storage_path and self._storage_path.exists():
            self._load()

    def create(self, persona: Persona) -> Persona:
        """Register a new persona, stamping created/updated timestamps."""
        persona.created_at = datetime.now().isoformat()
        persona.updated_at = persona.created_at
        self._personas[persona.id] = persona
        self._save()
        return persona

    def get(self, persona_id: str) -> Optional[Persona]:
        """Return the persona with this id, or None if unknown."""
        return self._personas.get(persona_id)

    def list_all(self) -> list[Persona]:
        """Return all personas in insertion order."""
        return list(self._personas.values())

    def update(self, persona_id: str, updates: dict) -> Optional[Persona]:
        """Apply ``updates`` to existing attributes of a persona.

        Keys that are not persona attributes are silently ignored.
        Returns the updated persona, or None if the id is unknown.
        """
        persona = self._personas.get(persona_id)
        if not persona:
            return None
        for key, value in updates.items():
            if hasattr(persona, key):
                setattr(persona, key, value)
        persona.updated_at = datetime.now().isoformat()
        self._save()
        return persona

    def delete(self, persona_id: str) -> bool:
        """Remove a persona. Returns True if it existed."""
        if persona_id in self._personas:
            del self._personas[persona_id]
            self._save()
            return True
        return False

    def clone_to_agent(
        self,
        persona_id: str,
        department: str = "strategy",
        tier: int = 2,
        agents_dir: str | Path = "",
    ) -> Optional[str]:
        """Clone a persona to an ArkaOS agent YAML file.

        The clone is recorded on the persona even when ``agents_dir`` is
        empty (in which case no file is written).

        Returns the agent ID if successful, None if persona not found.
        """
        persona = self._personas.get(persona_id)
        if not persona:
            return None

        agent_data = persona.to_agent_yaml(department=department, tier=tier)
        agent_id = agent_data["id"]

        if agents_dir:
            output_dir = Path(agents_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = output_dir / f"{agent_id}.yaml"
            # Explicit UTF-8: allow_unicode=True emits raw non-ASCII, which
            # would break under a non-UTF-8 locale default encoding.
            with open(output_path, "w", encoding="utf-8") as f:
                yaml.dump(agent_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Track the clone
        persona.cloned_to_agents.append(agent_id)
        persona.updated_at = datetime.now().isoformat()
        self._save()

        return agent_id

    def _save(self) -> None:
        """Persist all personas to the storage file (no-op without one)."""
        if self._storage_path is None:
            return
        self._storage_path.parent.mkdir(parents=True, exist_ok=True)
        data = {pid: p.model_dump(mode="json") for pid, p in self._personas.items()}
        # Explicit UTF-8 — the platform default encoding is locale-dependent.
        with open(self._storage_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def _load(self) -> None:
        """Load personas from the storage file, tolerating an empty file."""
        if self._storage_path is None or not self._storage_path.exists():
            return
        content = self._storage_path.read_text(encoding="utf-8").strip()
        if not content:
            return
        data = json.loads(content)
        for pid, pdata in data.items():
            self._personas[pid] = Persona.model_validate(pdata)
@@ -0,0 +1,127 @@
1
+ """Persona schema — models for persona creation and cloning."""
2
+
3
+ from datetime import datetime
4
+ from typing import Optional, Any
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class PersonaDISC(BaseModel):
    """DISC behavioral profile (single-letter trait codes)."""

    # Dominant DISC trait letter (default "C").
    primary: str = "C"
    # Secondary DISC trait letter (default "S").
    secondary: str = "S"
    # How this persona communicates (free text).
    communication_style: str = ""
    # Typical behavior under pressure (free text).
    under_pressure: str = ""
    # What motivates this persona (free text).
    motivator: str = ""
15
+
16
+
17
class PersonaEnneagram(BaseModel):
    """Enneagram profile: type/wing plus motivational notes."""

    # Enneagram type number (default 5).
    type: int = 5
    # Adjacent wing type number (default 6, i.e. "5w6" with the defaults).
    wing: int = 6
    # What fundamentally drives this persona (free text).
    core_motivation: str = ""
    # What this persona most avoids (free text).
    core_fear: str = ""
    # Instinctual subtype (default "self-preservation").
    subtype: str = "self-preservation"
23
+
24
+
25
class PersonaBigFive(BaseModel):
    """Big Five (OCEAN) trait scores.

    NOTE(review): scores appear to use a 0-100 scale (all defaults sit at
    the 50 midpoint) — confirm against callers.
    """

    openness: int = 50
    conscientiousness: int = 50
    extraversion: int = 50
    agreeableness: int = 50
    neuroticism: int = 50
31
+
32
+
33
class PersonaCommunication(BaseModel):
    """Communication preferences applied to the persona's output."""

    # Overall tone of voice (free text).
    tone: str = ""
    # Vocabulary register; defaults to "specialist".
    vocabulary_level: str = "specialist"
    # Preferred output format (free text).
    preferred_format: str = ""
    # Words, phrases, or styles the persona should avoid.
    avoid: list[str] = Field(default_factory=list)
38
+
39
+
40
class Persona(BaseModel):
    """A persona based on a real person or archetype.

    Bundles behavioral profiles (DISC, Enneagram, Big Five, MBTI), knowledge
    lists, and communication preferences, and can be flattened into an
    ArkaOS agent definition via :meth:`to_agent_yaml`.
    """

    id: str
    name: str
    title: str = ""  # e.g., "Business Strategy", "Growth Marketing"
    tagline: str = ""  # e.g., "The Natural Commander with emotional depth"
    source: str = ""  # e.g., "Alex Hormozi", "Naval Ravikant"
    avatar_url: str = ""

    # Behavioral DNA
    disc: PersonaDISC = Field(default_factory=PersonaDISC)
    enneagram: PersonaEnneagram = Field(default_factory=PersonaEnneagram)
    big_five: PersonaBigFive = Field(default_factory=PersonaBigFive)
    mbti: str = "INTJ"

    # Knowledge
    mental_models: list[str] = Field(default_factory=list)
    expertise_domains: list[str] = Field(default_factory=list)
    frameworks: list[str] = Field(default_factory=list)
    key_quotes: list[str] = Field(default_factory=list)

    # Communication
    communication: PersonaCommunication = Field(default_factory=PersonaCommunication)

    # Metadata — ISO-format timestamps (maintained externally, e.g. by a manager)
    created_at: str = ""
    updated_at: str = ""
    cloned_to_agents: list[str] = Field(default_factory=list)

    def to_agent_yaml(self, department: str = "strategy", tier: int = 2) -> dict:
        """Convert persona to an ArkaOS agent YAML structure.

        Args:
            department: Department the cloned agent is filed under.
            tier: Agent tier value copied into the agent definition.

        Returns:
            A dict mirroring the agent YAML schema. The agent id is the
            persona id prefixed with "persona-". Cloned agents get no
            authority flags, at most 3 primary + 3 secondary mental models,
            and at most 5 expertise domains/frameworks.
        """
        agent_id = f"persona-{self.id}"
        return {
            "id": agent_id,
            "name": self.name,
            # Fall back to a "<source> Persona" role when no title is set.
            "role": self.title or f"{self.source} Persona",
            "department": department,
            "tier": tier,
            "behavioral_dna": {
                "disc": {
                    "primary": self.disc.primary,
                    "secondary": self.disc.secondary,
                    "communication_style": self.disc.communication_style,
                    "under_pressure": self.disc.under_pressure,
                    "motivator": self.disc.motivator,
                },
                "enneagram": {
                    "type": self.enneagram.type,
                    "wing": self.enneagram.wing,
                    "core_motivation": self.enneagram.core_motivation,
                    "core_fear": self.enneagram.core_fear,
                    "subtype": self.enneagram.subtype,
                },
                "big_five": {
                    "openness": self.big_five.openness,
                    "conscientiousness": self.big_five.conscientiousness,
                    "extraversion": self.big_five.extraversion,
                    "agreeableness": self.big_five.agreeableness,
                    "neuroticism": self.big_five.neuroticism,
                },
                "mbti": {"type": self.mbti},
            },
            # First three models are "primary", next three "secondary".
            "mental_models": {
                "primary": self.mental_models[:3],
                "secondary": self.mental_models[3:6],
            },
            # Cloned personas receive no authority by default.
            "authority": {
                "veto": False,
                "approve_budget": False,
                "approve_architecture": False,
                "orchestrate": False,
                "delegates_to": [],
                "escalates_to": None,
            },
            # Depth/years are fixed heuristics for all cloned personas.
            "expertise": {
                "domains": self.expertise_domains[:5],
                "frameworks": self.frameworks[:5],
                "depth": "advanced",
                "years_equivalent": 10,
            },
            "communication": {
                "language": "en",
                "tone": self.communication.tone,
                "vocabulary_level": self.communication.vocabulary_level,
                "preferred_format": self.communication.preferred_format,
                "avoid": self.communication.avoid,
            },
        }
package/installer/cli.js CHANGED
@@ -39,6 +39,7 @@ Usage:
39
39
  npx arkaos init Initialize project config (.arkaos.json)
40
40
  npx arkaos update Update to latest version
41
41
  npx arkaos migrate Migrate from v1 to v2
42
+ npx arkaos dashboard Start monitoring dashboard
42
43
  npx arkaos doctor Run health checks
43
44
  npx arkaos uninstall Remove ArkaOS
44
45
 
@@ -98,6 +99,18 @@ async function main() {
98
99
  await migrate();
99
100
  break;
100
101
 
102
+ case "dashboard": {
103
+ const { execSync: execDash } = await import("node:child_process");
104
+ const repoRootDash = dirname(fileURLToPath(import.meta.url)).replace(/\/installer$/, "");
105
+ try {
106
+ execDash(`bash "${repoRootDash}/scripts/start-dashboard.sh"`, {
107
+ stdio: "inherit",
108
+ env: { ...process.env, ARKAOS_ROOT: repoRootDash },
109
+ });
110
+ } catch { process.exit(1); }
111
+ break;
112
+ }
113
+
101
114
  case "index": {
102
115
  const { execSync } = await import("node:child_process");
103
116
  const indexArgs = positionals.slice(1).join(" ");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "arkaos",
3
- "version": "2.0.3",
3
+ "version": "2.1.1",
4
4
  "description": "The Operating System for AI Agent Teams",
5
5
  "type": "module",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "arkaos-core"
3
- version = "2.0.3"
3
+ version = "2.1.1"
4
4
  description = "Core engine for ArkaOS — The Operating System for AI Agent Teams"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -32,6 +32,17 @@ knowledge = [
32
32
  "fastembed>=0.8.0",
33
33
  "sqlite-vss>=0.1.2",
34
34
  ]
35
+ dashboard = [
36
+ "fastapi>=0.115.0",
37
+ "uvicorn>=0.32.0",
38
+ ]
39
+ ingest = [
40
+ "yt-dlp>=2024.0",
41
+ "faster-whisper>=1.0.0",
42
+ "pdfplumber>=0.11.0",
43
+ "beautifulsoup4>=4.12.0",
44
+ "requests>=2.31.0",
45
+ ]
35
46
  dev = [
36
47
  "pytest>=8.0",
37
48
  "pytest-cov>=5.0",