arkaos 2.72.0 → 2.74.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/VERSION CHANGED
@@ -1 +1 @@
1
- 2.72.0
1
+ 2.74.0
@@ -0,0 +1,230 @@
1
+ """AI-powered persona builder (PR57 v2.74.0).
2
+
3
+ Generates a draft Persona from already-indexed content in the vector
4
+ store. The user ingests sources (YouTube transcripts, articles, PDFs)
5
+ via the knowledge dashboard, then the builder:
6
+
7
+ 1. Searches the vector store for chunks about the target person/topic.
8
+ 2. Sends those chunks to the configured LLM via the multi-backend
9
+ `LLMProvider` (Claude Code subagent / Anthropic API / Ollama local).
10
+ 3. Parses the LLM's JSON response into a `Persona` draft for the
11
+ operator to review and edit before saving.
12
+
13
+ The builder NEVER writes to the database — that's the existing
14
+ `PersonaManager.create()` path. The builder produces a draft; the
15
+ operator owns the persist decision (per the project memory's
16
+ "Generated persona presented for review" step).
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import re
23
+ import uuid
24
+ from dataclasses import dataclass
25
+ from datetime import datetime, timezone
26
+
27
+ from core.knowledge.vector_store import VectorStore
28
+ from core.personas.schema import (
29
+ Persona,
30
+ PersonaBigFive,
31
+ PersonaCommunication,
32
+ PersonaDISC,
33
+ PersonaEnneagram,
34
+ )
35
+ from core.runtime.llm_provider import LLMProvider, get_llm_provider
36
+
37
+
38
+ _PERSONA_SYSTEM_PROMPT = """You build behavioural-DNA personas from quotes
39
+ and writings of real people. Read the supplied content carefully, then
40
+ emit a single JSON object that follows this exact schema. Use ONLY the
41
+ JSON keys listed — no prose, no markdown fences, no extra fields.
42
+
43
+ {
44
+ "title": "<one-line role label>",
45
+ "tagline": "<one-line essence>",
46
+ "disc": {
47
+ "primary": "D|I|S|C",
48
+ "secondary": "D|I|S|C",
49
+ "communication_style": "<one sentence>",
50
+ "under_pressure": "<one sentence>",
51
+ "motivator": "<one sentence>"
52
+ },
53
+ "enneagram": {
54
+ "type": 1-9,
55
+ "wing": 1-9,
56
+ "core_motivation": "<one sentence>",
57
+ "core_fear": "<one sentence>",
58
+ "subtype": "self-preservation|social|sexual"
59
+ },
60
+ "big_five": {
61
+ "openness": 0-100,
62
+ "conscientiousness": 0-100,
63
+ "extraversion": 0-100,
64
+ "agreeableness": 0-100,
65
+ "neuroticism": 0-100
66
+ },
67
+ "mbti": "<4-letter type>",
68
+ "mental_models": ["<model>", ...],
69
+ "expertise_domains": ["<domain>", ...],
70
+ "frameworks": ["<framework>", ...],
71
+ "key_quotes": ["<verbatim quote>", ...],
72
+ "communication": {
73
+ "tone": "<adjective>",
74
+ "vocabulary_level": "lay|specialist|expert",
75
+ "preferred_format": "<format hint>",
76
+ "avoid": ["<phrase to avoid>", ...]
77
+ }
78
+ }
79
+
80
+ If the content is insufficient to infer a field, use the closest neutral
81
+ default rather than fabricating. NEVER invent quotes — only include
82
+ verbatim text that appears in the content."""
83
+
84
+
85
@dataclass(frozen=True)
class BuildResult:
    """Output of a persona-builder run.

    Immutable bundle returned by ``PersonaBuilder.generate``: the draft
    persona plus telemetry about how it was produced.
    """

    # Draft persona parsed from the LLM's JSON response (not persisted).
    persona: Persona
    # Number of chunks the vector-store search returned for this run.
    chunks_used: int
    # Whatever the provider's ``name()`` call reported.
    provider_name: str
    # Unparsed text of the LLM response, kept for debugging/review.
    raw_response: str
93
+
94
+
95
class PersonaBuildError(RuntimeError):
    """Raised when a persona draft cannot be built.

    Covers invalid input (empty name), a vector-store search that
    returns nothing, and an LLM response that can't be parsed into a
    Persona.
    """
97
+
98
+
99
class PersonaBuilder:
    """Generate persona drafts from indexed content.

    The builder only reads from the vector store and talks to the LLM
    provider; it never persists the resulting persona.
    """

    # Upper bound on the characters of retrieved content sent to the LLM.
    MAX_CONTEXT_CHARS = 18_000
    # Delimiter placed between chunks in the composed context.
    _CHUNK_SEPARATOR = "\n\n---\n\n"

    def __init__(
        self,
        store: VectorStore,
        provider: LLMProvider | None = None,
    ) -> None:
        """Bind the builder to a vector store and an LLM provider.

        When `provider` is omitted (or falsy) the provider returned by
        `get_llm_provider()` is used.
        """
        self._store = store
        self._provider = provider or get_llm_provider()

    def generate(
        self,
        name: str,
        search_query: str = "",
        top_k: int = 20,
        source_label: str = "",
    ) -> BuildResult:
        """Build a persona draft for `name`.

        Searches the vector store for `search_query` (defaults to the
        name), composes the hits into a context of at most
        MAX_CONTEXT_CHARS characters, sends it to the configured LLM,
        parses the JSON response, and returns a draft Persona plus
        telemetry.

        Raises:
            PersonaBuildError: if `name` is blank, the search returns
                nothing, none of the hits carries any text, or the LLM
                response cannot be turned into a Persona.
        """
        if not name or not name.strip():
            raise PersonaBuildError("name must not be empty")
        query = (search_query or name).strip()
        chunks = self._store.search(query, top_k=top_k)
        if not chunks:
            raise PersonaBuildError(
                f"no indexed content matches {query!r} — "
                "ingest sources first via /api/knowledge/ingest"
            )
        context = self._compose_context(chunks)
        if not context:
            # Every hit had an empty "text"; asking the LLM to profile a
            # person from nothing would only invite fabrication.
            raise PersonaBuildError(
                f"indexed content matching {query!r} contains no usable text"
            )
        prompt = f"Person: {name}\n\nContent:\n{context}"
        response = self._provider.complete(
            prompt, system=_PERSONA_SYSTEM_PROMPT, max_tokens=3000,
        )
        persona = self._parse(name, source_label or name, response.text)
        return BuildResult(
            persona=persona,
            chunks_used=len(chunks),
            provider_name=self._provider.name(),
            raw_response=response.text,
        )

    def _compose_context(self, chunks: list[dict]) -> str:
        """Join chunk texts into one context string within the budget.

        Each chunk contributes its "text", prefixed by "[heading]" when
        a "heading" is present. Chunks are taken in the given order
        until the next one would push the result past MAX_CONTEXT_CHARS
        (headings and separators included). If even the first usable
        chunk is too large it is truncated to the budget instead of
        being dropped, so the context is never empty just because the
        top hit is long.
        """
        separator = self._CHUNK_SEPARATOR
        parts: list[str] = []
        remaining = self.MAX_CONTEXT_CHARS
        for chunk in chunks:
            text = chunk.get("text") or ""
            if not text:
                continue
            heading = chunk.get("heading") or ""
            block = f"[{heading}]\n{text}" if heading else text
            # The separator only costs characters once something precedes it.
            budget = remaining - (len(separator) if parts else 0)
            if len(block) > budget:
                if not parts and budget > 0:
                    parts.append(block[:budget])
                break
            parts.append(block)
            remaining = budget - len(block)
        return separator.join(parts)

    def _parse(self, name: str, source_label: str, raw: str) -> Persona:
        """Turn the raw LLM response into a draft Persona.

        The persona gets a fresh UUID and identical created/updated
        timestamps (current UTC time, ISO 8601). Missing scalar fields
        default to an empty string, except `mbti`, which falls back to
        "INTJ"; missing nested sections are built with no arguments.

        Raises:
            PersonaBuildError: if no JSON object can be extracted from
                `raw`, or constructing the schema objects raises
                TypeError/ValueError.
        """
        data = _extract_json_object(raw)
        if data is None:
            raise PersonaBuildError(
                f"LLM did not return a JSON object; raw response: {raw[:200]!r}"
            )
        now = datetime.now(timezone.utc).isoformat()
        try:
            return Persona(
                id=str(uuid.uuid4()),
                name=name,
                title=str(data.get("title") or ""),
                tagline=str(data.get("tagline") or ""),
                source=source_label,
                disc=PersonaDISC(**(data.get("disc") or {})),
                enneagram=PersonaEnneagram(**(data.get("enneagram") or {})),
                big_five=PersonaBigFive(**(data.get("big_five") or {})),
                mbti=str(data.get("mbti") or "INTJ"),
                mental_models=_as_str_list(data.get("mental_models")),
                expertise_domains=_as_str_list(data.get("expertise_domains")),
                frameworks=_as_str_list(data.get("frameworks")),
                key_quotes=_as_str_list(data.get("key_quotes")),
                communication=PersonaCommunication(
                    **(data.get("communication") or {})
                ),
                created_at=now,
                updated_at=now,
            )
        except (TypeError, ValueError) as exc:
            # A non-mapping section (e.g. "disc": "D") also lands here:
            # `**` on a non-mapping raises TypeError.
            raise PersonaBuildError(
                f"LLM JSON does not match Persona schema: {exc}"
            ) from exc
195
+
196
+
197
+ _JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)
198
+
199
+
200
+ def _extract_json_object(raw: str) -> dict | None:
201
+ """Parse the first JSON object in `raw`.
202
+
203
+ Tolerates models that wrap JSON in markdown fences or add a leading
204
+ explanation. Returns None when no parseable object is found.
205
+ """
206
+ if not raw:
207
+ return None
208
+ candidates = [raw.strip()]
209
+ fence_match = re.search(
210
+ r"```(?:json)?\s*(\{.*?\})\s*```", raw, flags=re.DOTALL,
211
+ )
212
+ if fence_match:
213
+ candidates.insert(0, fence_match.group(1))
214
+ bare_match = _JSON_OBJECT_RE.search(raw)
215
+ if bare_match:
216
+ candidates.append(bare_match.group(0))
217
+ for cand in candidates:
218
+ try:
219
+ obj = json.loads(cand)
220
+ except json.JSONDecodeError:
221
+ continue
222
+ if isinstance(obj, dict):
223
+ return obj
224
+ return None
225
+
226
+
227
+ def _as_str_list(value: object) -> list[str]:
228
+ if not isinstance(value, list):
229
+ return []
230
+ return [str(item) for item in value if isinstance(item, (str, int, float))]
@@ -16,16 +16,27 @@ const ingestError = ref<string | null>(null)
16
16
  const isDragging = ref(false)
17
17
  const pasteText = ref('')
18
18
  const pasteTitle = ref('')
19
+ // PR56 v2.73.0 — bulk URL ingest mode. Paste a list of URLs (one per
20
+ // line) and the backend queues one job per source.
21
+ const bulkUrls = ref('')
19
22
 
20
- const activeInputMode = ref<'url' | 'file' | 'text' | 'research'>('url')
23
+ const activeInputMode = ref<'url' | 'file' | 'text' | 'research' | 'bulk'>('url')
21
24
 
22
25
  const inputModes = [
23
26
  { label: 'URL', value: 'url' as const, icon: 'i-lucide-link' },
27
+ { label: 'Bulk', value: 'bulk' as const, icon: 'i-lucide-list' },
24
28
  { label: 'File', value: 'file' as const, icon: 'i-lucide-upload' },
25
29
  { label: 'Text', value: 'text' as const, icon: 'i-lucide-type' },
26
30
  { label: 'Research', value: 'research' as const, icon: 'i-lucide-search' },
27
31
  ]
28
32
 
33
+ const bulkUrlCount = computed(() =>
34
+ bulkUrls.value
35
+ .split('\n')
36
+ .map((s) => s.trim())
37
+ .filter((s) => s.length > 0).length
38
+ )
39
+
29
40
  function handleDrop(e: DragEvent) {
30
41
  isDragging.value = false
31
42
  const file = e.dataTransfer?.files?.[0]
@@ -87,6 +98,7 @@ function clearFile() {
87
98
  }
88
99
 
89
100
  const canIngest = computed(() => {
101
+ if (activeInputMode.value === 'bulk') return bulkUrlCount.value > 0
90
102
  return detectedType.value !== null
91
103
  })
92
104
 
@@ -178,7 +190,11 @@ onUnmounted(() => {
178
190
  })
179
191
 
180
192
  async function handleIngest() {
181
- if (!detectedType.value && activeInputMode.value !== 'text') return
193
+ if (
194
+ !detectedType.value
195
+ && activeInputMode.value !== 'text'
196
+ && activeInputMode.value !== 'bulk'
197
+ ) return
182
198
 
183
199
  ingestError.value = null
184
200
 
@@ -199,6 +215,17 @@ async function handleIngest() {
199
215
  body: { source: pasteText.value.slice(0, 100), type: 'markdown', text: pasteText.value, title: pasteTitle.value },
200
216
  })
201
217
  }
218
+ // Bulk URL paste — one job per non-blank line, server caps at 50
219
+ else if (activeInputMode.value === 'bulk' && bulkUrlCount.value > 0) {
220
+ const sources = bulkUrls.value
221
+ .split('\n')
222
+ .map((s) => s.trim())
223
+ .filter((s) => s.length > 0)
224
+ await $fetch(`${apiBase}/api/knowledge/ingest-bulk`, {
225
+ method: 'POST',
226
+ body: { sources },
227
+ })
228
+ }
202
229
  // URL or Research — standard ingest
203
230
  else {
204
231
  const source = ingestUrl.value.trim()
@@ -215,6 +242,7 @@ async function handleIngest() {
215
242
  clearFile()
216
243
  pasteText.value = ''
217
244
  pasteTitle.value = ''
245
+ bulkUrls.value = ''
218
246
 
219
247
  // Refresh jobs table + connect WebSocket
220
248
  fetchJobs()
@@ -415,6 +443,23 @@ function formatScore(score: number): string {
415
443
  />
416
444
  </div>
417
445
 
446
+ <!-- Mode: Bulk URLs (PR56 v2.73.0) -->
447
+ <div v-if="activeInputMode === 'bulk'" class="space-y-3">
448
+ <UTextarea
449
+ v-model="bulkUrls"
450
+ placeholder="Paste one URL per line. Up to 50 sources per batch.&#10;&#10;https://www.youtube.com/watch?v=...&#10;https://example.com/article&#10;https://example.com/paper.pdf"
451
+ :rows="8"
452
+ size="lg"
453
+ class="w-full font-mono text-sm"
454
+ />
455
+ <div class="flex items-center justify-between text-xs text-muted">
456
+ <span>{{ bulkUrlCount }} source{{ bulkUrlCount === 1 ? '' : 's' }} detected</span>
457
+ <span v-if="bulkUrlCount > 50" class="text-red-400">
458
+ Over the 50-source cap — extras will be rejected.
459
+ </span>
460
+ </div>
461
+ </div>
462
+
418
463
  <!-- Mode: Research -->
419
464
  <div v-if="activeInputMode === 'research'" class="space-y-3">
420
465
  <UInput
@@ -447,7 +492,11 @@ function formatScore(score: number): string {
447
492
  </div>
448
493
 
449
494
  <UButton
450
- :label="activeInputMode === 'research' ? 'Research & Index' : 'Ingest'"
495
+ :label="
496
+ activeInputMode === 'research' ? 'Research & Index'
497
+ : activeInputMode === 'bulk' ? `Ingest ${bulkUrlCount} source${bulkUrlCount === 1 ? '' : 's'}`
498
+ : 'Ingest'
499
+ "
451
500
  icon="i-lucide-zap"
452
501
  size="md"
453
502
  :disabled="!canIngest && !(activeInputMode === 'text' && pasteText.length > 50)"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "arkaos",
3
- "version": "2.72.0",
3
+ "version": "2.74.0",
4
4
  "description": "The Operating System for AI Agent Teams",
5
5
  "type": "module",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "arkaos-core"
3
- version = "2.72.0"
3
+ version = "2.74.0"
4
4
  description = "Core engine for ArkaOS — The Operating System for AI Agent Teams"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -487,6 +487,48 @@ def knowledge_ingest(body: dict):
487
487
  return {"job_id": job.id, "source_type": source_type, "status": "queued"}
488
488
 
489
489
 
490
@app.post("/api/knowledge/ingest-bulk")
def knowledge_ingest_bulk(body: dict):
    """PR56 v2.73.0 — bulk URL ingest.

    Expects ``{"sources": ["url1", "url2", ...]}``. Non-string entries
    and blank / whitespace-only entries are ignored, and repeated
    sources are collapsed here (first occurrence wins) before anything
    is queued. Each remaining source is handed to ``knowledge_ingest``
    individually.

    Returns ``{"jobs": [...], "count": n}`` with one entry per source:
    either the queued job's id/type/status or that source's error. A
    request with no usable sources, a non-list ``sources``, or more
    than 50 unique sources gets ``{"error": ...}`` and queues nothing.
    """
    submitted = body.get("sources") or []
    if not isinstance(submitted, list):
        return {"error": "sources must be a list"}

    # dict keys keep insertion order, giving an order-preserving de-dup.
    stripped = (item.strip() for item in submitted if isinstance(item, str))
    unique_sources = list(dict.fromkeys(s for s in stripped if s))

    if not unique_sources:
        return {"error": "no valid sources provided"}
    if len(unique_sources) > 50:
        return {"error": "bulk ingest is capped at 50 sources per request"}

    jobs = []
    for source in unique_sources:
        outcome = knowledge_ingest({"source": source})
        if "error" in outcome:
            entry = {"source": source, "error": outcome["error"]}
        else:
            entry = {
                "source": source,
                "job_id": outcome["job_id"],
                "source_type": outcome.get("source_type"),
                "status": outcome.get("status", "queued"),
            }
        jobs.append(entry)
    return {"jobs": jobs, "count": len(jobs)}
530
+
531
+
490
532
  @app.get("/api/tasks/{task_id}")
491
533
  def task_detail(task_id: str):
492
534
  """Get a single task by ID. Also checks jobs."""
@@ -645,6 +687,48 @@ def persona_delete(persona_id: str):
645
687
  return {"error": "Persona not found"}
646
688
 
647
689
 
690
@app.post("/api/personas/build")
def persona_build(body: dict):
    """PR57 v2.74.0 — AI-powered persona draft from already-indexed content.

    Body: {
        "name": "<person to model>",
        "search_query": "<optional vector search query>",
        "top_k": <optional, default 20>,
        "source_label": "<optional label, e.g. 'Alex Hormozi'>"
    }

    Returns: {persona: {...draft...}, chunks_used, provider_name}
    The draft is NOT saved — the operator reviews and calls
    POST /api/personas to persist.

    Invalid input (missing/non-string name, non-numeric or non-positive
    top_k) and builder failures are reported as ``{"error": "..."}``
    rather than raised.
    """
    raw_name = body.get("name")
    # A non-string name (e.g. a JSON number) is treated like a missing one
    # instead of crashing on .strip().
    name = raw_name.strip() if isinstance(raw_name, str) else ""
    if not name:
        return {"error": "name is required"}

    # Missing, null and 0 all fall back to the default of 20.
    try:
        top_k = int(body.get("top_k") or 20)
    except (TypeError, ValueError):
        return {"error": "top_k must be a positive integer"}
    if top_k < 1:
        return {"error": "top_k must be a positive integer"}

    # The builder calls .strip() on the query, so coerce to str up front.
    search_query = str(body.get("search_query") or "")
    source_label = str(body.get("source_label") or "")

    store = _get_vector_store()
    if not store:
        # No shared store available: open the knowledge DB under ~/.arkaos.
        from core.knowledge.vector_store import VectorStore
        kb_db = Path.home() / ".arkaos" / "knowledge.db"
        kb_db.parent.mkdir(parents=True, exist_ok=True)
        store = VectorStore(kb_db)
    from core.personas.builder import PersonaBuilder, PersonaBuildError
    builder = PersonaBuilder(store)
    try:
        result = builder.generate(
            name=name,
            search_query=search_query,
            top_k=top_k,
            source_label=source_label,
        )
    except PersonaBuildError as exc:
        return {"error": str(exc)}
    return {
        "persona": result.persona.model_dump(),
        "chunks_used": result.chunks_used,
        "provider_name": result.provider_name,
    }
730
+
731
+
648
732
  # --- API Keys ---
649
733
 
650
734
  @app.get("/api/keys")