arkaos 3.78.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -30
- package/VERSION +1 -1
- package/arka/SKILL.md +2 -2
- package/config/agent-allowlists/laravel.yaml +1 -0
- package/config/agent-allowlists/node.yaml +1 -0
- package/config/agent-allowlists/nuxt.yaml +1 -0
- package/config/agent-allowlists/python.yaml +1 -0
- package/core/agents/__pycache__/registry_gen.cpython-313.pyc +0 -0
- package/core/agents/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/agents/registry_gen.py +6 -1
- package/core/agents/schema.py +4 -0
- package/core/cognition/__pycache__/reorganizer.cpython-313.pyc +0 -0
- package/core/cognition/reorganizer.py +37 -7
- package/core/governance/__pycache__/design_system_lint.cpython-313.pyc +0 -0
- package/core/governance/__pycache__/design_system_lint_cli.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/agent_match.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/sources.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
- package/core/knowledge/agent_match.py +114 -0
- package/core/knowledge/chunker.py +45 -0
- package/core/knowledge/ingest.py +156 -78
- package/core/knowledge/sources.py +138 -0
- package/core/knowledge/vector_store.py +52 -0
- package/core/squads/__pycache__/loader.cpython-313.pyc +0 -0
- package/core/squads/loader.py +25 -0
- package/core/sync/__pycache__/agent_provisioner.cpython-313.pyc +0 -0
- package/core/sync/agent_provisioner.py +19 -8
- package/dashboard/app/components/KnowledgeSourcesList.vue +40 -13
- package/dashboard/app/pages/cognition.vue +9 -4
- package/dashboard/app/pages/knowledge/[id].vue +669 -0
- package/dashboard/app/pages/knowledge/index.vue +1281 -0
- package/dashboard/app/types/index.d.ts +1 -1
- package/departments/brand/agents/ux-designer.yaml +15 -1
- package/departments/brand/agents/ux-researcher.yaml +73 -0
- package/departments/brand/agents/ux-strategist.yaml +72 -0
- package/departments/dev/agents/ai-engineering/ai-engineering-lead.yaml +76 -0
- package/departments/dev/agents/architect.yaml +9 -3
- package/departments/dev/agents/backend-core/laravel-eng.yaml +76 -0
- package/departments/dev/agents/backend-core/node-ts-eng.yaml +76 -0
- package/departments/dev/agents/backend-core/python-eng.yaml +76 -0
- package/departments/dev/agents/backend-dev.yaml +10 -4
- package/departments/dev/agents/data-platform/etl-eng.yaml +74 -0
- package/departments/dev/agents/dba.yaml +7 -3
- package/departments/dev/references/backend-knowledge-and-tools.md +70 -0
- package/departments/ecom/agents/retention-manager.yaml +13 -1
- package/departments/leadership/agents/culture-coach.yaml +20 -0
- package/departments/leadership/agents/hr-specialist.yaml +18 -0
- package/departments/leadership/agents/leadership-director.yaml +10 -0
- package/departments/org/agents/chief-of-staff.yaml +76 -0
- package/departments/org/agents/coo.yaml +11 -0
- package/departments/org/agents/okr-steward.yaml +71 -0
- package/departments/org/agents/org-designer.yaml +23 -0
- package/departments/org/skills/okr-cadence/SKILL.md +34 -0
- package/departments/org/skills/principles-audit/SKILL.md +36 -0
- package/departments/pm/agents/pm-director.yaml +21 -8
- package/departments/pm/agents/product-owner.yaml +24 -2
- package/departments/pm/agents/scrum-master.yaml +21 -0
- package/departments/pm/agents/strategic-pm.yaml +72 -0
- package/departments/pm/skills/discovery-plan/SKILL.md +7 -1
- package/departments/quality/agents/cqo.yaml +8 -0
- package/departments/saas/agents/cs-manager.yaml +19 -2
- package/departments/saas/agents/growth-engineer.yaml +14 -1
- package/departments/saas/agents/metrics-analyst.yaml +17 -1
- package/departments/saas/agents/revops-lead.yaml +73 -0
- package/departments/saas/skills/leaky-bucket/SKILL.md +28 -0
- package/departments/saas/skills/voc-loop/SKILL.md +29 -0
- package/departments/sales/agents/sales-director.yaml +9 -0
- package/departments/sales/agents/sdr.yaml +72 -0
- package/departments/strategy/agents/decision-quality.yaml +72 -0
- package/departments/strategy/agents/strategy-director.yaml +13 -0
- package/departments/strategy/skills/premortem/SKILL.md +33 -0
- package/knowledge/agents-registry-v2.json +1218 -78
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/scripts/__pycache__/dashboard-api.cpython-313.pyc +0 -0
- package/scripts/bench/__init__.py +5 -0
- package/scripts/bench/__pycache__/__init__.cpython-313.pyc +0 -0
- package/scripts/bench/__pycache__/harness.cpython-313.pyc +0 -0
- package/scripts/bench/__pycache__/run.cpython-313.pyc +0 -0
- package/scripts/bench/harness.py +138 -0
- package/scripts/bench/run.py +136 -0
- package/scripts/dashboard-api.py +376 -13
- package/scripts/tools/__pycache__/docs_stats.cpython-313.pyc +0 -0
- package/scripts/tools/docs_stats.py +154 -0
- package/dashboard/app/pages/knowledge.vue +0 -918
package/README.md
CHANGED
|
@@ -2,13 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
**The Operating System for AI Agent Teams.**
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
82 agents. 17 departments. 267 skills. Enterprise frameworks. Multi-runtime. One install.
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
8
|
npx arkaos install
|
|
9
9
|
```
|
|
10
10
|
|
|
11
|
-
[](https://www.npmjs.com/package/arkaos) [](LICENSE) [](https://www.npmjs.com/package/arkaos) [](LICENSE) []()
|
|
12
|
+
|
|
13
|
+
> All counts in this document are generated by `python scripts/tools/docs_stats.py`
|
|
14
|
+
> and locked by a test — they cannot drift from the repository.
|
|
12
15
|
|
|
13
16
|
---
|
|
14
17
|
|
|
@@ -99,7 +102,7 @@ In plain language. No special syntax required.
|
|
|
99
102
|
|
|
100
103
|
### 2. ArkaOS routes to the right squad
|
|
101
104
|
|
|
102
|
-
The Synapse engine (
|
|
105
|
+
The Synapse engine (12-layer context injection, ~87ms cold / ~83ms warm — see [Benchmarks](wiki/11-Benchmarks.md)) analyzes your request and routes it to the correct department. Each department has a lead agent who orchestrates specialists.
|
|
103
106
|
|
|
104
107
|
### 3. Agents execute with enterprise frameworks
|
|
105
108
|
|
|
@@ -132,24 +135,27 @@ Every decision, solution, and pattern is captured. The Cognitive Layer curates i
|
|
|
132
135
|
|
|
133
136
|
| Department | Prefix | Agents | What It Does |
|
|
134
137
|
|-----------|--------|--------|-------------|
|
|
135
|
-
| **Development** | `/dev` |
|
|
138
|
+
| **Development** | `/dev` | 15 | Full-stack features, APIs, architecture, security, CI/CD |
|
|
139
|
+
| **Brand & Design** | `/brand` | 10 | Brand identity, UX/UI, design systems, naming |
|
|
136
140
|
| **Marketing** | `/mkt` | 4 | SEO, paid ads, email campaigns, growth loops |
|
|
137
|
-
| **
|
|
138
|
-
| **Finance** | `/fin` | 3 | DCF valuation, unit economics, budgets, investor prep |
|
|
139
|
-
| **Strategy** | `/strat` | 3 | Market analysis, competitive intelligence, business models |
|
|
141
|
+
| **Strategy** | `/strat` | 4 | Market analysis, competitive intelligence, business models |
|
|
140
142
|
| **E-Commerce** | `/ecom` | 4 | Store optimization, CRO, pricing, RFM segmentation |
|
|
141
|
-
| **Knowledge** | `/kb` |
|
|
142
|
-
| **
|
|
143
|
-
| **Project Mgmt** | `/pm` | 3 | Scrum, Shape Up, discovery, roadmaps |
|
|
144
|
-
| **SaaS** | `/saas` | 4 | Idea validation, metrics, PLG strategy, scaffolding |
|
|
145
|
-
| **Landing Pages** | `/landing` | 4 | Sales copy, funnels, offers, page generation |
|
|
143
|
+
| **Knowledge** | `/kb` | 4 | Research, Zettelkasten, persona building, ingestion |
|
|
144
|
+
| **Project Mgmt** | `/pm` | 4 | Scrum, Shape Up, discovery, roadmaps |
|
|
146
145
|
| **Content** | `/content` | 4 | Viral hooks, scripts, repurposing, content calendars |
|
|
147
|
-
| **
|
|
148
|
-
| **
|
|
149
|
-
| **
|
|
150
|
-
| **
|
|
146
|
+
| **Sales** | `/sales` | 4 | Pipeline management, SPIN selling, negotiation |
|
|
147
|
+
| **SaaS** | `/saas` | 5 | Idea validation, metrics, PLG strategy, scaffolding |
|
|
148
|
+
| **Organization** | `/org` | 5 | Org design, team topologies, matrix structure |
|
|
149
|
+
| **Landing Pages** | `/landing` | 4 | Sales copy, funnels, offers, page generation |
|
|
150
|
+
| **Finance** | `/fin` | 3 | DCF valuation, unit economics, budgets, investor prep |
|
|
151
|
+
| **Operations** | `/ops` | 3 | Automation, SOPs, compliance (GDPR, ISO, SOC 2) |
|
|
152
|
+
| **Communities** | `/community` | 3 | Groups, membership, gamification, engagement |
|
|
153
|
+
| **Leadership** | `/lead` | 3 | Team health, OKRs, culture, hiring frameworks |
|
|
151
154
|
| **Quality Gate** | (auto) | 3 | Mandatory review on every workflow. Veto power. |
|
|
152
155
|
|
|
156
|
+
> 82 agents across 17 departments (81 unique; `cro-specialist` is shared by
|
|
157
|
+
> E-Commerce and Landing in the matrix structure).
|
|
158
|
+
|
|
153
159
|
---
|
|
154
160
|
|
|
155
161
|
## Cognitive Layer (v2.10)
|
|
@@ -386,7 +392,7 @@ python scripts/tools/okr_cascade.py growth --json
|
|
|
386
392
|
User Input
|
|
387
393
|
│
|
|
388
394
|
▼
|
|
389
|
-
Synapse v2 (
|
|
395
|
+
Synapse v2 (12-layer context injection, ~87ms cold / ~83ms warm)
|
|
390
396
|
│
|
|
391
397
|
▼
|
|
392
398
|
Orchestrator (/do → department routing)
|
|
@@ -408,13 +414,13 @@ Output (Obsidian vault + structured deliverables)
|
|
|
408
414
|
|
|
409
415
|
| System | Purpose |
|
|
410
416
|
|--------|---------|
|
|
411
|
-
| **Synapse v2** |
|
|
417
|
+
| **Synapse v2** | 12-layer context injection (~87ms cold, ~83ms warm; cacheable layers are sub-millisecond) |
|
|
412
418
|
| **Workflow Engine** | YAML workflows with phases, gates, parallelization |
|
|
413
419
|
| **Agent Schema** | 4-framework behavioral DNA with consistency validation |
|
|
414
420
|
| **Squad Framework** | Department squads + ad-hoc project squads (matrix) |
|
|
415
421
|
| **Cognitive Layer** | Memory, Dreaming, Research, Scheduler |
|
|
416
422
|
| **Living Specs** | Bidirectional spec/code sync |
|
|
417
|
-
| **Governance** | Constitution with
|
|
423
|
+
| **Governance** | Constitution with 25 non-negotiable rules (+ 11 must, 8 should) |
|
|
418
424
|
| **Multi-Runtime** | Claude Code, Codex, Gemini, Cursor adapters |
|
|
419
425
|
|
|
420
426
|
### Tech Stack
|
|
@@ -427,22 +433,28 @@ Output (Obsidian vault + structured deliverables)
|
|
|
427
433
|
| Workflows | YAML |
|
|
428
434
|
| Agent Definitions | YAML |
|
|
429
435
|
| Knowledge | Obsidian + SQLite-VSS |
|
|
430
|
-
| Tests | pytest (
|
|
436
|
+
| Tests | pytest (4,500+ tests) |
|
|
431
437
|
|
|
432
438
|
---
|
|
433
439
|
|
|
434
440
|
## Documentation
|
|
435
441
|
|
|
436
|
-
Full documentation
|
|
442
|
+
Full documentation lives in two places in this repository:
|
|
443
|
+
|
|
444
|
+
**[`wiki/`](wiki/Home.md)** — the user-facing guide (step-by-step, features, benchmarks):
|
|
445
|
+
|
|
446
|
+
- [Home](wiki/Home.md) — the index of everything
|
|
447
|
+
- [Getting Started](wiki/01-Getting-Started.md) — install and run your first command
|
|
448
|
+
- [Core Concepts](wiki/02-Core-Concepts.md) — squads, agents, tiers, behavioral DNA
|
|
449
|
+
- [The 13-Phase Flow](wiki/03-The-13-Phase-Flow.md) — how every request is handled
|
|
450
|
+
- [Departments](wiki/04-Departments/) — one page per department
|
|
451
|
+
- [Commands Reference](wiki/05-Commands-Reference.md)
|
|
452
|
+
- [Cognitive Layer](wiki/06-Cognitive-Layer.md) — memory, dreaming, research
|
|
453
|
+
- [Benchmarks](wiki/11-Benchmarks.md) — measured, reproducible numbers
|
|
454
|
+
- [Competitive Analysis](wiki/12-Competitive-Analysis.md) and [Benefits & ROI](wiki/13-Benefits-ROI.md)
|
|
437
455
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
- [Departments & Agents](https://github.com/andreagroferreira/arka-os/wiki/Departments)
|
|
441
|
-
- [Cognitive Layer](https://github.com/andreagroferreira/arka-os/wiki/Cognitive-Layer)
|
|
442
|
-
- [Ecosystem Management](https://github.com/andreagroferreira/arka-os/wiki/Ecosystems)
|
|
443
|
-
- [Configuration](https://github.com/andreagroferreira/arka-os/wiki/Configuration)
|
|
444
|
-
- [Creating Projects](https://github.com/andreagroferreira/arka-os/wiki/Creating-Projects)
|
|
445
|
-
- [Update & Sync](https://github.com/andreagroferreira/arka-os/wiki/Update-and-Sync)
|
|
456
|
+
**[`docs/`](docs/)** — the technical/contributor reference (architecture, API,
|
|
457
|
+
agent schema, core engine, ADRs).
|
|
446
458
|
|
|
447
459
|
---
|
|
448
460
|
|
|
@@ -493,7 +505,7 @@ Department commands: `/dev`, `/mkt`, `/brand`, `/fin`, `/strat`, `/ecom`, `/kb`,
|
|
|
493
505
|
|
|
494
506
|
## Contributing
|
|
495
507
|
|
|
496
|
-
See [CONTRIBUTING.md](.github/CONTRIBUTING.md). PRs welcome — all changes require passing the full test suite (
|
|
508
|
+
See [CONTRIBUTING.md](.github/CONTRIBUTING.md). PRs welcome — all changes require passing the full test suite (4,500+ tests as of v4.0.0) and Quality Gate review (Marta CQO + Eduardo Copy + Francisca Tech, all Opus).
|
|
497
509
|
|
|
498
510
|
## License
|
|
499
511
|
|
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
4.0.1
|
package/arka/SKILL.md
CHANGED
|
@@ -21,10 +21,10 @@ treat them as your default source. External research supplements, it
|
|
|
21
21
|
does not replace the vault.
|
|
22
22
|
<!-- arka:kb-first-prefix end -->
|
|
23
23
|
|
|
24
|
-
# ArkaOS
|
|
24
|
+
# ArkaOS — Main Orchestrator
|
|
25
25
|
|
|
26
26
|
> **The Operating System for AI Agent Teams**
|
|
27
|
-
>
|
|
27
|
+
> 82 agents. 17 departments. 267 skills. Multi-runtime. Dashboard. Knowledge RAG.
|
|
28
28
|
|
|
29
29
|
## ⛔ Mandatory 13-phase flow (NON-NEGOTIABLE)
|
|
30
30
|
|
|
Binary file
|
|
Binary file
|
|
@@ -27,7 +27,9 @@ def generate_registry(departments_dir: str | Path, output_path: str | Path) -> d
|
|
|
27
27
|
agents = []
|
|
28
28
|
errors = []
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
# Recursive: also picks up sub-squad subdirectories
|
|
31
|
+
# (e.g. dev/agents/backend-core/*.yaml, brand/agents/design-ops/*.yaml).
|
|
32
|
+
for yaml_file in sorted(departments_dir.glob("*/agents/**/*.yaml")):
|
|
31
33
|
try:
|
|
32
34
|
agent = load_agent(yaml_file)
|
|
33
35
|
entry = {
|
|
@@ -36,6 +38,8 @@ def generate_registry(departments_dir: str | Path, output_path: str | Path) -> d
|
|
|
36
38
|
"role": agent.role,
|
|
37
39
|
"department": agent.department,
|
|
38
40
|
"tier": agent.tier,
|
|
41
|
+
"parent_squad": agent.parent_squad,
|
|
42
|
+
"sub_squad_role": agent.sub_squad_role,
|
|
39
43
|
"disc": {
|
|
40
44
|
"primary": agent.behavioral_dna.disc.primary.value,
|
|
41
45
|
"secondary": agent.behavioral_dna.disc.secondary.value,
|
|
@@ -60,6 +64,7 @@ def generate_registry(departments_dir: str | Path, output_path: str | Path) -> d
|
|
|
60
64
|
},
|
|
61
65
|
"expertise_domains": agent.expertise.domains[:5],
|
|
62
66
|
"frameworks": agent.expertise.frameworks[:5],
|
|
67
|
+
"knowledge_sources": agent.expertise.knowledge_sources,
|
|
63
68
|
"file": str(yaml_file.relative_to(departments_dir.parent)),
|
|
64
69
|
"memory_path": agent.memory_path,
|
|
65
70
|
}
|
package/core/agents/schema.py
CHANGED
|
@@ -237,6 +237,10 @@ class Expertise(BaseModel):
|
|
|
237
237
|
"""What the agent knows."""
|
|
238
238
|
domains: list[str] = Field(default_factory=list)
|
|
239
239
|
frameworks: list[str] = Field(default_factory=list)
|
|
240
|
+
knowledge_sources: list[str] = Field(
|
|
241
|
+
default_factory=list,
|
|
242
|
+
description="Obsidian KB notes ([[wikilinks]]) that ground this agent's expertise.",
|
|
243
|
+
)
|
|
240
244
|
depth: str = "expert" # novice, intermediate, expert, master
|
|
241
245
|
years_equivalent: int = 10
|
|
242
246
|
|
|
Binary file
|
|
@@ -291,26 +291,56 @@ def _safe_int(value: object) -> int:
|
|
|
291
291
|
return 0
|
|
292
292
|
|
|
293
293
|
|
|
294
|
-
def
|
|
294
|
+
def redact_clients(text: str) -> str:
|
|
295
|
+
"""Redact configured client identifiers from arbitrary text.
|
|
296
|
+
|
|
297
|
+
Public, reusable wrapper around the module's compiled redaction regex
|
|
298
|
+
(loaded from ``~/.arkaos/redaction-clients.json``). Returns the text
|
|
299
|
+
unchanged when no patterns are configured. Used by any propose-only
|
|
300
|
+
producer — the reorganizer report and the agent-attribution proposal —
|
|
301
|
+
so client names never leak into a generated artifact.
|
|
302
|
+
"""
|
|
295
303
|
if _REDACT_RE is None:
|
|
296
304
|
return text
|
|
297
305
|
return _REDACT_RE.sub(_REDACT_TOKEN, text)
|
|
298
306
|
|
|
299
307
|
|
|
300
|
-
def
|
|
301
|
-
|
|
308
|
+
def _redact(text: str) -> str:
|
|
309
|
+
return redact_clients(text)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def md_escape(text: str) -> str:
|
|
313
|
+
"""Neutralise untrusted text for an HTML-rendering markdown viewer.
|
|
302
314
|
|
|
303
|
-
Titles and excerpts come from
|
|
304
|
-
|
|
305
|
-
|
|
315
|
+
Public, reusable wrapper. Titles and excerpts come from untrusted
|
|
316
|
+
sources (web page titles, YouTube/PDF metadata) and land in .md files
|
|
317
|
+
that the user opens in viewers such as Obsidian, which render raw HTML
|
|
318
|
+
inside markdown. To prevent stored HTML/JS injection (CWE-79) we
|
|
319
|
+
HTML-escape ``<``/``>`` to ``&lt;``/``&gt;`` so any tag renders as
|
|
320
|
+
literal text. We also escape pipes/backslashes (a ``|`` silently shifts
|
|
321
|
+
table columns), strip backticks, and flatten newlines so an untrusted
|
|
322
|
+
title can never distort or execute inside the rendered artifact. Used by
|
|
323
|
+
the reorganizer report and the agent-attribution proposal.
|
|
324
|
+
|
|
325
|
+
Callers redact client names *before* escaping, so the trusted
|
|
326
|
+
``<redacted-client>`` marker may already be present; it is preserved
|
|
327
|
+
verbatim while every other angle bracket is neutralised.
|
|
306
328
|
"""
|
|
307
|
-
|
|
329
|
+
escaped = (
|
|
308
330
|
text.replace("\\", "\\\\")
|
|
331
|
+
.replace("<", "&lt;")
|
|
332
|
+
.replace(">", "&gt;")
|
|
309
333
|
.replace("|", "\\|")
|
|
310
334
|
.replace("\n", " ")
|
|
311
335
|
.replace("\r", " ")
|
|
312
336
|
.replace("`", "")
|
|
313
337
|
)
|
|
338
|
+
escaped_token = _REDACT_TOKEN.replace("<", "&lt;").replace(">", "&gt;")
|
|
339
|
+
return escaped.replace(escaped_token, _REDACT_TOKEN)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _md_escape(text: str) -> str:
|
|
343
|
+
return md_escape(text)
|
|
314
344
|
|
|
315
345
|
|
|
316
346
|
def _body_excerpt(body: str) -> str:
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Semantic agent attribution for a knowledge source (PR3).
|
|
2
|
+
|
|
3
|
+
Given the knowledge text of a source, suggest WHICH agents should learn
|
|
4
|
+
from it by comparing the source text against each agent's expertise
|
|
5
|
+
profile via local embeddings (``core.knowledge.embedder``).
|
|
6
|
+
|
|
7
|
+
Pure and propose-only: this module reads agent dicts passed in by the
|
|
8
|
+
caller and NEVER writes agent YAMLs. It degrades gracefully — when the
|
|
9
|
+
embedder is unavailable (fastembed missing) or the source text is empty
|
|
10
|
+
it returns an empty list, and the caller surfaces a reason.
|
|
11
|
+
|
|
12
|
+
The registry stores ``expertise_domains`` and ``frameworks`` as flat
|
|
13
|
+
list keys on each agent dict (not nested under an ``expertise`` object).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import math
|
|
19
|
+
|
|
20
|
+
from core.knowledge import embedder
|
|
21
|
+
|
|
22
|
+
_PROFILE_FIELDS = ("expertise_domains", "frameworks")
|
|
23
|
+
_MAX_MATCHED_TERMS = 5
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def agent_profile_text(agent: dict) -> str:
|
|
27
|
+
"""Build one searchable string from an agent's role + expertise.
|
|
28
|
+
|
|
29
|
+
Concatenates role, expertise domains, frameworks, and (optionally)
|
|
30
|
+
name into a single space-joined string suitable for embedding. Empty
|
|
31
|
+
fields are skipped so a sparse agent still yields a clean profile.
|
|
32
|
+
"""
|
|
33
|
+
parts: list[str] = []
|
|
34
|
+
name = str(agent.get("name") or "").strip()
|
|
35
|
+
role = str(agent.get("role") or "").strip()
|
|
36
|
+
if role:
|
|
37
|
+
parts.append(role)
|
|
38
|
+
for field in _PROFILE_FIELDS:
|
|
39
|
+
parts.extend(str(v).strip() for v in agent.get(field) or [] if str(v).strip())
|
|
40
|
+
if name:
|
|
41
|
+
parts.append(name)
|
|
42
|
+
return " ".join(parts)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def cosine(a: list[float], b: list[float]) -> float:
|
|
46
|
+
"""Cosine similarity of two vectors; 0.0 if either is empty/zero-norm."""
|
|
47
|
+
if not a or not b or len(a) != len(b):
|
|
48
|
+
return 0.0
|
|
49
|
+
dot = sum(x * y for x, y in zip(a, b))
|
|
50
|
+
norm_a = math.sqrt(sum(x * x for x in a))
|
|
51
|
+
norm_b = math.sqrt(sum(y * y for y in b))
|
|
52
|
+
if norm_a == 0.0 or norm_b == 0.0:
|
|
53
|
+
return 0.0
|
|
54
|
+
return dot / (norm_a * norm_b)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _profile_texts(agents: list[dict]) -> list[str]:
|
|
58
|
+
"""Profile text for each agent, preserving order."""
|
|
59
|
+
return [agent_profile_text(agent) for agent in agents]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _explain_match(source_text: str, agent: dict) -> list[str]:
|
|
63
|
+
"""Up to 5 expertise/framework terms that textually appear in source.
|
|
64
|
+
|
|
65
|
+
A cheap, case-insensitive substring "why" explanation — independent of
|
|
66
|
+
the embedding similarity — so the proposal can show concrete overlap.
|
|
67
|
+
"""
|
|
68
|
+
haystack = source_text.lower()
|
|
69
|
+
matched: list[str] = []
|
|
70
|
+
for field in _PROFILE_FIELDS:
|
|
71
|
+
for term in agent.get(field) or []:
|
|
72
|
+
clean = str(term).strip()
|
|
73
|
+
if clean and clean.lower() in haystack and clean not in matched:
|
|
74
|
+
matched.append(clean)
|
|
75
|
+
if len(matched) >= _MAX_MATCHED_TERMS:
|
|
76
|
+
return matched
|
|
77
|
+
return matched
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _build_result(source_text: str, agent: dict, score: float) -> dict:
|
|
81
|
+
"""Shape one ranked match for the API response."""
|
|
82
|
+
return {
|
|
83
|
+
"id": agent.get("id", ""),
|
|
84
|
+
"name": agent.get("name", ""),
|
|
85
|
+
"department": agent.get("department", ""),
|
|
86
|
+
"role": agent.get("role", ""),
|
|
87
|
+
"score": round(score, 3),
|
|
88
|
+
"matched_terms": _explain_match(source_text, agent),
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def match_agents(source_text: str, agents: list[dict], top_n: int = 5) -> list[dict]:
|
|
93
|
+
"""Rank agents by semantic similarity of their expertise to the source.
|
|
94
|
+
|
|
95
|
+
Returns ``[]`` when ``source_text`` is empty or the embedder is
|
|
96
|
+
unavailable (caller surfaces a reason). Embeds the source once and the
|
|
97
|
+
agent profiles in a single batch, then sorts by cosine descending and
|
|
98
|
+
returns the top ``top_n`` results, each with id/name/department/role/
|
|
99
|
+
score (0..1, 3dp) and matched_terms. Never writes anything.
|
|
100
|
+
"""
|
|
101
|
+
if not source_text.strip() or not agents:
|
|
102
|
+
return []
|
|
103
|
+
source_vec = embedder.embed(source_text)
|
|
104
|
+
if source_vec is None:
|
|
105
|
+
return []
|
|
106
|
+
agent_vecs = embedder.embed_batch(_profile_texts(agents))
|
|
107
|
+
if agent_vecs is None:
|
|
108
|
+
return []
|
|
109
|
+
scored = [
|
|
110
|
+
_build_result(source_text, agent, cosine(source_vec, vec))
|
|
111
|
+
for agent, vec in zip(agents, agent_vecs)
|
|
112
|
+
]
|
|
113
|
+
scored.sort(key=lambda r: r["score"], reverse=True)
|
|
114
|
+
return scored[: max(top_n, 0)]
|
|
@@ -119,3 +119,48 @@ def chunk_markdown(
|
|
|
119
119
|
))
|
|
120
120
|
|
|
121
121
|
return chunks
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Minimum contiguous token run treated as a real overlap. The chunker seeds
|
|
125
|
+
# each chunk with the previous chunk's last ``overlap_tokens`` words (default
|
|
126
|
+
# 50), so genuine seams are tens of tokens long. Requiring >= 5 avoids
|
|
127
|
+
# stripping on a 1-2 word coincidence (e.g. both chunks ending/starting "the").
|
|
128
|
+
_MIN_OVERLAP_TOKENS = 5
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _overlap_token_count(prev: list[str], cur: list[str], window: int) -> int:
|
|
132
|
+
"""Return length of the longest suffix of ``prev`` that prefixes ``cur``.
|
|
133
|
+
|
|
134
|
+
Compares whitespace tokens (never mid-word). Searches the largest possible
|
|
135
|
+
overlap first within ``window`` and returns the first match, so the result
|
|
136
|
+
is the true chunker overlap window rather than a short coincidence. Returns
|
|
137
|
+
0 when no run of at least ``_MIN_OVERLAP_TOKENS`` tokens matches.
|
|
138
|
+
"""
|
|
139
|
+
max_len = min(len(prev), len(cur), window)
|
|
140
|
+
for length in range(max_len, _MIN_OVERLAP_TOKENS - 1, -1):
|
|
141
|
+
if prev[-length:] == cur[:length]:
|
|
142
|
+
return length
|
|
143
|
+
return 0
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def stitch_chunks(texts: list[str], max_overlap_tokens: int = 200) -> str:
|
|
147
|
+
"""Re-join overlapping chunks into a clean transcript, deduping seams.
|
|
148
|
+
|
|
149
|
+
``chunk_markdown`` prepends each chunk with a token-overlap window copied
|
|
150
|
+
from the previous chunk. Naively joining the chunks therefore repeats that
|
|
151
|
+
window at every boundary. This detects the actual overlap per adjacent pair
|
|
152
|
+
(longest suffix-of-prev == prefix-of-cur, on token boundaries, capped at
|
|
153
|
+
``max_overlap_tokens``) and strips it from the later chunk before joining
|
|
154
|
+
with blank lines. No overlap detected -> plain join, so content is never
|
|
155
|
+
lost. Single chunk returns as-is; empty list returns "".
|
|
156
|
+
"""
|
|
157
|
+
parts = [t for t in texts if t]
|
|
158
|
+
if not parts:
|
|
159
|
+
return ""
|
|
160
|
+
result = [parts[0]]
|
|
161
|
+
for cur in parts[1:]:
|
|
162
|
+
prev_tokens = result[-1].split()
|
|
163
|
+
cur_tokens = cur.split()
|
|
164
|
+
overlap = _overlap_token_count(prev_tokens, cur_tokens, max_overlap_tokens)
|
|
165
|
+
result.append(" ".join(cur_tokens[overlap:]) if overlap else cur)
|
|
166
|
+
return "\n\n".join(p for p in result if p)
|