arkaos 3.78.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/README.md +42 -30
  2. package/VERSION +1 -1
  3. package/arka/SKILL.md +2 -2
  4. package/config/agent-allowlists/laravel.yaml +1 -0
  5. package/config/agent-allowlists/node.yaml +1 -0
  6. package/config/agent-allowlists/nuxt.yaml +1 -0
  7. package/config/agent-allowlists/python.yaml +1 -0
  8. package/core/agents/__pycache__/registry_gen.cpython-313.pyc +0 -0
  9. package/core/agents/__pycache__/schema.cpython-313.pyc +0 -0
  10. package/core/agents/registry_gen.py +6 -1
  11. package/core/agents/schema.py +4 -0
  12. package/core/cognition/__pycache__/reorganizer.cpython-313.pyc +0 -0
  13. package/core/cognition/reorganizer.py +37 -7
  14. package/core/governance/__pycache__/design_system_lint.cpython-313.pyc +0 -0
  15. package/core/governance/__pycache__/design_system_lint_cli.cpython-313.pyc +0 -0
  16. package/core/knowledge/__pycache__/agent_match.cpython-313.pyc +0 -0
  17. package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
  18. package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
  19. package/core/knowledge/__pycache__/sources.cpython-313.pyc +0 -0
  20. package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
  21. package/core/knowledge/agent_match.py +114 -0
  22. package/core/knowledge/chunker.py +45 -0
  23. package/core/knowledge/ingest.py +156 -78
  24. package/core/knowledge/sources.py +138 -0
  25. package/core/knowledge/vector_store.py +52 -0
  26. package/core/squads/__pycache__/loader.cpython-313.pyc +0 -0
  27. package/core/squads/loader.py +25 -0
  28. package/core/sync/__pycache__/agent_provisioner.cpython-313.pyc +0 -0
  29. package/core/sync/agent_provisioner.py +19 -8
  30. package/dashboard/app/components/KnowledgeSourcesList.vue +40 -13
  31. package/dashboard/app/pages/cognition.vue +9 -4
  32. package/dashboard/app/pages/knowledge/[id].vue +669 -0
  33. package/dashboard/app/pages/knowledge/index.vue +1281 -0
  34. package/dashboard/app/types/index.d.ts +1 -1
  35. package/departments/brand/agents/ux-designer.yaml +15 -1
  36. package/departments/brand/agents/ux-researcher.yaml +73 -0
  37. package/departments/brand/agents/ux-strategist.yaml +72 -0
  38. package/departments/dev/agents/ai-engineering/ai-engineering-lead.yaml +76 -0
  39. package/departments/dev/agents/architect.yaml +9 -3
  40. package/departments/dev/agents/backend-core/laravel-eng.yaml +76 -0
  41. package/departments/dev/agents/backend-core/node-ts-eng.yaml +76 -0
  42. package/departments/dev/agents/backend-core/python-eng.yaml +76 -0
  43. package/departments/dev/agents/backend-dev.yaml +10 -4
  44. package/departments/dev/agents/data-platform/etl-eng.yaml +74 -0
  45. package/departments/dev/agents/dba.yaml +7 -3
  46. package/departments/dev/references/backend-knowledge-and-tools.md +70 -0
  47. package/departments/ecom/agents/retention-manager.yaml +13 -1
  48. package/departments/leadership/agents/culture-coach.yaml +20 -0
  49. package/departments/leadership/agents/hr-specialist.yaml +18 -0
  50. package/departments/leadership/agents/leadership-director.yaml +10 -0
  51. package/departments/org/agents/chief-of-staff.yaml +76 -0
  52. package/departments/org/agents/coo.yaml +11 -0
  53. package/departments/org/agents/okr-steward.yaml +71 -0
  54. package/departments/org/agents/org-designer.yaml +23 -0
  55. package/departments/org/skills/okr-cadence/SKILL.md +34 -0
  56. package/departments/org/skills/principles-audit/SKILL.md +36 -0
  57. package/departments/pm/agents/pm-director.yaml +21 -8
  58. package/departments/pm/agents/product-owner.yaml +24 -2
  59. package/departments/pm/agents/scrum-master.yaml +21 -0
  60. package/departments/pm/agents/strategic-pm.yaml +72 -0
  61. package/departments/pm/skills/discovery-plan/SKILL.md +7 -1
  62. package/departments/quality/agents/cqo.yaml +8 -0
  63. package/departments/saas/agents/cs-manager.yaml +19 -2
  64. package/departments/saas/agents/growth-engineer.yaml +14 -1
  65. package/departments/saas/agents/metrics-analyst.yaml +17 -1
  66. package/departments/saas/agents/revops-lead.yaml +73 -0
  67. package/departments/saas/skills/leaky-bucket/SKILL.md +28 -0
  68. package/departments/saas/skills/voc-loop/SKILL.md +29 -0
  69. package/departments/sales/agents/sales-director.yaml +9 -0
  70. package/departments/sales/agents/sdr.yaml +72 -0
  71. package/departments/strategy/agents/decision-quality.yaml +72 -0
  72. package/departments/strategy/agents/strategy-director.yaml +13 -0
  73. package/departments/strategy/skills/premortem/SKILL.md +33 -0
  74. package/knowledge/agents-registry-v2.json +1218 -78
  75. package/package.json +1 -1
  76. package/pyproject.toml +1 -1
  77. package/scripts/__pycache__/dashboard-api.cpython-313.pyc +0 -0
  78. package/scripts/bench/__init__.py +5 -0
  79. package/scripts/bench/__pycache__/__init__.cpython-313.pyc +0 -0
  80. package/scripts/bench/__pycache__/harness.cpython-313.pyc +0 -0
  81. package/scripts/bench/__pycache__/run.cpython-313.pyc +0 -0
  82. package/scripts/bench/harness.py +138 -0
  83. package/scripts/bench/run.py +136 -0
  84. package/scripts/dashboard-api.py +376 -13
  85. package/scripts/tools/__pycache__/docs_stats.cpython-313.pyc +0 -0
  86. package/scripts/tools/docs_stats.py +154 -0
  87. package/dashboard/app/pages/knowledge.vue +0 -918
package/README.md CHANGED
@@ -2,13 +2,16 @@
2
2
 
3
3
  **The Operating System for AI Agent Teams.**
4
4
 
5
- 106 agents. 17 departments. 250+ skills. Enterprise frameworks. Multi-runtime. One install.
5
+ 82 agents. 17 departments. 267 skills. Enterprise frameworks. Multi-runtime. One install.
6
6
 
7
7
  ```bash
8
8
  npx arkaos install
9
9
  ```
10
10
 
11
- [![npm](https://img.shields.io/npm/v/arkaos)](https://www.npmjs.com/package/arkaos) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) [![Tests](https://img.shields.io/badge/tests-3025%20passing-brightgreen)]()
11
+ [![npm](https://img.shields.io/npm/v/arkaos)](https://www.npmjs.com/package/arkaos) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) [![Tests](https://img.shields.io/badge/tests-4500%2B%20passing-brightgreen)]()
12
+
13
+ > All counts in this document are generated by `python scripts/tools/docs_stats.py`
14
+ > and locked by a test — they cannot drift from the repository.
12
15
 
13
16
  ---
14
17
 
@@ -99,7 +102,7 @@ In plain language. No special syntax required.
99
102
 
100
103
  ### 2. ArkaOS routes to the right squad
101
104
 
102
- The Synapse engine (8-layer context injection in <1ms) analyzes your request and routes it to the correct department. Each department has a lead agent who orchestrates specialists.
105
+ The Synapse engine (12-layer context injection, ~87ms cold / ~83ms warm — see [Benchmarks](wiki/11-Benchmarks.md)) analyzes your request and routes it to the correct department. Each department has a lead agent who orchestrates specialists.
103
106
 
104
107
  ### 3. Agents execute with enterprise frameworks
105
108
 
@@ -132,24 +135,27 @@ Every decision, solution, and pattern is captured. The Cognitive Layer curates i
132
135
 
133
136
  | Department | Prefix | Agents | What It Does |
134
137
  |-----------|--------|--------|-------------|
135
- | **Development** | `/dev` | 10 | Full-stack features, APIs, architecture, security, CI/CD |
138
+ | **Development** | `/dev` | 15 | Full-stack features, APIs, architecture, security, CI/CD |
139
+ | **Brand & Design** | `/brand` | 10 | Brand identity, UX/UI, design systems, naming |
136
140
  | **Marketing** | `/mkt` | 4 | SEO, paid ads, email campaigns, growth loops |
137
- | **Brand & Design** | `/brand` | 4 | Brand identity, UX/UI, design systems, naming |
138
- | **Finance** | `/fin` | 3 | DCF valuation, unit economics, budgets, investor prep |
139
- | **Strategy** | `/strat` | 3 | Market analysis, competitive intelligence, business models |
141
+ | **Strategy** | `/strat` | 4 | Market analysis, competitive intelligence, business models |
140
142
  | **E-Commerce** | `/ecom` | 4 | Store optimization, CRO, pricing, RFM segmentation |
141
- | **Knowledge** | `/kb` | 3 | Research, Zettelkasten, persona building, ingestion |
142
- | **Operations** | `/ops` | 4 | Automation, SOPs, compliance (GDPR, ISO, SOC 2) |
143
- | **Project Mgmt** | `/pm` | 3 | Scrum, Shape Up, discovery, roadmaps |
144
- | **SaaS** | `/saas` | 4 | Idea validation, metrics, PLG strategy, scaffolding |
145
- | **Landing Pages** | `/landing` | 4 | Sales copy, funnels, offers, page generation |
143
+ | **Knowledge** | `/kb` | 4 | Research, Zettelkasten, persona building, ingestion |
144
+ | **Project Mgmt** | `/pm` | 4 | Scrum, Shape Up, discovery, roadmaps |
146
145
  | **Content** | `/content` | 4 | Viral hooks, scripts, repurposing, content calendars |
147
- | **Communities** | `/community` | 2 | Groups, membership, gamification, engagement |
148
- | **Sales** | `/sales` | 2 | Pipeline management, SPIN selling, negotiation |
149
- | **Leadership** | `/lead` | 2 | Team health, OKRs, culture, hiring frameworks |
150
- | **Organization** | `/org` | 1 | Org design, team topologies, matrix structure |
146
+ | **Sales** | `/sales` | 4 | Pipeline management, SPIN selling, negotiation |
147
+ | **SaaS** | `/saas` | 5 | Idea validation, metrics, PLG strategy, scaffolding |
148
+ | **Organization** | `/org` | 5 | Org design, team topologies, matrix structure |
149
+ | **Landing Pages** | `/landing` | 4 | Sales copy, funnels, offers, page generation |
150
+ | **Finance** | `/fin` | 3 | DCF valuation, unit economics, budgets, investor prep |
151
+ | **Operations** | `/ops` | 3 | Automation, SOPs, compliance (GDPR, ISO, SOC 2) |
152
+ | **Communities** | `/community` | 3 | Groups, membership, gamification, engagement |
153
+ | **Leadership** | `/lead` | 3 | Team health, OKRs, culture, hiring frameworks |
151
154
  | **Quality Gate** | (auto) | 3 | Mandatory review on every workflow. Veto power. |
152
155
 
156
+ > 82 agents across 17 departments (81 unique; `cro-specialist` is shared by
157
+ > E-Commerce and Landing in the matrix structure).
158
+
153
159
  ---
154
160
 
155
161
  ## Cognitive Layer (v2.10)
@@ -386,7 +392,7 @@ python scripts/tools/okr_cascade.py growth --json
386
392
  User Input
387
393
 
388
394
 
389
- Synapse v2 (8-layer context injection, <1ms, cached)
395
+ Synapse v2 (12-layer context injection, ~87ms cold / ~83ms warm)
390
396
 
391
397
 
392
398
  Orchestrator (/do → department routing)
@@ -408,13 +414,13 @@ Output (Obsidian vault + structured deliverables)
408
414
 
409
415
  | System | Purpose |
410
416
  |--------|---------|
411
- | **Synapse v2** | 8-layer context injection (<1ms, with caching) |
417
+ | **Synapse v2** | 12-layer context injection (~87ms cold, ~83ms warm; cacheable layers are sub-millisecond) |
412
418
  | **Workflow Engine** | YAML workflows with phases, gates, parallelization |
413
419
  | **Agent Schema** | 4-framework behavioral DNA with consistency validation |
414
420
  | **Squad Framework** | Department squads + ad-hoc project squads (matrix) |
415
421
  | **Cognitive Layer** | Memory, Dreaming, Research, Scheduler |
416
422
  | **Living Specs** | Bidirectional spec/code sync |
417
- | **Governance** | Constitution with 14 non-negotiable rules |
423
+ | **Governance** | Constitution with 25 non-negotiable rules (+ 11 must, 8 should) |
418
424
  | **Multi-Runtime** | Claude Code, Codex, Gemini, Cursor adapters |
419
425
 
420
426
  ### Tech Stack
@@ -427,22 +433,28 @@ Output (Obsidian vault + structured deliverables)
427
433
  | Workflows | YAML |
428
434
  | Agent Definitions | YAML |
429
435
  | Knowledge | Obsidian + SQLite-VSS |
430
- | Tests | pytest (1,993 tests) |
436
+ | Tests | pytest (4,500+ tests) |
431
437
 
432
438
  ---
433
439
 
434
440
  ## Documentation
435
441
 
436
- Full documentation is available on the **[GitHub Wiki](https://github.com/andreagroferreira/arka-os/wiki)**:
442
+ Full documentation lives in two places in this repository:
443
+
444
+ **[`wiki/`](wiki/Home.md)** — the user-facing guide (step-by-step, features, benchmarks):
445
+
446
+ - [Home](wiki/Home.md) — the index of everything
447
+ - [Getting Started](wiki/01-Getting-Started.md) — install and run your first command
448
+ - [Core Concepts](wiki/02-Core-Concepts.md) — squads, agents, tiers, behavioral DNA
449
+ - [The 13-Phase Flow](wiki/03-The-13-Phase-Flow.md) — how every request is handled
450
+ - [Departments](wiki/04-Departments/) — one page per department
451
+ - [Commands Reference](wiki/05-Commands-Reference.md)
452
+ - [Cognitive Layer](wiki/06-Cognitive-Layer.md) — memory, dreaming, research
453
+ - [Benchmarks](wiki/11-Benchmarks.md) — measured, reproducible numbers
454
+ - [Competitive Analysis](wiki/12-Competitive-Analysis.md) and [Benefits & ROI](wiki/13-Benefits-ROI.md)
437
455
 
438
- - [Getting Started](https://github.com/andreagroferreira/arka-os/wiki/Getting-Started)
439
- - [Installation Guide](https://github.com/andreagroferreira/arka-os/wiki/Installation)
440
- - [Departments & Agents](https://github.com/andreagroferreira/arka-os/wiki/Departments)
441
- - [Cognitive Layer](https://github.com/andreagroferreira/arka-os/wiki/Cognitive-Layer)
442
- - [Ecosystem Management](https://github.com/andreagroferreira/arka-os/wiki/Ecosystems)
443
- - [Configuration](https://github.com/andreagroferreira/arka-os/wiki/Configuration)
444
- - [Creating Projects](https://github.com/andreagroferreira/arka-os/wiki/Creating-Projects)
445
- - [Update & Sync](https://github.com/andreagroferreira/arka-os/wiki/Update-and-Sync)
456
+ **[`docs/`](docs/)** — the technical/contributor reference (architecture, API,
457
+ agent schema, core engine, ADRs).
446
458
 
447
459
  ---
448
460
 
@@ -493,7 +505,7 @@ Department commands: `/dev`, `/mkt`, `/brand`, `/fin`, `/strat`, `/ecom`, `/kb`,
493
505
 
494
506
  ## Contributing
495
507
 
496
- See [CONTRIBUTING.md](.github/CONTRIBUTING.md). PRs welcome — all changes require passing the full test suite (3,473+ tests as of v2.46.x) and Quality Gate review (Marta CQO + Eduardo Copy + Francisca Tech, all Opus).
508
+ See [CONTRIBUTING.md](.github/CONTRIBUTING.md). PRs welcome — all changes require passing the full test suite (4,500+ tests as of v4.0.0) and Quality Gate review (Marta CQO + Eduardo Copy + Francisca Tech, all Opus).
497
509
 
498
510
  ## License
499
511
 
package/VERSION CHANGED
@@ -1 +1 @@
1
- 3.78.0
1
+ 4.0.1
package/arka/SKILL.md CHANGED
@@ -21,10 +21,10 @@ treat them as your default source. External research supplements, it
21
21
  does not replace the vault.
22
22
  <!-- arka:kb-first-prefix end -->
23
23
 
24
- # ArkaOS v2 — Main Orchestrator
24
+ # ArkaOS — Main Orchestrator
25
25
 
26
26
  > **The Operating System for AI Agent Teams**
27
- > 65 agents. 17 departments. 244+ skills. Multi-runtime. Dashboard. Knowledge RAG.
27
+ > 82 agents. 17 departments. 267 skills. Multi-runtime. Dashboard. Knowledge RAG.
28
28
 
29
29
  ## ⛔ Mandatory 13-phase flow (NON-NEGOTIABLE)
30
30
 
@@ -1,6 +1,7 @@
1
1
  stack: laravel
2
2
  baseline:
3
3
  - backend-dev
4
+ - laravel-eng
4
5
  - senior-dev
5
6
  - architect
6
7
  - qa
@@ -1,6 +1,7 @@
1
1
  stack: node
2
2
  baseline:
3
3
  - backend-dev
4
+ - node-ts-eng
4
5
  - senior-dev
5
6
  - architect
6
7
  - qa
@@ -2,6 +2,7 @@ stack: nuxt
2
2
  baseline:
3
3
  - frontend-dev
4
4
  - backend-dev
5
+ - node-ts-eng
5
6
  - architect
6
7
  - qa
7
8
  - devops
@@ -1,5 +1,6 @@
1
1
  stack: python
2
2
  baseline:
3
+ - python-eng
3
4
  - senior-dev
4
5
  - architect
5
6
  - qa
@@ -27,7 +27,9 @@ def generate_registry(departments_dir: str | Path, output_path: str | Path) -> d
27
27
  agents = []
28
28
  errors = []
29
29
 
30
- for yaml_file in sorted(departments_dir.glob("*/agents/*.yaml")):
30
+ # Recursive: also picks up sub-squad subdirectories
31
+ # (e.g. dev/agents/backend-core/*.yaml, brand/agents/design-ops/*.yaml).
32
+ for yaml_file in sorted(departments_dir.glob("*/agents/**/*.yaml")):
31
33
  try:
32
34
  agent = load_agent(yaml_file)
33
35
  entry = {
@@ -36,6 +38,8 @@ def generate_registry(departments_dir: str | Path, output_path: str | Path) -> d
36
38
  "role": agent.role,
37
39
  "department": agent.department,
38
40
  "tier": agent.tier,
41
+ "parent_squad": agent.parent_squad,
42
+ "sub_squad_role": agent.sub_squad_role,
39
43
  "disc": {
40
44
  "primary": agent.behavioral_dna.disc.primary.value,
41
45
  "secondary": agent.behavioral_dna.disc.secondary.value,
@@ -60,6 +64,7 @@ def generate_registry(departments_dir: str | Path, output_path: str | Path) -> d
60
64
  },
61
65
  "expertise_domains": agent.expertise.domains[:5],
62
66
  "frameworks": agent.expertise.frameworks[:5],
67
+ "knowledge_sources": agent.expertise.knowledge_sources,
63
68
  "file": str(yaml_file.relative_to(departments_dir.parent)),
64
69
  "memory_path": agent.memory_path,
65
70
  }
@@ -237,6 +237,10 @@ class Expertise(BaseModel):
237
237
  """What the agent knows."""
238
238
  domains: list[str] = Field(default_factory=list)
239
239
  frameworks: list[str] = Field(default_factory=list)
240
+ knowledge_sources: list[str] = Field(
241
+ default_factory=list,
242
+ description="Obsidian KB notes ([[wikilinks]]) that ground this agent's expertise.",
243
+ )
240
244
  depth: str = "expert" # novice, intermediate, expert, master
241
245
  years_equivalent: int = 10
242
246
 
@@ -291,26 +291,56 @@ def _safe_int(value: object) -> int:
291
291
  return 0
292
292
 
293
293
 
294
- def _redact(text: str) -> str:
294
+ def redact_clients(text: str) -> str:
295
+ """Redact configured client identifiers from arbitrary text.
296
+
297
+ Public, reusable wrapper around the module's compiled redaction regex
298
+ (loaded from ``~/.arkaos/redaction-clients.json``). Returns the text
299
+ unchanged when no patterns are configured. Used by any propose-only
300
+ producer — the reorganizer report and the agent-attribution proposal —
301
+ so client names never leak into a generated artifact.
302
+ """
295
303
  if _REDACT_RE is None:
296
304
  return text
297
305
  return _REDACT_RE.sub(_REDACT_TOKEN, text)
298
306
 
299
307
 
300
- def _md_escape(text: str) -> str:
301
- """Escape markdown control characters that would distort a table row.
308
+ def _redact(text: str) -> str:
309
+ return redact_clients(text)
310
+
311
+
312
+ def md_escape(text: str) -> str:
313
+ """Neutralise untrusted text for an HTML-rendering markdown viewer.
302
314
 
303
- Titles and excerpts come from frontmatter the operator controls, but
304
- a `|` in a title silently shifts table columns and corrupts the raw-
305
- artifact table. Escape pipes, newlines, and stray backticks.
315
+ Public, reusable wrapper. Titles and excerpts come from untrusted
316
+ sources (web page titles, YouTube/PDF metadata) and land in .md files
317
+ that the user opens in viewers such as Obsidian, which render raw HTML
318
+ inside markdown. To prevent stored HTML/JS injection (CWE-79) we
319
+ HTML-escape ``<``/``>`` to ``&lt;``/``&gt;`` so any tag renders as
320
+ literal text. We also escape pipes/backslashes (a ``|`` silently shifts
321
+ table columns), strip backticks, and flatten newlines so an untrusted
322
+ title can never distort or execute inside the rendered artifact. Used by
323
+ the reorganizer report and the agent-attribution proposal.
324
+
325
+ Callers redact client names *before* escaping, so the trusted
326
+ ``<redacted-client>`` marker may already be present; it is preserved
327
+ verbatim while every other angle bracket is neutralised.
306
328
  """
307
- return (
329
+ escaped = (
308
330
  text.replace("\\", "\\\\")
331
+ .replace("<", "&lt;")
332
+ .replace(">", "&gt;")
309
333
  .replace("|", "\\|")
310
334
  .replace("\n", " ")
311
335
  .replace("\r", " ")
312
336
  .replace("`", "")
313
337
  )
338
+ escaped_token = _REDACT_TOKEN.replace("<", "&lt;").replace(">", "&gt;")
339
+ return escaped.replace(escaped_token, _REDACT_TOKEN)
340
+
341
+
342
+ def _md_escape(text: str) -> str:
343
+ return md_escape(text)
314
344
 
315
345
 
316
346
  def _body_excerpt(body: str) -> str:
@@ -0,0 +1,114 @@
1
+ """Semantic agent attribution for a knowledge source (PR3).
2
+
3
+ Given the knowledge text of a source, suggest WHICH agents should learn
4
+ from it by comparing the source text against each agent's expertise
5
+ profile via local embeddings (``core.knowledge.embedder``).
6
+
7
+ Pure and propose-only: this module reads agent dicts passed in by the
8
+ caller and NEVER writes agent YAMLs. It degrades gracefully — when the
9
+ embedder is unavailable (fastembed missing) or the source text is empty
10
+ it returns an empty list, and the caller surfaces a reason.
11
+
12
+ The registry stores ``expertise_domains`` and ``frameworks`` as flat
13
+ list keys on each agent dict (not nested under an ``expertise`` object).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import math
19
+
20
+ from core.knowledge import embedder
21
+
22
+ _PROFILE_FIELDS = ("expertise_domains", "frameworks")
23
+ _MAX_MATCHED_TERMS = 5
24
+
25
+
26
def agent_profile_text(agent: dict) -> str:
    """Flatten an agent dict into a single embeddable profile string.

    Pieces are emitted in a fixed order: the role, then every value of each
    ``_PROFILE_FIELDS`` list, then the name. They are joined with single
    spaces. Each value is stringified and stripped; anything that ends up
    empty is left out, so a sparsely populated agent still yields a tidy
    string (possibly ``""``).
    """
    role = str(agent.get("role") or "").strip()
    name = str(agent.get("name") or "").strip()

    pieces: list[str] = [role] if role else []
    for key in _PROFILE_FIELDS:
        for raw in agent.get(key) or []:
            value = str(raw).strip()
            if value:
                pieces.append(value)
    if name:
        # The name goes last so role and expertise lead the profile text.
        pieces.append(name)
    return " ".join(pieces)
43
+
44
+
45
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Returns ``0.0`` when either vector is empty, when the lengths differ,
    or when either vector has zero norm.

    The norms are computed with ``math.hypot`` and each component is
    normalised *before* multiplying, so very large or very small magnitudes
    do not overflow to ``inf``/``nan`` or underflow to a false zero norm the
    way summing squared components can. ``math.fsum`` keeps the accumulated
    dot product accurate.
    """
    if not a or not b or len(a) != len(b):
        return 0.0
    norm_a = math.hypot(*a)
    norm_b = math.hypot(*b)
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return math.fsum((x / norm_a) * (y / norm_b) for x, y in zip(a, b))
55
+
56
+
57
def _profile_texts(agents: list[dict]) -> list[str]:
    """Map every agent to its profile text, keeping the input order."""
    return list(map(agent_profile_text, agents))
60
+
61
+
62
def _explain_match(source_text: str, agent: dict) -> list[str]:
    """Return up to ``_MAX_MATCHED_TERMS`` agent terms found in the source.

    A cheap "why" explanation that is independent of the embedding
    similarity: each value of the agent's ``_PROFILE_FIELDS`` lists is
    stripped and checked as a case-insensitive substring of
    ``source_text``. Terms are returned with their original casing, in
    field order.

    De-duplication is case-insensitive too, matching the comparison itself,
    so ``"Python"`` and ``"python"`` listed in different fields do not take
    two of the available slots.
    """
    haystack = source_text.lower()
    matched: list[str] = []
    seen: set[str] = set()
    for field in _PROFILE_FIELDS:
        for term in agent.get(field) or []:
            clean = str(term).strip()
            key = clean.lower()
            if not clean or key in seen or key not in haystack:
                continue
            seen.add(key)
            matched.append(clean)
            if len(matched) >= _MAX_MATCHED_TERMS:
                return matched
    return matched
78
+
79
+
80
def _build_result(source_text: str, agent: dict, score: float) -> dict:
    """Assemble the response entry for one scored agent.

    Copies ``id``, ``name``, ``department`` and ``role`` from the agent
    (defaulting to ``""`` when absent), then adds the score rounded to
    three decimals and the terms that textually overlap with the source.
    """
    entry = {key: agent.get(key, "") for key in ("id", "name", "department", "role")}
    entry["score"] = round(score, 3)
    entry["matched_terms"] = _explain_match(source_text, agent)
    return entry
90
+
91
+
92
def match_agents(source_text: str, agents: list[dict], top_n: int = 5) -> list[dict]:
    """Rank agents by semantic similarity of their expertise to the source.

    Returns ``[]`` when ``source_text`` is blank, when ``agents`` is empty,
    or when either embedder call returns ``None`` (the caller surfaces a
    reason). The source is embedded once and the agent profiles in a single
    batch. Agents are ordered by cosine similarity, highest first, and the
    first ``top_n`` are returned (a negative ``top_n`` yields ``[]``).

    Each result carries id/name/department/role, ``score`` (the cosine
    similarity rounded to 3 decimals) and ``matched_terms``. Nothing is
    written anywhere.
    """
    if not source_text.strip() or not agents:
        return []
    source_vec = embedder.embed(source_text)
    if source_vec is None:
        return []
    agent_vecs = embedder.embed_batch(_profile_texts(agents))
    if agent_vecs is None:
        return []
    # Rank on the raw similarity: sorting on the rounded score would make
    # near-identical agents tie and fall back to input order.
    ranked = sorted(
        ((cosine(source_vec, vec), agent) for agent, vec in zip(agents, agent_vecs)),
        key=lambda pair: pair[0],
        reverse=True,
    )
    # Shape only the entries that are returned, so the textual overlap
    # scan runs for at most ``top_n`` agents.
    return [
        _build_result(source_text, agent, score)
        for score, agent in ranked[: max(top_n, 0)]
    ]
@@ -119,3 +119,48 @@ def chunk_markdown(
119
119
  ))
120
120
 
121
121
  return chunks
122
+
123
+
124
# Minimum contiguous token run treated as a real overlap. The chunker seeds
# each chunk with the previous chunk's last ``overlap_tokens`` words (default
# 50), so genuine seams are tens of tokens long. Requiring >= 5 avoids
# stripping on a 1-2 word coincidence (e.g. both chunks ending/starting "the").
_MIN_OVERLAP_TOKENS = 5


def _overlap_token_count(prev: list[str], cur: list[str], window: int) -> int:
    """Return length of the longest suffix of ``prev`` that prefixes ``cur``.

    Works on whole tokens, never mid-word. Candidate lengths are tried from
    the largest possible (bounded by both token lists and ``window``) down
    to ``_MIN_OVERLAP_TOKENS``, and the first match wins, so a long genuine
    seam is preferred over a short coincidence. Returns 0 when no run of at
    least ``_MIN_OVERLAP_TOKENS`` tokens matches.
    """
    max_len = min(len(prev), len(cur), window)
    for length in range(max_len, _MIN_OVERLAP_TOKENS - 1, -1):
        if prev[-length:] == cur[:length]:
            return length
    return 0


def stitch_chunks(texts: list[str], max_overlap_tokens: int = 200) -> str:
    """Re-join overlapping chunks into a clean transcript, deduping seams.

    ``chunk_markdown`` prepends each chunk with a token-overlap window copied
    from the previous chunk, so a naive join repeats that window at every
    boundary. For each adjacent pair this finds the longest suffix of the
    previous chunk that equals a prefix of the current one (on whitespace
    token boundaries, capped at ``max_overlap_tokens``) and drops it from the
    current chunk before joining with blank lines.

    The comparison uses the *original* previous chunk, not its already
    stripped form. A short chunk may shrink below ``_MIN_OVERLAP_TOKENS``
    once its own overlap is removed; comparing against that remainder would
    miss the next seam and leave the window duplicated.

    Empty strings are skipped. When no overlap is detected the chunk is
    appended untouched, so content is never lost. A single chunk is returned
    as-is and an empty list returns ``""``.

    NOTE: when an overlap is stripped, the remainder of that chunk is
    re-joined with single spaces, so its original line breaks are not kept.
    """
    parts = [t for t in texts if t]
    if not parts:
        return ""
    stitched = [parts[0]]
    prev_tokens = parts[0].split()
    for cur in parts[1:]:
        cur_tokens = cur.split()
        overlap = _overlap_token_count(prev_tokens, cur_tokens, max_overlap_tokens)
        stitched.append(" ".join(cur_tokens[overlap:]) if overlap else cur)
        # Keep the untouched tokens of this chunk for the next seam check.
        prev_tokens = cur_tokens
    return "\n\n".join(p for p in stitched if p)