arkaos 4.0.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -30
- package/VERSION +1 -1
- package/arka/SKILL.md +2 -2
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/scripts/bench/__init__.py +5 -0
- package/scripts/bench/__pycache__/__init__.cpython-313.pyc +0 -0
- package/scripts/bench/__pycache__/harness.cpython-313.pyc +0 -0
- package/scripts/bench/__pycache__/run.cpython-313.pyc +0 -0
- package/scripts/bench/harness.py +138 -0
- package/scripts/bench/run.py +136 -0
- package/scripts/tools/__pycache__/docs_stats.cpython-313.pyc +0 -0
- package/scripts/tools/docs_stats.py +154 -0
package/README.md
CHANGED
|
@@ -2,13 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
**The Operating System for AI Agent Teams.**
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
82 agents. 17 departments. 267 skills. Enterprise frameworks. Multi-runtime. One install.
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
8
|
npx arkaos install
|
|
9
9
|
```
|
|
10
10
|
|
|
11
|
-
[](https://www.npmjs.com/package/arkaos) [](LICENSE) [](https://www.npmjs.com/package/arkaos) [](LICENSE) []()
|
|
12
|
+
|
|
13
|
+
> All counts in this document are generated by `python scripts/tools/docs_stats.py`
|
|
14
|
+
> and locked by a test — they cannot drift from the repository.
|
|
12
15
|
|
|
13
16
|
---
|
|
14
17
|
|
|
@@ -99,7 +102,7 @@ In plain language. No special syntax required.
|
|
|
99
102
|
|
|
100
103
|
### 2. ArkaOS routes to the right squad
|
|
101
104
|
|
|
102
|
-
The Synapse engine (
|
|
105
|
+
The Synapse engine (12-layer context injection, ~87ms cold / ~83ms warm — see [Benchmarks](wiki/11-Benchmarks.md)) analyzes your request and routes it to the correct department. Each department has a lead agent who orchestrates specialists.
|
|
103
106
|
|
|
104
107
|
### 3. Agents execute with enterprise frameworks
|
|
105
108
|
|
|
@@ -132,24 +135,27 @@ Every decision, solution, and pattern is captured. The Cognitive Layer curates i
|
|
|
132
135
|
|
|
133
136
|
| Department | Prefix | Agents | What It Does |
|
|
134
137
|
|-----------|--------|--------|-------------|
|
|
135
|
-
| **Development** | `/dev` |
|
|
138
|
+
| **Development** | `/dev` | 15 | Full-stack features, APIs, architecture, security, CI/CD |
|
|
139
|
+
| **Brand & Design** | `/brand` | 10 | Brand identity, UX/UI, design systems, naming |
|
|
136
140
|
| **Marketing** | `/mkt` | 4 | SEO, paid ads, email campaigns, growth loops |
|
|
137
|
-
| **
|
|
138
|
-
| **Finance** | `/fin` | 3 | DCF valuation, unit economics, budgets, investor prep |
|
|
139
|
-
| **Strategy** | `/strat` | 3 | Market analysis, competitive intelligence, business models |
|
|
141
|
+
| **Strategy** | `/strat` | 4 | Market analysis, competitive intelligence, business models |
|
|
140
142
|
| **E-Commerce** | `/ecom` | 4 | Store optimization, CRO, pricing, RFM segmentation |
|
|
141
|
-
| **Knowledge** | `/kb` |
|
|
142
|
-
| **
|
|
143
|
-
| **Project Mgmt** | `/pm` | 3 | Scrum, Shape Up, discovery, roadmaps |
|
|
144
|
-
| **SaaS** | `/saas` | 4 | Idea validation, metrics, PLG strategy, scaffolding |
|
|
145
|
-
| **Landing Pages** | `/landing` | 4 | Sales copy, funnels, offers, page generation |
|
|
143
|
+
| **Knowledge** | `/kb` | 4 | Research, Zettelkasten, persona building, ingestion |
|
|
144
|
+
| **Project Mgmt** | `/pm` | 4 | Scrum, Shape Up, discovery, roadmaps |
|
|
146
145
|
| **Content** | `/content` | 4 | Viral hooks, scripts, repurposing, content calendars |
|
|
147
|
-
| **
|
|
148
|
-
| **
|
|
149
|
-
| **
|
|
150
|
-
| **
|
|
146
|
+
| **Sales** | `/sales` | 4 | Pipeline management, SPIN selling, negotiation |
|
|
147
|
+
| **SaaS** | `/saas` | 5 | Idea validation, metrics, PLG strategy, scaffolding |
|
|
148
|
+
| **Organization** | `/org` | 5 | Org design, team topologies, matrix structure |
|
|
149
|
+
| **Landing Pages** | `/landing` | 4 | Sales copy, funnels, offers, page generation |
|
|
150
|
+
| **Finance** | `/fin` | 3 | DCF valuation, unit economics, budgets, investor prep |
|
|
151
|
+
| **Operations** | `/ops` | 3 | Automation, SOPs, compliance (GDPR, ISO, SOC 2) |
|
|
152
|
+
| **Communities** | `/community` | 3 | Groups, membership, gamification, engagement |
|
|
153
|
+
| **Leadership** | `/lead` | 3 | Team health, OKRs, culture, hiring frameworks |
|
|
151
154
|
| **Quality Gate** | (auto) | 3 | Mandatory review on every workflow. Veto power. |
|
|
152
155
|
|
|
156
|
+
> 82 agents across 17 departments (81 unique; `cro-specialist` is shared by
|
|
157
|
+
> E-Commerce and Landing in the matrix structure).
|
|
158
|
+
|
|
153
159
|
---
|
|
154
160
|
|
|
155
161
|
## Cognitive Layer (v2.10)
|
|
@@ -386,7 +392,7 @@ python scripts/tools/okr_cascade.py growth --json
|
|
|
386
392
|
User Input
|
|
387
393
|
│
|
|
388
394
|
▼
|
|
389
|
-
Synapse v2 (
|
|
395
|
+
Synapse v2 (12-layer context injection, ~87ms cold / ~83ms warm)
|
|
390
396
|
│
|
|
391
397
|
▼
|
|
392
398
|
Orchestrator (/do → department routing)
|
|
@@ -408,13 +414,13 @@ Output (Obsidian vault + structured deliverables)
|
|
|
408
414
|
|
|
409
415
|
| System | Purpose |
|
|
410
416
|
|--------|---------|
|
|
411
|
-
| **Synapse v2** |
|
|
417
|
+
| **Synapse v2** | 12-layer context injection (~87ms cold, ~83ms warm; cacheable layers are sub-millisecond) |
|
|
412
418
|
| **Workflow Engine** | YAML workflows with phases, gates, parallelization |
|
|
413
419
|
| **Agent Schema** | 4-framework behavioral DNA with consistency validation |
|
|
414
420
|
| **Squad Framework** | Department squads + ad-hoc project squads (matrix) |
|
|
415
421
|
| **Cognitive Layer** | Memory, Dreaming, Research, Scheduler |
|
|
416
422
|
| **Living Specs** | Bidirectional spec/code sync |
|
|
417
|
-
| **Governance** | Constitution with
|
|
423
|
+
| **Governance** | Constitution with 25 non-negotiable rules (+ 11 must, 8 should) |
|
|
418
424
|
| **Multi-Runtime** | Claude Code, Codex, Gemini, Cursor adapters |
|
|
419
425
|
|
|
420
426
|
### Tech Stack
|
|
@@ -427,22 +433,28 @@ Output (Obsidian vault + structured deliverables)
|
|
|
427
433
|
| Workflows | YAML |
|
|
428
434
|
| Agent Definitions | YAML |
|
|
429
435
|
| Knowledge | Obsidian + SQLite-VSS |
|
|
430
|
-
| Tests | pytest (
|
|
436
|
+
| Tests | pytest (4,500+ tests) |
|
|
431
437
|
|
|
432
438
|
---
|
|
433
439
|
|
|
434
440
|
## Documentation
|
|
435
441
|
|
|
436
|
-
Full documentation
|
|
442
|
+
Full documentation lives in two places in this repository:
|
|
443
|
+
|
|
444
|
+
**[`wiki/`](wiki/Home.md)** — the user-facing guide (step-by-step, features, benchmarks):
|
|
445
|
+
|
|
446
|
+
- [Home](wiki/Home.md) — the index of everything
|
|
447
|
+
- [Getting Started](wiki/01-Getting-Started.md) — install and run your first command
|
|
448
|
+
- [Core Concepts](wiki/02-Core-Concepts.md) — squads, agents, tiers, behavioral DNA
|
|
449
|
+
- [The 13-Phase Flow](wiki/03-The-13-Phase-Flow.md) — how every request is handled
|
|
450
|
+
- [Departments](wiki/04-Departments/) — one page per department
|
|
451
|
+
- [Commands Reference](wiki/05-Commands-Reference.md)
|
|
452
|
+
- [Cognitive Layer](wiki/06-Cognitive-Layer.md) — memory, dreaming, research
|
|
453
|
+
- [Benchmarks](wiki/11-Benchmarks.md) — measured, reproducible numbers
|
|
454
|
+
- [Competitive Analysis](wiki/12-Competitive-Analysis.md) and [Benefits & ROI](wiki/13-Benefits-ROI.md)
|
|
437
455
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
- [Departments & Agents](https://github.com/andreagroferreira/arka-os/wiki/Departments)
|
|
441
|
-
- [Cognitive Layer](https://github.com/andreagroferreira/arka-os/wiki/Cognitive-Layer)
|
|
442
|
-
- [Ecosystem Management](https://github.com/andreagroferreira/arka-os/wiki/Ecosystems)
|
|
443
|
-
- [Configuration](https://github.com/andreagroferreira/arka-os/wiki/Configuration)
|
|
444
|
-
- [Creating Projects](https://github.com/andreagroferreira/arka-os/wiki/Creating-Projects)
|
|
445
|
-
- [Update & Sync](https://github.com/andreagroferreira/arka-os/wiki/Update-and-Sync)
|
|
456
|
+
**[`docs/`](docs/)** — the technical/contributor reference (architecture, API,
|
|
457
|
+
agent schema, core engine, ADRs).
|
|
446
458
|
|
|
447
459
|
---
|
|
448
460
|
|
|
@@ -493,7 +505,7 @@ Department commands: `/dev`, `/mkt`, `/brand`, `/fin`, `/strat`, `/ecom`, `/kb`,
|
|
|
493
505
|
|
|
494
506
|
## Contributing
|
|
495
507
|
|
|
496
|
-
See [CONTRIBUTING.md](.github/CONTRIBUTING.md). PRs welcome — all changes require passing the full test suite (
|
|
508
|
+
See [CONTRIBUTING.md](.github/CONTRIBUTING.md). PRs welcome — all changes require passing the full test suite (4,500+ tests as of v4.0.0) and Quality Gate review (Marta CQO + Eduardo Copy + Francisca Tech, all Opus).
|
|
497
509
|
|
|
498
510
|
## License
|
|
499
511
|
|
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
4.0.
|
|
1
|
+
4.0.1
|
package/arka/SKILL.md
CHANGED
|
@@ -21,10 +21,10 @@ treat them as your default source. External research supplements, it
|
|
|
21
21
|
does not replace the vault.
|
|
22
22
|
<!-- arka:kb-first-prefix end -->
|
|
23
23
|
|
|
24
|
-
# ArkaOS
|
|
24
|
+
# ArkaOS — Main Orchestrator
|
|
25
25
|
|
|
26
26
|
> **The Operating System for AI Agent Teams**
|
|
27
|
-
>
|
|
27
|
+
> 82 agents. 17 departments. 267 skills. Multi-runtime. Dashboard. Knowledge RAG.
|
|
28
28
|
|
|
29
29
|
## ⛔ Mandatory 13-phase flow (NON-NEGOTIABLE)
|
|
30
30
|
|
package/package.json
CHANGED
package/pyproject.toml
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""ArkaOS benchmark harness -- core engine measurements.
|
|
2
|
+
|
|
3
|
+
Three honest measurements:
|
|
4
|
+
|
|
5
|
+
1. Synapse injection latency (engine-only, no vector store) -- cold vs warm,
   plus the layers-computed / layers-skipped / tokens-injected profile of the
   last injection. Per-layer compute time is NOT measured here, so the
   "cached layers are sub-millisecond" claim is not verified by this harness;
   only the "full engine costs N ms" figure is.
|
|
8
|
+
2. Subagent handoff artifact size -- measured token estimate vs the documented
|
|
9
|
+
~379-token claim.
|
|
10
|
+
3. Routing accuracy -- DepartmentLayer keyword detection over a fixed labelled
|
|
11
|
+
prompt set.
|
|
12
|
+
|
|
13
|
+
All numbers are reproducible. Timings vary by machine; routing accuracy and
|
|
14
|
+
handoff sizes are deterministic.
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import statistics
|
|
19
|
+
import sys
|
|
20
|
+
import time
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Callable
|
|
23
|
+
|
|
24
|
+
# parents[2] of the resolved file path: for <root>/scripts/bench/harness.py
# this is <root>.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Prepend the repo root to sys.path (only once) -- presumably so the
# function-level `core.*` imports below resolve when the package is not
# installed; confirm against how the project is normally run.
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _percentiles(samples_ms: list[float]) -> dict:
    """Summarise a list of millisecond samples.

    Args:
        samples_ms: Timing samples in milliseconds. May be empty.

    Returns:
        A dict with ``runs`` (the sample count) and ``min``, ``p50`` (median),
        ``mean`` and ``max``, each rounded to three decimals. For an empty
        input ``runs`` is 0 and every statistic is ``None``.
    """
    if not samples_ms:
        # A benchmark invoked with zero runs produces no samples; report that
        # explicitly instead of failing on min/median of an empty sequence.
        return {"runs": 0, "min": None, "p50": None, "mean": None, "max": None}
    return {
        "runs": len(samples_ms),
        "min": round(min(samples_ms), 3),
        "p50": round(statistics.median(samples_ms), 3),
        "mean": round(statistics.mean(samples_ms), 3),
        "max": round(max(samples_ms), 3),
    }
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _time_call(fn: Callable[[], object]) -> float:
    """Invoke ``fn`` once and return how long it took, in milliseconds.

    The return value of ``fn`` is discarded; any exception it raises
    propagates to the caller.
    """
    began = time.perf_counter()
    fn()
    elapsed_seconds = time.perf_counter() - began
    return elapsed_seconds * 1000.0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def bench_synapse_latency(runs: int = 50) -> dict:
    """Measure Synapse engine injection latency, cold vs warm.

    "Cold" samples clear the engine cache before every timed injection;
    "warm" samples run against a cache primed by one untimed injection.
    Only ``engine.inject`` is inside the timed region -- the cache clear is
    done before the timer starts so its cost does not inflate cold latency.

    Args:
        runs: Number of samples to take for each of the cold and warm series.

    Returns:
        A dict with ``layer_count`` (from the engine), ``cold_ms`` and
        ``warm_ms`` (sample summaries), and ``injection_profile`` holding
        ``layers_computed`` / ``layers_skipped`` / ``tokens_injected`` taken
        from the engine's last metrics entry (``None`` values when the engine
        recorded no metrics). Per-layer timings are not measured.
    """
    from core.synapse.engine import create_default_engine
    from core.synapse.layers import PromptContext

    engine = create_default_engine()
    ctx = PromptContext(
        user_input="fix the authentication bug in the login controller",
        cwd="/tmp/project", git_branch="feat/auth", project_name="demo",
        project_stack="laravel 11", active_agent="backend-dev",
    )

    def _inject() -> None:
        engine.inject(ctx)

    cold = []
    for _ in range(runs):
        engine.clear_cache()  # reset outside the timed region
        cold.append(_time_call(_inject))

    engine.inject(ctx)  # warm the cache
    warm = [_time_call(_inject) for _ in range(runs)]

    # Assumes metrics entries are dict-like (they are read with .get()).
    last = engine.metrics[-1] if engine.metrics else {}
    profile = {
        "layers_computed": last.get("layers_computed"),
        "layers_skipped": last.get("layers_skipped"),
        "tokens_injected": last.get("tokens_injected"),
    }
    return {"layer_count": engine.layer_count,
            "cold_ms": _percentiles(cold), "warm_ms": _percentiles(warm),
            "injection_profile": profile}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def bench_subagent_handoff() -> dict:
    """Build one fixed, representative handoff artifact and report its size.

    Returns:
        A dict with ``documented_claim`` (the constant 379),
        ``measured_tokens`` (the artifact's ``estimated_tokens``) and
        ``prompt_chars`` (length of the artifact's ``to_prompt()`` text).
    """
    from core.runtime.subagent import HandoffArtifact

    files = [
        "app/Services/BillingService.php",
        "app/Http/Controllers/WebhookController.php",
        "tests/Feature/BillingTest.php",
    ]
    summary = (
        "Laravel 11 app, Cashier installed. Customer model has "
        "stripe_id. Need tiered pricing with volume discounts."
    )
    rules = [
        "SOLID + Services/Repositories",
        "Feature tests required",
        "Idempotent webhook handling",
    ]
    criteria = ["80%+ coverage", "OWASP reviewed", "Conventional commits"]

    sample = HandoffArtifact(
        task_id="task-0042",
        task_description="Implement Stripe subscription billing with idempotent webhooks",
        agent_id="backend-dev",
        agent_role="Senior Backend Developer",
        agent_disc="D:80 I:50 S:45 C:78",
        department="dev",
        relevant_files=files,
        context_summary=summary,
        constraints=rules,
        expected_output="Tested, secure billing implementation with passing suite",
        quality_criteria=criteria,
    )

    measured = sample.estimated_tokens
    rendered_length = len(sample.to_prompt())
    return {
        "documented_claim": 379,
        "measured_tokens": measured,
        "prompt_chars": rendered_length,
    }
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Fixed labelled prompt set for routing accuracy. (prompt, expected_department)
# 12 prompts covering 11 distinct department labels ("dev" appears twice).
# bench_routing_accuracy() compares each label to the layer's output with
# strict string equality, so labels must match exactly what DepartmentLayer
# emits -- NOTE(review): confirm "finance" / "strategy" / "marketing" are the
# emitted identifiers rather than the short command prefixes.
_ROUTING_SET: list[tuple[str, str]] = [
    ("fix the authentication bug in the login controller", "dev"),
    ("refactor the payment service and add unit tests", "dev"),
    ("create a go-to-market plan for our new SaaS", "saas"),
    ("design a brand identity with logo and color palette", "brand"),
    ("write viral content hooks for our TikTok channel", "content"),
    ("build a high-converting landing page funnel", "landing"),
    ("audit our online store conversion rate", "ecom"),
    ("model our Q3 budget and cash flow forecast", "finance"),
    ("run a competitive analysis with Porter's Five Forces", "strategy"),
    ("plan the next sprint and groom the backlog", "pm"),
    ("set up an SEO and paid ads growth campaign", "marketing"),
    ("automate our client onboarding with an SOP", "ops"),
]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def bench_routing_accuracy() -> dict:
    """Score DepartmentLayer against every entry of the labelled prompt set.

    A prompt counts as correct when the layer's stripped ``content`` equals
    the expected label exactly; empty or missing content is reported as
    ``"(none)"`` in the details.

    Returns:
        A dict with ``total``, ``correct``, ``accuracy_pct`` (one decimal)
        and ``details`` (one dict per prompt: prompt, expected, detected, ok).
    """
    from core.synapse.layers import DepartmentLayer, PromptContext

    router = DepartmentLayer()
    rows = []
    for text, wanted in _ROUTING_SET:
        outcome = router.compute(PromptContext(user_input=text))
        got = (outcome.content or "").strip()
        rows.append({
            "prompt": text,
            "expected": wanted,
            "detected": got or "(none)",
            "ok": got == wanted,
        })

    n_correct = sum(1 for row in rows if row["ok"])
    n_total = len(_ROUTING_SET)
    return {
        "total": n_total,
        "correct": n_correct,
        "accuracy_pct": round(100.0 * n_correct / n_total, 1),
        "details": rows,
    }
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def run_all(runs: int = 50) -> dict:
    """Execute the three benchmarks in order and bundle their results.

    Args:
        runs: Sample count forwarded to the Synapse latency benchmark.

    Returns:
        A dict keyed ``synapse_latency``, ``subagent_handoff`` and
        ``routing_accuracy``.
    """
    combined = {}
    combined["synapse_latency"] = bench_synapse_latency(runs=runs)
    combined["subagent_handoff"] = bench_subagent_handoff()
    combined["routing_accuracy"] = bench_routing_accuracy()
    return combined
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Run the ArkaOS benchmark harness and persist results.
|
|
3
|
+
|
|
4
|
+
Writes:
|
|
5
|
+
- benchmarks/results.json -- machine-readable, consumed by the wiki
|
|
6
|
+
- benchmarks/results.md -- human-readable summary table
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/bench/run.py # default 50 runs
|
|
10
|
+
python scripts/bench/run.py --runs 100
|
|
11
|
+
python scripts/bench/run.py --runs 30 --no-write # print only
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import platform
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
# parents[2] of the resolved file path: for <root>/scripts/bench/run.py this
# is <root>.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# The repo root must be importable before `scripts.bench` can be imported
# below, hence the path tweak ahead of the import (and the E402 suppression).
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))

from scripts.bench import harness  # noqa: E402

# Directory that main() writes results.json and results.md into.
_OUT_DIR = _REPO_ROOT / "benchmarks"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _environment() -> dict:
    """Describe the host the benchmarks ran on (timings are machine-relative).

    Returns:
        A dict with the Python version, the platform string and the machine
        type, as reported by the ``platform`` module.
    """
    probes = (
        ("python", platform.python_version),
        ("platform", platform.platform),
        ("machine", platform.machine),
    )
    return {label: probe() for label, probe in probes}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _synapse_section(sl: dict) -> list[str]:
    """Build the Markdown lines for the Synapse latency section.

    Args:
        sl: The ``synapse_latency`` result dict (``layer_count``, ``cold_ms``,
            ``warm_ms``, ``injection_profile``).

    Returns:
        The section as a list of lines, ending with an empty line.
    """
    profile = sl["injection_profile"]
    lines = ["## Synapse context injection (engine-only, no vector store)", ""]
    lines.append(f"- Registered layers: **{sl['layer_count']}**")

    cold = sl["cold_ms"]
    lines.append(
        "- Cold injection (cache cleared each run): "
        f"p50 **{cold['p50']} ms**, mean {cold['mean']} ms, "
        f"min {cold['min']} ms, max {cold['max']} ms "
        f"({cold['runs']} runs)"
    )

    warm = sl["warm_ms"]
    lines.append(
        "- Warm injection (cached): "
        f"p50 **{warm['p50']} ms**, mean {warm['mean']} ms "
        f"({warm['runs']} runs)"
    )

    lines.append(
        "- The small cold/warm delta is expected: cacheable layers are a "
        "minority of total compute, so warming the cache saves only a few ms."
    )
    lines.append(
        f"- Representative injection: {profile['layers_computed']} layers computed, "
        f"{profile['layers_skipped']} skipped, {profile['tokens_injected']} tokens injected"
    )
    lines.append("")
    return lines
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _handoff_section(ho: dict) -> list[str]:
    """Build the Markdown lines for the subagent handoff section.

    Args:
        ho: The ``subagent_handoff`` result dict (``measured_tokens``,
            ``prompt_chars``, ``documented_claim``).

    Returns:
        The section as a list of lines, ending with an empty line.
    """
    measured_line = (
        f"- Measured (representative artifact): **{ho['measured_tokens']} word-tokens** "
        f"({ho['prompt_chars']} chars). 'word-tokens' is a whitespace-split estimate, "
        "not a BPE tokenizer count."
    )
    claim_line = f"- Previously documented claim: {ho['documented_claim']}"
    return ["## Subagent handoff artifact", "", measured_line, claim_line, ""]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _routing_section(ra: dict) -> list[str]:
    """Build the Markdown lines for the routing accuracy summary and table.

    Prompts longer than 48 characters are shortened to their first 45
    characters followed by ``...`` so table rows stay compact.

    Args:
        ra: The ``routing_accuracy`` result dict (``correct``, ``total``,
            ``accuracy_pct``, ``details``).

    Returns:
        The section as a list of lines, ending with an empty line.
    """
    def _row(entry: dict) -> str:
        verdict = "yes" if entry["ok"] else "no"
        text = entry["prompt"]
        if len(text) > 48:
            text = text[:45] + "..."
        return f"| {text} | {entry['expected']} | {entry['detected']} | {verdict} |"

    summary = (
        f"- **{ra['correct']}/{ra['total']} = {ra['accuracy_pct']}%** on a fixed "
        "labelled prompt set"
    )
    lines = [
        "## Routing accuracy (DepartmentLayer keyword detection)",
        "",
        summary,
        "",
        "| Prompt | Expected | Detected | OK |",
        "|---|---|---|:--:|",
    ]
    lines.extend(_row(entry) for entry in ra["details"])
    lines.append("")
    return lines
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def render_markdown(results: dict, env: dict) -> str:
    """Assemble the full human-readable benchmark report.

    Args:
        results: Combined benchmark results keyed ``synapse_latency``,
            ``subagent_handoff`` and ``routing_accuracy``.
        env: Environment description; ``python`` and ``platform`` are shown.

    Returns:
        The report as one newline-joined Markdown string.
    """
    lines = [
        "# ArkaOS Benchmarks",
        "",
        "> Generated by `python scripts/bench/run.py`. Timings are "
        "machine-relative; routing accuracy and handoff size are deterministic.",
        "",
        f"**Environment:** Python {env['python']} - {env['platform']}",
        "",
    ]
    lines.extend(_synapse_section(results["synapse_latency"]))
    lines.extend(_handoff_section(results["subagent_handoff"]))
    lines.extend(_routing_section(results["routing_accuracy"]))
    return "\n".join(lines)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def main() -> int:
    """Run the benchmarks, print the report, and optionally persist it.

    Command-line options:
        --runs N      Latency samples per series (default 50, must be >= 1).
        --no-write    Print the JSON payload and Markdown only; write nothing.

    Without ``--no-write`` the results are written to ``results.json`` and
    ``results.md`` inside the output directory, which is created if missing.

    Returns:
        0 on success. An invalid ``--runs`` value ends the process through
        ``argparse`` (usage message on stderr, exit status 2).
    """
    parser = argparse.ArgumentParser(description="Run ArkaOS benchmarks")
    parser.add_argument("--runs", type=int, default=50, help="Latency samples (default 50)")
    parser.add_argument("--no-write", action="store_true", help="Print only, do not write files")
    args = parser.parse_args()
    if args.runs < 1:
        # Zero or negative runs produce empty sample lists, which the latency
        # summary cannot reduce; reject the value up front with a clear error
        # instead of failing later with a traceback.
        parser.error("--runs must be a positive integer")

    env = _environment()
    results = harness.run_all(runs=args.runs)
    payload = {"environment": env, "results": results}
    md = render_markdown(results, env)

    if args.no_write:
        print(json.dumps(payload, indent=2))
        print("\n" + md)
        return 0

    _OUT_DIR.mkdir(exist_ok=True)
    (_OUT_DIR / "results.json").write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    (_OUT_DIR / "results.md").write_text(md + "\n", encoding="utf-8")
    print(f"Wrote {_OUT_DIR / 'results.json'} and {_OUT_DIR / 'results.md'}")
    print("\n" + md)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
Binary file
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""ArkaOS Docs Stats -- canonical source of truth for documentation numbers.
|
|
3
|
+
|
|
4
|
+
Counts agents, departments, skills, ADRs, and tests directly from the
|
|
5
|
+
repository so that every document (README, wiki, CLAUDE.md) consumes generated
|
|
6
|
+
numbers instead of hand-typed ones. This is the antidote to documentation
|
|
7
|
+
drift: no number is ever written by hand.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python docs_stats.py # human-readable (repo root auto-detected)
|
|
11
|
+
python docs_stats.py --json
|
|
12
|
+
python docs_stats.py --root /path/to/arka-os --json
|
|
13
|
+
python docs_stats.py --with-pytest # also collect authoritative pytest case count
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
# Matches a test function definition at the start of a line: optional leading
# whitespace, optional `async`, then `def test_<identifier chars>`.
_TEST_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+test_\w+", re.MULTILINE)
# Matches "<N> test collected" / "<N> tests collected" and captures N; used on
# the output of `pytest --collect-only -q`.
_COLLECTED_RE = re.compile(r"(\d+)\s+tests?\s+collected")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def repo_root(start: Optional[Path] = None) -> Path:
    """Locate the repository root by searching upwards from ``start``.

    The root is the nearest directory (``start`` itself if it is a directory,
    otherwise its parent, then each ancestor) that contains both a ``VERSION``
    file and a ``departments`` directory.

    Args:
        start: File or directory to search from; defaults to this file.

    Returns:
        The matching directory, or the current working directory when no
        ancestor qualifies.
    """
    origin = (start if start else Path(__file__).resolve()).resolve()
    base = origin if origin.is_dir() else origin.parent
    for candidate in (base, *base.parents):
        has_version = (candidate / "VERSION").is_file()
        if has_version and (candidate / "departments").is_dir():
            return candidate
    return Path.cwd()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def read_version(root: Path) -> str:
    """Return the stripped contents of ``root/VERSION``.

    Returns an empty string when that file does not exist.
    """
    version_file = root / "VERSION"
    if not version_file.is_file():
        return ""
    raw = version_file.read_text(encoding="utf-8")
    return raw.strip()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def count_agents(root: Path) -> dict:
    """Count agent definition files under ``departments/*/agents/``.

    Each ``agents`` directory is searched recursively for ``*.yaml`` files,
    so nested sub-squad folders are included.

    Returns:
        A dict with ``files`` (total matches) and ``unique_slugs`` (number of
        distinct file names among them). Both are 0 when ``departments/`` is
        missing.
    """
    departments = root / "departments"
    found = []
    if departments.is_dir():
        for agents_dir in departments.glob("*/agents"):
            if agents_dir.is_dir():
                found.extend(agents_dir.rglob("*.yaml"))
    distinct_names = {entry.name for entry in found}
    return {"files": len(found), "unique_slugs": len(distinct_names)}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def count_departments(root: Path) -> int:
    """Return how many immediate sub-directories ``departments/`` contains.

    Plain files are ignored. Returns 0 when ``departments/`` is missing.
    """
    departments = root / "departments"
    if not departments.is_dir():
        return 0
    return len([child for child in departments.iterdir() if child.is_dir()])
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def count_skills(root: Path) -> dict:
    """Count ``SKILL.md`` files per top-level area.

    The ``departments``, ``arka`` and ``marketplace`` trees are each searched
    recursively; a missing tree counts as 0.

    Returns:
        A dict with one count per area plus ``core``, which is the sum of the
        ``departments`` and ``arka`` counts.
    """
    totals = {}
    for area in ("departments", "arka", "marketplace"):
        base = root / area
        totals[area] = sum(1 for _ in base.rglob("SKILL.md")) if base.is_dir() else 0
    totals["core"] = totals["departments"] + totals["arka"]
    return totals
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def count_adrs(root: Path) -> int:
    """Return the number of ``*.md`` files directly inside ``docs/adr/``.

    Sub-directories are not searched. Returns 0 when the directory is missing.
    """
    adr_dir = root / "docs" / "adr"
    if not adr_dir.is_dir():
        return 0
    return sum(1 for _ in adr_dir.glob("*.md"))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def count_test_functions(root: Path) -> int:
    """Statically count test function definitions under ``tests/``.

    Every ``test_*.py`` file (searched recursively) is read as UTF-8 with
    undecodable bytes replaced, and each ``def test_...`` or
    ``async def test_...`` definition is counted. This is a text match, not
    a pytest collection. Returns 0 when ``tests/`` is missing.
    """
    tests_dir = root / "tests"
    if not tests_dir.is_dir():
        return 0
    total = 0
    for module in tests_dir.rglob("test_*.py"):
        source = module.read_text(encoding="utf-8", errors="replace")
        total += len(_TEST_DEF_RE.findall(source))
    return total
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def collect_pytest_cases(root: Path) -> Optional[int]:
    """Ask pytest how many test cases it collects under ``root``.

    Runs ``pytest --collect-only -q`` with the current interpreter (300 s
    timeout, exit status ignored) and scans its stdout from the last line
    upwards for an "N tests collected" summary.

    Returns:
        The collected count, or ``None`` when the subprocess could not be run
        (including a timeout) or no summary line was found.
    """
    command = [sys.executable, "-m", "pytest", "--collect-only", "-q"]
    try:
        proc = subprocess.run(
            command, cwd=root, capture_output=True, text=True,
            timeout=300, check=False)
    except (OSError, subprocess.SubprocessError):
        return None
    searches = (_COLLECTED_RE.search(row) for row in reversed(proc.stdout.splitlines()))
    hit = next((found for found in searches if found), None)
    if hit is None:
        return None
    return int(hit.group(1))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def gather(root: Path, with_pytest: bool = False) -> dict:
    """Collect every documentation statistic for the repository at ``root``.

    Args:
        root: Repository root to inspect.
        with_pytest: When true, also run a pytest collection and store its
            result under ``tests["collected"]`` (``None`` if it failed).

    Returns:
        A JSON-serialisable dict with ``version``, ``agents``,
        ``departments``, ``skills``, ``adrs``, ``tests`` and ``root``.
    """
    test_stats = {"functions": count_test_functions(root)}
    if with_pytest:
        test_stats["collected"] = collect_pytest_cases(root)

    stats = {}
    stats["version"] = read_version(root)
    stats["agents"] = count_agents(root)
    stats["departments"] = count_departments(root)
    stats["skills"] = count_skills(root)
    stats["adrs"] = count_adrs(root)
    stats["tests"] = test_stats
    stats["root"] = str(root)
    return stats
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def format_text(stats: dict) -> str:
    """Format gathered statistics as a plain-text report.

    Args:
        stats: A dict shaped like the output of ``gather``. The "Test cases"
            line is included only when ``stats["tests"]`` has a
            ``collected`` key.

    Returns:
        The report as one newline-joined string framed by rule lines.
    """
    agents = stats["agents"]
    skills = stats["skills"]
    tests = stats["tests"]
    rule = "=" * 52

    report = [rule, "ARKAOS DOCS STATS (canonical)", rule]
    report.append(f"Version: {stats['version']}")
    report.append(f"Departments: {stats['departments']}")
    report.append(f"Agents: {agents['files']} files ({agents['unique_slugs']} unique slugs)")
    report.append(
        f"Skills (core): {skills['core']} "
        f"(departments {skills['departments']} + arka {skills['arka']})"
    )
    report.append(f" marketplace: {skills['marketplace']}")
    report.append(f"ADRs: {stats['adrs']}")
    report.append(f"Test functions: {tests['functions']}")
    if "collected" in tests:
        report.append(f"Test cases: {tests['collected']} (pytest collected)")
    report.append(rule)
    return "\n".join(report)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def main() -> int:
    """Command-line entry point: gather the stats and print them.

    Command-line options:
        --root PATH     Repo root to inspect (default: auto-detect).
        --json          Emit JSON instead of the text report.
        --with-pytest   Also run a pytest collection for the case count.

    Returns:
        Always 0.
    """
    cli = argparse.ArgumentParser(
        description="ArkaOS docs stats -- canonical documentation counter")
    cli.add_argument("--root", default=None, help="Repo root (default: auto-detect)")
    cli.add_argument("--json", action="store_true", help="Output as JSON")
    cli.add_argument("--with-pytest", action="store_true",
                     help="Also collect authoritative pytest case count")
    opts = cli.parse_args()

    if opts.root:
        root = Path(opts.root).resolve()
    else:
        root = repo_root()

    stats = gather(root, with_pytest=opts.with_pytest)
    if opts.json:
        rendered = json.dumps(stats, indent=2)
    else:
        rendered = format_text(stats)
    print(rendered)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|