@pjmendonca/devflow 1.13.2 → 1.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/agent.md +1 -1
- package/.claude/commands/brainstorm.md +28 -0
- package/.claude/commands/bugfix.md +21 -0
- package/.claude/commands/checkpoint.md +0 -1
- package/.claude/commands/collab.md +0 -1
- package/.claude/commands/costs.md +88 -18
- package/.claude/commands/devflow.md +26 -0
- package/.claude/commands/handoff.md +0 -1
- package/.claude/commands/init.md +383 -0
- package/.claude/commands/memory.md +0 -1
- package/.claude/commands/pair.md +0 -1
- package/.claude/commands/review.md +27 -0
- package/.claude/commands/route.md +0 -1
- package/.claude/commands/swarm.md +0 -1
- package/.claude/commands/validate.md +55 -0
- package/.claude/hooks/session-notification.sh +44 -0
- package/.claude/hooks/session-startup.sh +427 -0
- package/.claude/hooks/session-stop.sh +38 -0
- package/.claude/hooks/session_tracker.py +272 -0
- package/.claude/settings.json +38 -0
- package/.claude/skills/brainstorm/SKILL.md +531 -0
- package/.claude/skills/costs/SKILL.md +156 -0
- package/.claude/skills/validate/SKILL.md +101 -0
- package/CHANGELOG.md +284 -0
- package/README.md +207 -10
- package/bin/devflow-install.js +2 -1
- package/bin/devflow.js +4 -0
- package/lib/constants.js +0 -1
- package/lib/exec-python.js +1 -1
- package/package.json +1 -1
- package/tooling/.automation/.checkpoint_lock +1 -0
- package/tooling/.automation/agents/architect.md +19 -0
- package/tooling/.automation/agents/ba.md +19 -0
- package/tooling/.automation/agents/maintainer.md +19 -0
- package/tooling/.automation/agents/pm.md +19 -0
- package/tooling/.automation/agents/reviewer.md +1 -1
- package/tooling/.automation/agents/writer.md +19 -0
- package/tooling/.automation/benchmarks/benchmark_20251230_100119.json +314 -0
- package/tooling/.automation/benchmarks/benchmark_20251230_100216.json +314 -0
- package/tooling/.automation/costs/config.json +31 -0
- package/tooling/.automation/costs/sessions/2025-12-29_20251229_164128.json +22 -0
- package/tooling/.automation/memory/knowledge/kg_integration-test.json +738 -1
- package/tooling/.automation/memory/knowledge/kg_test-story.json +3381 -2
- package/tooling/.automation/memory/shared/shared_integration-test.json +193 -1
- package/tooling/.automation/memory/shared/shared_test-story.json +757 -1
- package/tooling/.automation/memory/shared/shared_test.json +1332 -0
- package/tooling/.automation/memory/shared/shared_validation-check.json +240 -0
- package/tooling/.automation/overrides/templates/architect/cloud-native.yaml +5 -5
- package/tooling/.automation/overrides/templates/architect/enterprise-architect.yaml +23 -5
- package/tooling/.automation/overrides/templates/architect/pragmatic-minimalist.yaml +24 -6
- package/tooling/.automation/overrides/templates/ba/agile-storyteller.yaml +4 -4
- package/tooling/.automation/overrides/templates/ba/domain-expert.yaml +4 -4
- package/tooling/.automation/overrides/templates/ba/requirements-engineer.yaml +4 -4
- package/tooling/.automation/overrides/templates/dev/performance-engineer.yaml +18 -0
- package/tooling/.automation/overrides/templates/dev/rapid-prototyper.yaml +19 -1
- package/tooling/.automation/overrides/templates/dev/security-focused.yaml +18 -0
- package/tooling/.automation/overrides/templates/dev/user-advocate.yaml +54 -0
- package/tooling/.automation/overrides/templates/maintainer/devops-maintainer.yaml +4 -4
- package/tooling/.automation/overrides/templates/maintainer/legacy-steward.yaml +4 -4
- package/tooling/.automation/overrides/templates/maintainer/oss-maintainer.yaml +4 -4
- package/tooling/.automation/overrides/templates/maintainer/reliability-engineer.yaml +55 -0
- package/tooling/.automation/overrides/templates/pm/agile-pm.yaml +4 -4
- package/tooling/.automation/overrides/templates/pm/hybrid-delivery.yaml +3 -3
- package/tooling/.automation/overrides/templates/pm/traditional-pm.yaml +4 -4
- package/tooling/.automation/overrides/templates/reviewer/quick-sanity.yaml +18 -0
- package/tooling/.automation/overrides/templates/reviewer/thorough-critic.yaml +18 -0
- package/tooling/.automation/overrides/templates/sm/agile-coach.yaml +2 -2
- package/tooling/.automation/overrides/templates/sm/startup-pm.yaml +3 -3
- package/tooling/.automation/overrides/templates/writer/api-documentarian.yaml +5 -5
- package/tooling/.automation/overrides/templates/writer/docs-as-code.yaml +4 -4
- package/tooling/.automation/overrides/templates/writer/user-guide-author.yaml +5 -5
- package/tooling/.automation/validation/history/2025-12-29_val_002a28c1.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_01273bb1.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_03369914.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_07a449ba.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_0df1f0a2.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_10ff3d34.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_110771d7.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_13f3a7f9.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_17ba9d21.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_22247089.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_227ea6a4.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_2335d5ae.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_246824bb.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_28b4b9cd.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_2abd12cc.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_2c801b2f.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_2c8cfa8e.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_2ce76eb0.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_30351948.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_30eb7229.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_34df0e77.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_376e4d6a.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_3a4e8a1a.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_3b77a628.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_3ea4e1cf.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_44aacdb4.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_457ddfa8.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_45af6238.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_4735dba1.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_486b203c.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_49dc56cd.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_4d863d6d.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_5149a808.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_52e0bb43.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_585d6319.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_5b2d859a.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_635a7081.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_64df4905.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_70634cee.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_714553f9.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_7f7bfdbf.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_7faad91d.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_81821f8f.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_8249f3c9.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_8422b50f.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_8446c134.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_879f4e26.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_8b6d5bd7.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_8c5cd787.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_91d20bc7.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_958a12b7.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_95d91108.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_980dbb74.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_9e40c79b.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_9f499b7c.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_9f7c3b57.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_a30d5bd4.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_a6eb09c7.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_a86f7b83.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_ad5347e1.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_b0a5a993.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_bcb0192e.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_bf3c9aaa.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_c461ff88.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_c4f4e258.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_c7f0fa6d.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_c911b0e6.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_cc581964.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_cdd5a33b.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_cfd42495.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_d1c7a4ee.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_d2280d0e.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_d2a6ff69.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_d8c53ab2.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_d9c1247a.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_d9d58569.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_dabb4fd9.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_dd8fe359.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_decdffc9.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_e3a95476.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_e776dfca.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_ea70969f.json +59 -0
- package/tooling/.automation/validation/history/2025-12-29_val_ef41ea95.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_f384f9b1.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_f8adc38c.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_fa40b69e.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_fc538d54.json +41 -0
- package/tooling/.automation/validation/history/2025-12-29_val_fe814665.json +32 -0
- package/tooling/.automation/validation/history/2025-12-29_val_ffea4b12.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_02d001e5.json +59 -0
- package/tooling/.automation/validation/history/2025-12-30_val_0b8966dc.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_15455fbf.json +59 -0
- package/tooling/.automation/validation/history/2025-12-30_val_157e34b9.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_28d1d933.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_3442a52c.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_37f1ce1e.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_4f1d8a93.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_56ff1de3.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_664fd4e2.json +41 -0
- package/tooling/.automation/validation/history/2025-12-30_val_66afb0a7.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_7634663c.json +41 -0
- package/tooling/.automation/validation/history/2025-12-30_val_8ea830c3.json +41 -0
- package/tooling/.automation/validation/history/2025-12-30_val_998957c2.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_a52177db.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_a5b65a63.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_ae391d0e.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_c7895339.json +41 -0
- package/tooling/.automation/validation/history/2025-12-30_val_ca416593.json +41 -0
- package/tooling/.automation/validation/history/2025-12-30_val_cee19422.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_ddd4f4e6.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_f2e1394b.json +32 -0
- package/tooling/.automation/validation/history/2025-12-30_val_f4a7fa06.json +41 -0
- package/tooling/.automation/validation/history/2025-12-30_val_ffea3369.json +32 -0
- package/tooling/.automation/validation/history/2026-01-03_val_1287a74c.json +41 -0
- package/tooling/.automation/validation/history/2026-01-03_val_3b24071f.json +32 -0
- package/tooling/.automation/validation/history/2026-01-03_val_44d77573.json +32 -0
- package/tooling/.automation/validation/history/2026-01-03_val_5b31dc51.json +32 -0
- package/tooling/.automation/validation/history/2026-01-03_val_74267244.json +32 -0
- package/tooling/.automation/validation/history/2026-01-03_val_8b2d95c7.json +59 -0
- package/tooling/.automation/validation/history/2026-01-03_val_d875b297.json +41 -0
- package/tooling/.automation/validation-config.yaml +103 -0
- package/tooling/completions/DevflowCompletion.ps1 +21 -21
- package/tooling/completions/_run-story +3 -3
- package/tooling/completions/run-story-completion.bash +8 -8
- package/tooling/docs/DOC-STANDARD.md +14 -14
- package/tooling/docs/stories/.gitkeep +0 -0
- package/tooling/docs/templates/brainstorm-guide.md +314 -0
- package/tooling/docs/templates/migration-spec.md +4 -4
- package/tooling/docs/templates/story.md +66 -0
- package/tooling/scripts/context_checkpoint.py +5 -15
- package/tooling/scripts/cost_dashboard.py +610 -13
- package/tooling/scripts/create-persona.py +1 -12
- package/tooling/scripts/create-persona.sh +44 -44
- package/tooling/scripts/lib/__init__.py +12 -1
- package/tooling/scripts/lib/agent_handoff.py +11 -2
- package/tooling/scripts/lib/agent_router.py +31 -10
- package/tooling/scripts/lib/colors.py +106 -0
- package/tooling/scripts/lib/context_monitor.py +766 -0
- package/tooling/scripts/lib/cost_config.py +229 -10
- package/tooling/scripts/lib/cost_display.py +20 -45
- package/tooling/scripts/lib/cost_tracker.py +462 -15
- package/tooling/scripts/lib/currency_converter.py +28 -5
- package/tooling/scripts/lib/pair_programming.py +102 -3
- package/tooling/scripts/lib/personality_system.py +949 -0
- package/tooling/scripts/lib/platform.py +55 -0
- package/tooling/scripts/lib/shared_memory.py +9 -3
- package/tooling/scripts/lib/swarm_orchestrator.py +514 -75
- package/tooling/scripts/lib/validation_loop.py +1014 -0
- package/tooling/scripts/memory_summarize.py +9 -2
- package/tooling/scripts/new-doc.py +2 -9
- package/tooling/scripts/personalize_agent.py +1 -12
- package/tooling/scripts/rollback-migration.sh +60 -60
- package/tooling/scripts/run-collab.ps1 +16 -16
- package/tooling/scripts/run-collab.py +88 -53
- package/tooling/scripts/run-collab.sh +4 -4
- package/tooling/scripts/run-story.py +278 -20
- package/tooling/scripts/run-story.sh +3 -3
- package/tooling/scripts/setup-checkpoint-service.py +2 -9
- package/tooling/scripts/tech-debt-tracker.py +1 -12
- package/tooling/scripts/test_adversarial_swarm.py +452 -0
- package/tooling/scripts/validate-overrides.py +1 -10
- package/tooling/scripts/validate-overrides.sh +40 -40
- package/tooling/scripts/validate_loop.py +162 -0
- package/tooling/scripts/validate_setup.py +2 -30
- package/.claude/skills/init/SKILL.md +0 -496
|
@@ -288,7 +288,7 @@ main() {
|
|
|
288
288
|
swarm_args="$swarm_args --max-iterations $max_iterations"
|
|
289
289
|
python3 "$SCRIPT_DIR/run-collab.py" $swarm_args
|
|
290
290
|
local exit_code=$?
|
|
291
|
-
|
|
291
|
+
|
|
292
292
|
if [[ $exit_code -eq 0 && "$AUTO_COMMIT" == "true" ]]; then
|
|
293
293
|
auto_commit_changes "$story_key"
|
|
294
294
|
fi
|
|
@@ -298,7 +298,7 @@ main() {
|
|
|
298
298
|
echo ""
|
|
299
299
|
python3 "$SCRIPT_DIR/run-collab.py" "$story_key" --pair --max-revisions "$max_iterations"
|
|
300
300
|
local exit_code=$?
|
|
301
|
-
|
|
301
|
+
|
|
302
302
|
if [[ $exit_code -eq 0 && "$AUTO_COMMIT" == "true" ]]; then
|
|
303
303
|
auto_commit_changes "$story_key"
|
|
304
304
|
fi
|
|
@@ -308,7 +308,7 @@ main() {
|
|
|
308
308
|
echo ""
|
|
309
309
|
python3 "$SCRIPT_DIR/run-collab.py" "$story_key" --auto
|
|
310
310
|
local exit_code=$?
|
|
311
|
-
|
|
311
|
+
|
|
312
312
|
if [[ $exit_code -eq 0 && "$AUTO_COMMIT" == "true" ]]; then
|
|
313
313
|
auto_commit_changes "$story_key"
|
|
314
314
|
fi
|
|
@@ -22,16 +22,9 @@ import sys
|
|
|
22
22
|
from pathlib import Path
|
|
23
23
|
|
|
24
24
|
SCRIPT_DIR = Path(__file__).parent
|
|
25
|
+
sys.path.insert(0, str(SCRIPT_DIR / "lib"))
|
|
25
26
|
|
|
26
|
-
|
|
27
|
-
def get_platform():
|
|
28
|
-
"""Detect the current platform."""
|
|
29
|
-
if sys.platform == "win32":
|
|
30
|
-
return "windows"
|
|
31
|
-
elif sys.platform == "darwin":
|
|
32
|
-
return "macos"
|
|
33
|
-
else:
|
|
34
|
-
return "linux"
|
|
27
|
+
from platform import get_platform
|
|
35
28
|
|
|
36
29
|
|
|
37
30
|
def run_windows(action):
|
|
@@ -28,18 +28,7 @@ from datetime import datetime
|
|
|
28
28
|
from pathlib import Path
|
|
29
29
|
from typing import Any
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
# Colors for terminal output
|
|
33
|
-
class Colors:
|
|
34
|
-
RED = "\033[0;31m"
|
|
35
|
-
GREEN = "\033[0;32m"
|
|
36
|
-
YELLOW = "\033[1;33m"
|
|
37
|
-
BLUE = "\033[0;34m"
|
|
38
|
-
CYAN = "\033[0;36m"
|
|
39
|
-
MAGENTA = "\033[0;35m"
|
|
40
|
-
BOLD = "\033[1m"
|
|
41
|
-
NC = "\033[0m"
|
|
42
|
-
|
|
31
|
+
from lib.colors import Colors
|
|
43
32
|
|
|
44
33
|
# Debt indicator patterns
|
|
45
34
|
DEBT_PATTERNS = {
|
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Adversarial Swarm Test Harness and Performance Tracker
|
|
4
|
+
|
|
5
|
+
Tests the adversarial swarm system and tracks performance metrics
|
|
6
|
+
to identify trends like diminishing returns.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python3 tooling/scripts/test_adversarial_swarm.py [--runs N] [--plot]
|
|
10
|
+
|
|
11
|
+
Metrics tracked per round:
|
|
12
|
+
- New arguments introduced
|
|
13
|
+
- Challenges raised
|
|
14
|
+
- Concessions made
|
|
15
|
+
- Agreement score delta
|
|
16
|
+
- Token usage
|
|
17
|
+
- Unique issues identified
|
|
18
|
+
|
|
19
|
+
Outputs:
|
|
20
|
+
- JSON results in tooling/.automation/benchmarks/
|
|
21
|
+
- Performance plots (if --plot flag used)
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import json
|
|
26
|
+
import sys
|
|
27
|
+
from dataclasses import asdict, dataclass, field
|
|
28
|
+
from datetime import datetime
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Optional
|
|
31
|
+
|
|
32
|
+
# Make the sibling ``lib`` directory importable; the path is derived from this
# file's location rather than from the current working directory.
sys.path.insert(0, str(Path(__file__).parent / "lib"))

try:
    from personality_system import (
        ConvergenceDetector,
        PersonalitySelector,
    )
except ImportError as exc:
    # Report on stderr and surface the underlying error: an ImportError raised
    # from inside personality_system (or a missing name) would otherwise look
    # identical to the module file not being found.
    print(
        "[ERROR] Could not import personality_system. Run from project root.",
        file=sys.stderr,
    )
    print(f"[ERROR] Cause: {exc}", file=sys.stderr)
    sys.exit(1)


# This script lives in tooling/scripts/, so three levels up is the project root.
PROJECT_ROOT = Path(__file__).parent.parent.parent
# Default destination for benchmark JSON output.
BENCHMARK_DIR = PROJECT_ROOT / "tooling" / ".automation" / "benchmarks"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class RoundMetrics:
    """Measurements recorded for one round of a swarm debate.

    Only ``round_num`` is required; every other metric defaults to zero so a
    round can be created first and filled in as values become available.
    """

    # Identifier of the round these metrics belong to.
    round_num: int

    # Debate activity counters.
    new_arguments: int = 0
    challenges_raised: int = 0
    concessions_made: int = 0

    # Agreement level after the round and its change from the previous round.
    agreement_score: float = 0.0
    agreement_delta: float = 0.0

    # Count of unique issues identified.
    unique_issues: int = 0

    # Resource usage.
    tokens_used: int = 0
    cost_usd: float = 0.0

    # How many agents changed position
    positions_changed: int = 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class SwarmBenchmarkResult:
    """Everything recorded about one complete swarm benchmark run."""

    # Identity and configuration of the run.
    test_id: str
    task: str
    agents: list[str]
    personas_used: list[str]

    # Outcome totals.
    total_rounds: int
    termination_reason: str
    final_agreement_score: float
    total_tokens: int
    total_cost_usd: float

    # Per-round detail and timing.
    rounds: list[RoundMetrics] = field(default_factory=list)
    timestamp: str = ""
    duration_seconds: float = 0.0

    # Derived metrics
    arguments_per_round: list[int] = field(default_factory=list)
    agreement_progression: list[float] = field(default_factory=list)
    marginal_value: list[float] = field(default_factory=list)  # Value gained per round

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result.

        Each entry of ``rounds`` is converted with ``dataclasses.asdict``;
        all other attributes are placed in the dict as-is. Key order is
        fixed: scalar fields, then ``rounds``, then the derived series.
        """
        leading_keys = (
            "test_id",
            "task",
            "agents",
            "personas_used",
            "total_rounds",
            "termination_reason",
            "final_agreement_score",
            "total_tokens",
            "total_cost_usd",
            "timestamp",
            "duration_seconds",
        )
        trailing_keys = (
            "arguments_per_round",
            "agreement_progression",
            "marginal_value",
        )

        payload = {key: getattr(self, key) for key in leading_keys}
        payload["rounds"] = [asdict(entry) for entry in self.rounds]
        for key in trailing_keys:
            payload[key] = getattr(self, key)
        return payload
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class AdversarialSwarmTester:
    """Run simulated adversarial-swarm debates and aggregate their metrics.

    Every run is appended to ``self.results`` so that a batch can be saved
    to JSON or summarised in one go.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """Create the tester and ensure the output directory exists.

        Args:
            output_dir: Directory for benchmark JSON files. ``BENCHMARK_DIR``
                is used when this is omitted (or otherwise falsy).
        """
        self.output_dir = output_dir or BENCHMARK_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.results: list[SwarmBenchmarkResult] = []

    def run_simulated_test(
        self,
        task: str,
        agents: list[str],
        max_rounds: int = 3,
    ) -> SwarmBenchmarkResult:
        """Run one simulated debate without making any LLM calls.

        Personas are chosen with ``PersonalitySelector``; the per-round
        numbers are random values scaled by a decay factor of ``0.7**round``.
        Progress is printed to stdout and the result is appended to
        ``self.results``.

        Args:
            task: Task description handed to the persona selector.
            agents: Agent type names taking part in the debate.
            max_rounds: Upper bound on the number of simulated rounds.

        Returns:
            The populated ``SwarmBenchmarkResult`` for this run.
        """
        import random
        import time

        # A second-resolution timestamp alone is not unique: simulated runs
        # finish in well under a second, so every run of a batch used to get
        # the same id. A per-tester sequence number keeps ids distinct.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        test_id = f"sim_{stamp}_{len(self.results) + 1:03d}"
        start_time = time.time()

        print(f"\n[TEST] Starting simulated adversarial swarm: {test_id}")
        print(f" Task: {task[:60]}...")
        print(f" Agents: {', '.join(agents)}")

        # Select personas
        selector = PersonalitySelector()
        personas = selector.select_adversarial_personas(task, len(agents), agents)

        print(" Selected personas:")
        for p in personas:
            stance = p.adversarial_stance.primary_concern if p.adversarial_stance else "general"
            print(f" - {p.name} ({p.agent_type}) [Focus: {stance}]")

        # The detector is only instantiated here; the simulation below uses a
        # fixed agreement threshold and never consults it.
        _detector = ConvergenceDetector(similarity_threshold=0.8, stability_rounds=2)
        rounds: list[RoundMetrics] = []
        prev_agreement = 0.0
        converged = False

        for round_num in range(max_rounds):
            # Novelty shrinks geometrically with each round.
            decay_factor = 0.7**round_num
            new_args = int(random.randint(3, 8) * decay_factor) + 1
            challenges = int(random.randint(2, 5) * decay_factor)
            concessions = int(random.randint(0, 2) * (1 - decay_factor) + round_num * 0.5)

            # Agreement rises each round, by a decaying amount, capped at 1.0.
            agreement_increase = random.uniform(0.1, 0.25) * decay_factor
            agreement = min(1.0, prev_agreement + agreement_increase)

            tokens = random.randint(500, 1500)
            cost = tokens * 0.00001  # Rough estimate

            rounds.append(
                RoundMetrics(
                    round_num=round_num,
                    new_arguments=new_args,
                    challenges_raised=challenges,
                    concessions_made=concessions,
                    agreement_score=agreement,
                    agreement_delta=agreement - prev_agreement,
                    unique_issues=max(0, int((8 - round_num) * decay_factor)),
                    tokens_used=tokens,
                    cost_usd=cost,
                    positions_changed=max(0, int(len(agents) * decay_factor * 0.5)),
                )
            )

            print(
                f" Round {round_num + 1}: Agreement={agreement:.0%}, "
                f"NewArgs={new_args}, Challenges={challenges}, Concessions={concessions}"
            )

            prev_agreement = agreement

            # Simulated convergence needs high agreement and at least two rounds.
            # NOTE(review): with the constants above the total increase is
            # bounded by 0.25 / (1 - 0.7) ~= 0.83, so 0.85 is never reached.
            if agreement > 0.85 and round_num >= 1:
                print(" [CONVERGED] High agreement reached")
                converged = True
                break

        duration = time.time() - start_time
        result = SwarmBenchmarkResult(
            test_id=test_id,
            task=task,
            agents=agents,
            personas_used=[p.name for p in personas],
            total_rounds=len(rounds),
            # Report how the loop actually ended instead of re-deriving it
            # from the score with a different condition.
            termination_reason="convergence" if converged else "max_rounds",
            final_agreement_score=prev_agreement,
            total_tokens=sum(r.tokens_used for r in rounds),
            total_cost_usd=sum(r.cost_usd for r in rounds),
            rounds=rounds,
            timestamp=datetime.now().isoformat(),
            duration_seconds=duration,
            arguments_per_round=[r.new_arguments for r in rounds],
            agreement_progression=[r.agreement_score for r in rounds],
            marginal_value=self._calculate_marginal_value(rounds),
        )

        self.results.append(result)
        return result

    def _calculate_marginal_value(self, rounds: list[RoundMetrics]) -> list[float]:
        """Return the value gained per 1K tokens for each round.

        Value is ``new_arguments + challenges_raised + agreement_delta * 10``.
        A round that used no tokens contributes ``0.0`` rather than dividing
        by zero.
        """
        marginal = []
        for r in rounds:
            if r.tokens_used == 0:
                marginal.append(0.0)
                continue
            value = r.new_arguments + r.challenges_raised + r.agreement_delta * 10
            marginal.append(value / (r.tokens_used / 1000))  # Per 1K tokens
        return marginal

    def run_batch_tests(self, num_runs: int = 5) -> list[SwarmBenchmarkResult]:
        """Run ``num_runs`` simulated tests, cycling through fixed inputs.

        Run ``i`` uses task ``i % len(test_tasks)`` and agent combination
        ``i % len(agent_combos)``.

        Returns:
            The results of this batch, in run order. They are also appended
            to ``self.results`` by ``run_simulated_test``.
        """
        test_tasks = [
            "Design a secure authentication system with OAuth2 and JWT",
            "Implement a caching layer for the API with Redis",
            "Refactor the monolith into microservices",
            "Add rate limiting to protect against DDoS",
            "Design a real-time notification system",
            "Implement a data pipeline for analytics",
            "Create a plugin architecture for extensibility",
            "Design a multi-tenant database schema",
            "Implement end-to-end encryption for messages",
            "Build a recommendation engine using collaborative filtering",
        ]

        agent_combos = [
            ["ARCHITECT", "DEV", "REVIEWER"],
            ["DEV", "REVIEWER", "SECURITY"],
            ["ARCHITECT", "DEV", "MAINTAINER"],
        ]

        return [
            self.run_simulated_test(
                test_tasks[i % len(test_tasks)],
                agent_combos[i % len(agent_combos)],
            )
            for i in range(num_runs)
        ]

    def save_results(self, filename: Optional[str] = None) -> Path:
        """Write all accumulated results to a JSON file in ``output_dir``.

        Args:
            filename: File name to use. Defaults to
                ``benchmark_<YYYYmmdd_HHMMSS>.json``.

        Returns:
            Path of the file that was written.
        """
        if not filename:
            filename = f"benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        filepath = self.output_dir / filename
        # Explicit encoding so the output does not depend on the platform default.
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump([r.to_dict() for r in self.results], f, indent=2)

        print(f"\n[OK] Results saved to {filepath}")
        return filepath

    def generate_summary(self) -> dict:
        """Aggregate statistics over every run recorded so far.

        Returns:
            An empty dict when there are no results. Otherwise a dict with
            averages for rounds, agreement and cost, the mean marginal value
            per round position, the 1-based round at which marginal value
            first falls below half of the previous round's (``None`` if it
            never does), and the fraction of runs that ended in convergence.
        """
        if not self.results:
            return {}

        total_runs = len(self.results)
        avg_rounds = sum(r.total_rounds for r in self.results) / total_runs
        avg_agreement = sum(r.final_agreement_score for r in self.results) / total_runs
        avg_cost = sum(r.total_cost_usd for r in self.results) / total_runs

        # Average marginal value at each round position. Runs that stopped
        # earlier simply do not contribute to later positions.
        max_rounds = max(r.total_rounds for r in self.results)
        avg_marginal_by_round = []
        for round_idx in range(max_rounds):
            values = [
                result.marginal_value[round_idx]
                for result in self.results
                if round_idx < len(result.marginal_value)
            ]
            if values:
                avg_marginal_by_round.append(sum(values) / len(values))

        # First round whose average value drops below half of the previous one.
        diminishing_point = None
        for i in range(1, len(avg_marginal_by_round)):
            if avg_marginal_by_round[i] < avg_marginal_by_round[i - 1] * 0.5:
                diminishing_point = i + 1  # report as a 1-based round number
                break

        converged_runs = sum(1 for r in self.results if r.termination_reason == "convergence")

        return {
            "total_runs": total_runs,
            "avg_rounds": avg_rounds,
            "avg_agreement_score": avg_agreement,
            "avg_cost_usd": avg_cost,
            "avg_marginal_value_by_round": avg_marginal_by_round,
            "diminishing_returns_round": diminishing_point,
            "convergence_rate": converged_runs / total_runs,
        }
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _plot_series_by_round(ax, results, attr, *, marker, ylabel, title, labelled=False):
    """Draw one line per run for the per-round series stored in attribute *attr*."""
    for result in results:
        series = getattr(result, attr)
        line_style = {"marker": marker, "alpha": 0.7}
        if labelled:
            line_style["label"] = result.test_id[:12]
        ax.plot(range(1, len(series) + 1), series, **line_style)
    ax.set_xlabel("Round")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


def plot_results(results: list[SwarmBenchmarkResult], output_path: Optional[Path] = None) -> None:
    """Generate performance plots from benchmark results.

    Draws a 2x2 grid: agreement progression, new arguments per round,
    marginal value per round, and a cost-vs-agreement scatter coloured by
    the number of rounds.

    Args:
        results: Benchmark results to plot. When empty, a warning is printed
            and nothing is drawn.
        output_path: Where to save the figure. When omitted (or falsy) the
            figure is shown with ``plt.show()`` instead of being saved.

    Returns:
        None. If matplotlib is not installed a warning is printed and the
        function returns without plotting.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("[WARNING] matplotlib not installed. Run: pip install matplotlib")
        return

    if not results:
        print("[WARNING] No results to plot")
        return

    # Beyond this many runs a per-run legend would cover the plot itself.
    max_legend_entries = 10

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle("Adversarial Swarm Performance Analysis", fontsize=14, fontweight="bold")

    # Plot 1: Agreement progression over rounds
    ax1 = axes[0, 0]
    _plot_series_by_round(
        ax1,
        results,
        "agreement_progression",
        marker="o",
        ylabel="Agreement Score",
        title="Agreement Progression Over Rounds",
        labelled=True,
    )
    ax1.set_ylim(0, 1.1)
    # The lines carry per-run labels; without a legend they are never displayed.
    if len(results) <= max_legend_entries:
        ax1.legend(fontsize="small")

    # Plot 2: New arguments per round (diminishing returns)
    _plot_series_by_round(
        axes[0, 1],
        results,
        "arguments_per_round",
        marker="s",
        ylabel="New Arguments",
        title="New Arguments Per Round (Diminishing Returns)",
    )

    # Plot 3: Marginal value per round
    _plot_series_by_round(
        axes[1, 0],
        results,
        "marginal_value",
        marker="^",
        ylabel="Marginal Value (per 1K tokens)",
        title="Marginal Value Per Round",
    )

    # Plot 4: Cost vs Agreement scatter
    ax4 = axes[1, 1]
    costs = [r.total_cost_usd for r in results]
    agreements = [r.final_agreement_score for r in results]
    rounds = [r.total_rounds for r in results]

    scatter = ax4.scatter(costs, agreements, c=rounds, cmap="viridis", s=100, alpha=0.7)
    ax4.set_xlabel("Total Cost (USD)")
    ax4.set_ylabel("Final Agreement Score")
    ax4.set_title("Cost vs Agreement (color = rounds)")
    ax4.grid(True, alpha=0.3)
    fig.colorbar(scatter, ax=ax4, label="Rounds")

    fig.tight_layout()

    if output_path:
        try:
            fig.savefig(output_path, dpi=150, bbox_inches="tight")
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # release it once it has been written so repeated calls do not
            # accumulate open figures.
            plt.close(fig)
        print(f"[OK] Plot saved to {output_path}")
    else:
        plt.show()
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _render_bar(value: float, width: int = 20, scale: float = 2.0) -> str:
    """Return a fixed-width ASCII bar for *value*, clamped to the bar's width."""
    filled = max(0, min(width, int(value * scale)))
    return "[" + "=" * filled + " " * (width - filled) + "]"


def main():
    """Run the adversarial swarm test harness from the command line.

    Parses ``--runs``, ``--plot`` and ``--output``, runs the batch of
    simulated tests, saves the results, prints a summary and optionally
    writes a performance plot.

    Returns:
        0 on success, or 1 when the batch produced no results and there is
        therefore nothing to summarize.
    """
    parser = argparse.ArgumentParser(description="Test adversarial swarm performance")
    parser.add_argument("--runs", type=int, default=5, help="Number of test runs")
    parser.add_argument("--plot", action="store_true", help="Generate performance plots")
    parser.add_argument("--output", type=str, help="Output filename for results")
    args = parser.parse_args()

    print("=" * 60)
    print(" ADVERSARIAL SWARM TEST HARNESS")
    print("=" * 60)

    tester = AdversarialSwarmTester()

    # Run batch tests
    print(f"\n[INFO] Running {args.runs} simulated tests...")
    results = tester.run_batch_tests(args.runs)

    # Save results
    tester.save_results(args.output)

    # Generate summary
    summary = tester.generate_summary()
    if not summary:
        # generate_summary() returns an empty dict when there are no results
        # (e.g. --runs 0); indexing it below would raise KeyError.
        print("\n[WARNING] No results were produced; nothing to summarize")
        return 1

    print("\n" + "=" * 60)
    print(" SUMMARY")
    print("=" * 60)
    print(f" Total runs: {summary['total_runs']}")
    print(f" Average rounds: {summary['avg_rounds']:.1f}")
    print(f" Average agreement: {summary['avg_agreement_score']:.0%}")
    print(f" Average cost: ${summary['avg_cost_usd']:.4f}")
    print(f" Convergence rate: {summary['convergence_rate']:.0%}")

    if summary.get("diminishing_returns_round"):
        print(
            f"\n [INSIGHT] Diminishing returns detected at round {summary['diminishing_returns_round']}"
        )
        print(" Consider limiting debates to this many rounds for efficiency.")

    print("\n Marginal value by round:")
    for i, val in enumerate(summary.get("avg_marginal_value_by_round", [])):
        # Clamped so that very large or negative values keep the column aligned.
        bar = _render_bar(val)
        print(f" Round {i + 1}: {bar} {val:.2f}")

    # Generate plots
    if args.plot:
        plot_path = (
            BENCHMARK_DIR / f"performance_plot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
        )
        plot_results(results, plot_path)

    print("\n[OK] Test harness complete")
    return 0
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -28,16 +28,7 @@ from dataclasses import dataclass, field
|
|
|
28
28
|
from pathlib import Path
|
|
29
29
|
from typing import Any
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
# Colors for terminal output
|
|
33
|
-
class Colors:
|
|
34
|
-
RED = "\033[0;31m"
|
|
35
|
-
GREEN = "\033[0;32m"
|
|
36
|
-
YELLOW = "\033[1;33m"
|
|
37
|
-
BLUE = "\033[0;34m"
|
|
38
|
-
CYAN = "\033[0;36m"
|
|
39
|
-
NC = "\033[0m" # No Color
|
|
40
|
-
|
|
31
|
+
from lib.colors import Colors
|
|
41
32
|
|
|
42
33
|
# Valid values
|
|
43
34
|
VALID_MODELS = ["sonnet", "opus", "haiku"]
|