create-byan-agent 2.4.6 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +289 -289
- package/install/templates/.claude/CLAUDE.md +24 -0
- package/install/templates/.claude/rules/byan-agents.md +3 -1
- package/install/templates/.claude/rules/elo-trust.md +78 -0
- package/install/templates/.claude/rules/fact-check.md +109 -0
- package/install/templates/workers/fact-check-worker.js +102 -0
- package/package.json +2 -2
- package/src/byan-v2/context/session-state.js +17 -1
- package/src/byan-v2/dispatcher/five-whys-analyzer.js +11 -2
- package/src/byan-v2/elo/challenge-evaluator.js +121 -0
- package/src/byan-v2/elo/domain-config.js +129 -0
- package/src/byan-v2/elo/elo-store.js +159 -0
- package/src/byan-v2/elo/glicko2.js +105 -0
- package/src/byan-v2/elo/index.js +163 -0
- package/src/byan-v2/elo/llm-router.js +47 -0
- package/src/byan-v2/elo/pedagogy-layer.js +170 -0
- package/src/byan-v2/fact-check/claim-parser.js +51 -0
- package/src/byan-v2/fact-check/fact-sheet.js +96 -0
- package/src/byan-v2/fact-check/index.js +263 -0
- package/src/byan-v2/fact-check/knowledge-graph.js +152 -0
- package/src/byan-v2/fact-check/level-scorer.js +45 -0
- package/src/byan-v2/index.js +155 -1
- package/src/byan-v2/orchestrator/glossary-builder.js +8 -1
- package/install/MARC-COPILOT-CLI-TEST-GUIDE.md +0 -441
- package/install/MARC-VALIDATION-REPORT.md +0 -629
- package/install/MARC-VALIDATION-SUMMARY.md +0 -220
- package/install/README-PUBLICATION-V1.0.4.md +0 -291
|
@@ -42,3 +42,27 @@ Voir @.claude/rules/hermes-dispatcher.md pour les commandes Hermes.
|
|
|
42
42
|
- `@hermes` → Dispatcher universel (recommandations, routage, pipelines)
|
|
43
43
|
- Agents disponibles: voir @.claude/rules/byan-agents.md
|
|
44
44
|
- Methodologie: voir @.claude/rules/merise-agile.md
|
|
45
|
+
- Systeme de confiance epistemique: voir @.claude/rules/elo-trust.md
|
|
46
|
+
- Protocole fact-check scientifique: voir @.claude/rules/fact-check.md
|
|
47
|
+
|
|
48
|
+
## ELO Trust System
|
|
49
|
+
|
|
50
|
+
BYAN calibre l'intensite de ses challenges selon votre score ELO par domaine.
|
|
51
|
+
Score bas → explications pedagogiques et scaffolding. Score eleve → aller droit au but.
|
|
52
|
+
|
|
53
|
+
Commandes CLI:
|
|
54
|
+
- `node bin/byan-v2-cli.js elo summary` — voir tous les scores par domaine
|
|
55
|
+
- `node bin/byan-v2-cli.js elo dashboard {domain}` — detail d'un domaine
|
|
56
|
+
- `node bin/byan-v2-cli.js elo declare {domain} {level}` — declarer son expertise (junior/mid/senior/lead/expert)
|
|
57
|
+
|
|
58
|
+
Dans l'agent BYAN, tapez `[ELO]` pour acceder au menu ELO.
|
|
59
|
+
|
|
60
|
+
## Fact-Check Scientifique
|
|
61
|
+
|
|
62
|
+
BYAN applique Zero Trust sur lui-meme : tout claim doit etre demonstrable, quantifiable, reproductible.
|
|
63
|
+
4 types d'assertions : `[REASONING]` `[HYPOTHESIS]` `[CLAIM Ln]` `[FACT USER-VERIFIED]`
|
|
64
|
+
5 niveaux de preuve : L1 (spec officielle, 95%) → L5 (opinion, 20%)
|
|
65
|
+
Domaines stricts : security/performance → LEVEL-2 minimum, compliance → LEVEL-1 minimum, sinon BLOCKED.
|
|
66
|
+
|
|
67
|
+
Agent dédié: `@fact-checker` — analyse assertions, audits de documents, chaines de raisonnement.
|
|
68
|
+
Dans BYAN: tapez `[FC]` pour le sous-menu fact-check.
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
|
|
14
14
|
| Agent | Persona | Role |
|
|
15
15
|
|-------|---------|------|
|
|
16
|
-
| **byan** | Builder | Createur d'agents via interview (12 questions, 64 mantras) |
|
|
16
|
+
| **byan** | Builder | Createur d'agents via interview (12 questions, 64 mantras) — [FC] + [ELO] intégrés |
|
|
17
|
+
| **fact-checker** | Scientifique | Fact-check: assertions, audits de documents, chaines de raisonnement |
|
|
17
18
|
| **agent-builder** | Constructeur | Expert en construction d'agents |
|
|
18
19
|
| **marc** | Specialiste | Integration GitHub Copilot |
|
|
19
20
|
| **rachid** | Specialiste | Deploiement NPM/NPX |
|
|
@@ -63,6 +64,7 @@
|
|
|
63
64
|
| `code-review` | Revoir du code |
|
|
64
65
|
| `quick-spec` | Spec rapide conversationnelle |
|
|
65
66
|
| `quick-dev` | Dev rapide (brownfield) |
|
|
67
|
+
| `elo-workflow` | Consulter et gerer le score ELO (via menu [ELO] du BYAN) |
|
|
66
68
|
|
|
67
69
|
## Comment Invoquer un Agent
|
|
68
70
|
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# ELO Trust System — Epistemic Trust Protocol
|
|
2
|
+
|
|
3
|
+
## Principe
|
|
4
|
+
|
|
5
|
+
BYAN mesure la fiabilite des assertions de l'utilisateur par domaine technique
|
|
6
|
+
en utilisant un algorithme Glicko-2 simplifie (echelle 0-1000).
|
|
7
|
+
Plus le score est eleve, moins le challenge est intense et plus la reponse est concise.
|
|
8
|
+
|
|
9
|
+
## Domaines supportes
|
|
10
|
+
|
|
11
|
+
| Domaine | K-factor |
|
|
12
|
+
|---------|----------|
|
|
13
|
+
| security | ×1.5 |
|
|
14
|
+
| compliance | ×1.5 |
|
|
15
|
+
| performance | ×1.2 |
|
|
16
|
+
| javascript, typescript, nodejs, python, rust, go | ×1.0 |
|
|
17
|
+
| algorithms | ×0.8 |
|
|
18
|
+
|
|
19
|
+
## Paliers ELO
|
|
20
|
+
|
|
21
|
+
| Plage | Label | Comportement BYAN |
|
|
22
|
+
|-------|-------|-------------------|
|
|
23
|
+
| 0-200 | Apprenti | Explications completes, analogies, scaffold maximal |
|
|
24
|
+
| 201-449 | Debutant | Guide pas-a-pas, verification frequente |
|
|
25
|
+
| 450-550 | Zone morte | Challenge intense (Dunning-Kruger peak) |
|
|
26
|
+
| 551-750 | Intermediaire | Challenge modere, hypotheses testees |
|
|
27
|
+
| 751-900 | Avance | Challenge minimal, discussion paire-a-paire |
|
|
28
|
+
| 901-1000 | Expert | Reponses courtes, pas d'explications basiques |
|
|
29
|
+
|
|
30
|
+
## Routage LLM (experimental)
|
|
31
|
+
|
|
32
|
+
| ELO max | Modele |
|
|
33
|
+
|---------|--------|
|
|
34
|
+
| 0-200 | claude-opus (raisonnement profond) |
|
|
35
|
+
| 201-600 | claude-sonnet (equilibre) |
|
|
36
|
+
| 601+ | claude-haiku (concis, expert autonome) |
|
|
37
|
+
|
|
38
|
+
## Protocole de challenge
|
|
39
|
+
|
|
40
|
+
Quand l'agent BYAN evalue un claim sur un domaine:
|
|
41
|
+
1. Recupere le score ELO du domaine via `node bin/byan-v2-cli.js elo context {domain}`
|
|
42
|
+
2. Applique le `promptInstructions` retourne (ton, profondeur, scaffold)
|
|
43
|
+
3. Ton invariant: TOUJOURS curieux, JAMAIS accusatoire ("qu'est-ce qui t'a amene a ca?" vs "c'est faux")
|
|
44
|
+
4. Apres echange: enregistre le resultat `VALIDATED | BLOCKED | PARTIAL` via CLI
|
|
45
|
+
5. Ce protocole est silencieux — l'utilisateur voit seulement le challenge, pas les mecaniques ELO
|
|
46
|
+
|
|
47
|
+
## Mecaniques speciales (V2)
|
|
48
|
+
|
|
49
|
+
- **Tilt detector**: 3 BLOCKED consecutifs → BYAN propose une pause pedagogique
|
|
50
|
+
- **First blood**: premier claim dans un domaine vierge = toujours challenge (Zero Trust)
|
|
51
|
+
- **Zone morte 450-550**: incertitude maximale, challenge le plus nuance
|
|
52
|
+
- **ELO farming protection**: claims trop faciles → K-factor reduit automatiquement
|
|
53
|
+
- **Hot hand**: 3 corrects consecutifs → petit boost de K (puis regression vers la moyenne)
|
|
54
|
+
- **Shadow challenger**: expert (750+) peut activer un alter-ego adversarial opt-in
|
|
55
|
+
|
|
56
|
+
## Commandes CLI
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
node bin/byan-v2-cli.js elo summary # tous les domaines
|
|
60
|
+
node bin/byan-v2-cli.js elo dashboard {domain} # detail d'un domaine
|
|
61
|
+
node bin/byan-v2-cli.js elo context {domain} # contexte pour un challenge
|
|
62
|
+
node bin/byan-v2-cli.js elo record {domain} {VALIDATED|BLOCKED|PARTIAL}
|
|
63
|
+
node bin/byan-v2-cli.js elo declare {domain} {junior|mid|senior|lead|expert}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Menu BYAN
|
|
67
|
+
|
|
68
|
+
Dans l'agent BYAN, tapez `[ELO]` pour acceder au sous-menu ELO:
|
|
69
|
+
- Dashboard par domaine
|
|
70
|
+
- Enregistrer un claim
|
|
71
|
+
- Declarer son expertise
|
|
72
|
+
- Voir le routage LLM recommande
|
|
73
|
+
|
|
74
|
+
## Philosophie
|
|
75
|
+
|
|
76
|
+
Le score ELO n'est pas une punition — c'est un outil de calibration.
|
|
77
|
+
Un score bas signifie "BYAN va t'expliquer plus, pas moins".
|
|
78
|
+
La pedagogie s'adapte au niveau, le ton reste constant: bienveillant et curieux.
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# Fact-Check Protocol — Demonstrable, Quantifiable, Reproducible
|
|
2
|
+
|
|
3
|
+
## Principe fondateur
|
|
4
|
+
|
|
5
|
+
Tout claim emis par un agent BYAN doit satisfaire les trois criteres :
|
|
6
|
+
|
|
7
|
+
| Critere | Definition | Exemple |
|
|
8
|
+
|---------|-----------|---------|
|
|
9
|
+
| **Demonstrable** | Source primaire verifiable | RFC 7234, redis.io/benchmarks |
|
|
10
|
+
| **Quantifiable** | Precis, pas vague | "Redis > 100k ops/sec" pas "Redis est rapide" |
|
|
11
|
+
| **Reproductible** | L'utilisateur peut le tester | `redis-benchmark -n 100000` |
|
|
12
|
+
|
|
13
|
+
Un claim sans ces trois criteres = opinion ou hypothese, presente comme tel.
|
|
14
|
+
|
|
15
|
+
## Les 4 types d'assertions
|
|
16
|
+
|
|
17
|
+
Tout output d'agent BYAN est prefixe par son type :
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
[REASONING] Deduction logique — pas de garantie de verite
|
|
21
|
+
[HYPOTHESIS] Plausible dans ce contexte — a verifier avant action
|
|
22
|
+
[CLAIM L{n}] Assertion sourced — niveau n (1-5)
|
|
23
|
+
[FACT USER-VERIFIED date] Valide par l'utilisateur avec artefact
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Les 5 niveaux de preuve
|
|
27
|
+
|
|
28
|
+
| Niveau | Score | Sources |
|
|
29
|
+
|--------|-------|---------|
|
|
30
|
+
| LEVEL-1 | 95% | RFC, W3C, ECMAScript, POSIX, spec officielle |
|
|
31
|
+
| LEVEL-2 | 80% | Benchmark executable, CVE reference, docs produit officielles |
|
|
32
|
+
| LEVEL-3 | 65% | Article peer-reviewed, livre technique reconnu |
|
|
33
|
+
| LEVEL-4 | 50% | Consensus communaute (StackOverflow > 1000 votes) |
|
|
34
|
+
| LEVEL-5 | 20% | Opinion / experience personnelle |
|
|
35
|
+
|
|
36
|
+
## Domaines stricts
|
|
37
|
+
|
|
38
|
+
| Domaine | Niveau minimum | Sous le seuil |
|
|
39
|
+
|---------|---------------|---------------|
|
|
40
|
+
| security | LEVEL-2 | BLOCKED — CVE ou benchmark requis |
|
|
41
|
+
| performance | LEVEL-2 | BLOCKED — profiler output ou benchmark requis |
|
|
42
|
+
| compliance | LEVEL-1 | BLOCKED — texte reglementaire requis |
|
|
43
|
+
|
|
44
|
+
## Bloc FACT-CHECK standard
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
┌─ FACT-CHECK ──────────────────────────────────────────────────┐
|
|
48
|
+
│ Claim : [assertion mot pour mot] │
|
|
49
|
+
│ Domain : [security | performance | javascript | general] │
|
|
50
|
+
│ Verdict : [BLOCKED | CLAIM L1 | CLAIM L2 | CLAIM L3 │
|
|
51
|
+
│ | HYPOTHESIS | REASONING | UNVERIFIED] │
|
|
52
|
+
│ Source : [nom exact depuis _byan/knowledge/sources.md │
|
|
53
|
+
│ ou "aucune — preuve requise: [type exact]"] │
|
|
54
|
+
│ Confiance : [score % selon niveau] │
|
|
55
|
+
│ Challenge : [question manquante — source? reproductible?] │
|
|
56
|
+
└───────────────────────────────────────────────────────────────┘
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Trust Score (audit de document)
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
Trust Score = (assertions CLAIM + FACT) / total × 100
|
|
63
|
+
Badge : A ≥ 90% | B ≥ 75% | C ≥ 60% | D ≥ 40% | F < 40%
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Regles invariantes
|
|
67
|
+
|
|
68
|
+
- NEVER generate a URL — cite only sources in `_byan/knowledge/sources.md` or user-provided
|
|
69
|
+
- ZERO TRUST ON SELF — training data = starting point, not the source
|
|
70
|
+
- TONE INVARIANT — always curious, never accusatory
|
|
71
|
+
- CHAIN WARNING — chain > 3 steps → compute multiplicative confidence; if < 60%, warn
|
|
72
|
+
|
|
73
|
+
## Commandes CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
node bin/byan-v2-cli.js fc check "Redis is always faster than PostgreSQL"
|
|
77
|
+
node bin/byan-v2-cli.js fc parse "This is obviously the best approach"
|
|
78
|
+
node bin/byan-v2-cli.js fc verify "claim text" "proof artifact"
|
|
79
|
+
node bin/byan-v2-cli.js fc graph
|
|
80
|
+
node bin/byan-v2-cli.js fc sheet [session-id]
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Agent dédié
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
@fact-checker # Agent Copilot CLI dédié
|
|
87
|
+
[FC] # Sous-menu dans l'agent @byan
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Worker npm
|
|
91
|
+
|
|
92
|
+
```javascript
|
|
93
|
+
const FactCheckWorker = require('./_byan/workers/fact-check-worker');
|
|
94
|
+
const fc = new FactCheckWorker({ verbose: true });
|
|
95
|
+
|
|
96
|
+
const result = fc.check("Redis is always faster than PostgreSQL");
|
|
97
|
+
// → { assertionType: 'HYPOTHESIS', level: 5, score: 20, status: 'OPINION' }
|
|
98
|
+
|
|
99
|
+
const claims = fc.parse("This is obviously the best approach for security");
|
|
100
|
+
// → [{ matched: 'obviously', ... }]
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Auto-detection patterns
|
|
104
|
+
|
|
105
|
+
Declencheurs automatiques (patterns BYAN) :
|
|
106
|
+
- Mots absolus : `toujours, jamais, forcement, always, never, obviously`
|
|
107
|
+
- Superlatifs : `plus rapide, mieux, optimal, faster, better, superior`
|
|
108
|
+
- Best-practices non sourcees : `bonne pratique, best practice, industry standard`
|
|
109
|
+
- Affirmations certaines : `il est clair que, prouve que, it is well known that`
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fact-Check Worker
|
|
3
|
+
*
|
|
4
|
+
* Wraps the BYAN FactChecker for easy integration as a reusable worker.
|
|
5
|
+
* Provides claim verification, auto-detection of implicit claims, and
|
|
6
|
+
* fact sheet generation — all following the "demonstrable, quantifiable,
|
|
7
|
+
* reproducible" principle.
|
|
8
|
+
*
|
|
9
|
+
* Install path: _byan/workers/fact-check-worker.js
|
|
10
|
+
* Source: src/byan-v2/fact-check/index.js
|
|
11
|
+
*
|
|
12
|
+
* @module workers/fact-check-worker
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
const path = require('path');
|
|
16
|
+
|
|
17
|
+
class FactCheckWorker {
  /**
   * Create a worker around a FactChecker instance.
   *
   * @param {object} [config] - Optional overrides. Recognised keys: enabled,
   *   mode, min_level, strict_domains, output_fact_sheet, fact_sheet_path,
   *   graph_path, verbose. Each one falls back to the default shown below.
   */
  constructor(config = {}) {
    // NOTE(review): this path is resolved relative to this file's directory.
    // Confirm it still reaches the fact-check module once the file has been
    // copied to its install location.
    const FactChecker = require(path.join(__dirname, '../../../src/byan-v2/fact-check/index'));
    this.checker = new FactChecker({
      enabled: config.enabled !== false,
      mode: config.mode || 'offline',
      min_level: config.min_level ?? 3,
      strict_domains: config.strict_domains || ['security', 'performance', 'compliance'],
      output_fact_sheet: config.output_fact_sheet !== false,
      fact_sheet_path: config.fact_sheet_path || '_byan-output/fact-sheets',
      graph_path: config.graph_path || '_byan/_memory/fact-graph.json'
    });
    this.verbose = config.verbose || false;
  }

  /**
   * Check a single claim by delegating to the wrapped checker.
   * When verbose, logs the claim and the resulting type/level/score.
   *
   * @param {string} claim - The assertion to evaluate
   * @param {object} opts - { domain, level, source, proof }
   * @returns {object} Whatever the checker returns
   *   ({ assertionType, level, score, status, message } per the original docs)
   */
  check(claim, opts = {}) {
    if (this.verbose) console.log(`[FC] Checking claim: "${claim}"`);
    const result = this.checker.check(claim, opts);
    if (this.verbose) console.log(`[FC] → ${result.assertionType} L${result.level} (${result.score}%)`);
    return result;
  }

  /**
   * Scan text for implicit claims by delegating to the wrapped checker.
   *
   * @param {string} text
   * @returns {Array} Detected claims as returned by the checker
   *   ({ pattern, matched, position, excerpt } per the original docs)
   */
  parse(text) {
    const found = this.checker.parse(text);
    if (this.verbose) console.log(`[FC] Detected ${found.length} implicit claim(s)`);
    return found;
  }

  /**
   * Mark a claim as user-verified with a proof artifact, via the checker.
   *
   * @param {string} claim - The assertion
   * @param {string} proof - Proof artifact (command output, URL, benchmark result)
   * @returns {{ id, status, claim }} The checker's verification record
   */
  verify(claim, proof) {
    const result = this.checker.verify(claim, proof);
    if (this.verbose) console.log(`[FC] Verified: ${result.id}`);
    return result;
  }

  /**
   * Generate a fact sheet for a session from the facts stored in the graph.
   *
   * Facts are grouped by status into the same four buckets that
   * SessionState uses (verified / claims / disputed / opinions). All four
   * buckets are always present, and HYPOTHESIS is grouped with OPINION so
   * both code paths classify a fact identically.
   *
   * @param {string} sessionId - Defaults to today's date (YYYY-MM-DD, UTC)
   * @returns {{ content, path }} Result of checker.generateFactSheet
   */
  sheet(sessionId = new Date().toISOString().slice(0, 10)) {
    const facts = { verified: [], claims: [], disputed: [], opinions: [] };
    // A graph without a facts array is treated as empty instead of throwing.
    const persisted = this.checker.graph.load()?.facts ?? [];

    for (const fact of persisted) {
      const bucket = fact.status === 'VERIFIED' ? 'verified'
        : fact.status === 'DISPUTED' ? 'disputed'
        : fact.status === 'OPINION' || fact.status === 'HYPOTHESIS' ? 'opinions'
        : 'claims';
      facts[bucket].push(fact);
    }

    return this.checker.generateFactSheet(sessionId, facts, true);
  }

  /**
   * Get all facts currently stored in the knowledge graph.
   *
   * @returns {Array} The graph's facts, or an empty array when none are stored
   */
  getGraph() {
    return this.checker.graph.load()?.facts ?? [];
  }
}
|
|
101
|
+
|
|
102
|
+
module.exports = FactCheckWorker;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "create-byan-agent",
|
|
3
|
-
"version": "2.
|
|
4
|
-
"description": "BYAN v2.
|
|
3
|
+
"version": "2.6.0",
|
|
4
|
+
"description": "BYAN v2.6.0 - Intelligent AI agent creator with ELO trust system + scientific fact-check + Hermes universal dispatcher. Multi-platform (Copilot CLI, Claude Code, Codex). Merise Agile + TDD + 64 Mantras. ~54% LLM cost savings.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"create-byan-agent": "./install/bin/create-byan-agent-v2.js",
|
|
@@ -9,6 +9,7 @@ class SessionState {
|
|
|
9
9
|
this.analysisResults = {};
|
|
10
10
|
this.agentProfileDraft = {};
|
|
11
11
|
this.context = {};
|
|
12
|
+
this.facts = { verified: [], claims: [], disputed: [], opinions: [] };
|
|
12
13
|
}
|
|
13
14
|
|
|
14
15
|
addQuestion(question) {
|
|
@@ -26,6 +27,19 @@ class SessionState {
|
|
|
26
27
|
});
|
|
27
28
|
}
|
|
28
29
|
|
|
30
|
+
addFact(fact) {
|
|
31
|
+
if (!this.facts.claims) this.facts = { verified: [], claims: [], disputed: [], opinions: [] };
|
|
32
|
+
const target = fact.status === 'VERIFIED' ? 'verified'
|
|
33
|
+
: fact.status === 'DISPUTED' ? 'disputed'
|
|
34
|
+
: fact.status === 'OPINION' || fact.status === 'HYPOTHESIS' ? 'opinions'
|
|
35
|
+
: 'claims';
|
|
36
|
+
this.facts[target].push({ ...fact, session: this.sessionId, created_at: Date.now() });
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
getFacts() {
|
|
40
|
+
return JSON.parse(JSON.stringify(this.facts));
|
|
41
|
+
}
|
|
42
|
+
|
|
29
43
|
setAnalysisResults(data) {
|
|
30
44
|
this.analysisResults = data;
|
|
31
45
|
}
|
|
@@ -78,7 +92,8 @@ class SessionState {
|
|
|
78
92
|
userResponses: this.userResponses,
|
|
79
93
|
analysisResults: this.analysisResults,
|
|
80
94
|
agentProfileDraft: this.agentProfileDraft,
|
|
81
|
-
context: this.context
|
|
95
|
+
context: this.context,
|
|
96
|
+
facts: this.facts
|
|
82
97
|
};
|
|
83
98
|
}
|
|
84
99
|
|
|
@@ -91,6 +106,7 @@ class SessionState {
|
|
|
91
106
|
state.analysisResults = data.analysisResults || {};
|
|
92
107
|
state.agentProfileDraft = data.agentProfileDraft || {};
|
|
93
108
|
state.context = data.context || {};
|
|
109
|
+
state.facts = data.facts || { verified: [], claims: [], disputed: [], opinions: [] };
|
|
94
110
|
return state;
|
|
95
111
|
}
|
|
96
112
|
}
|
|
@@ -13,9 +13,10 @@
|
|
|
13
13
|
const Logger = require('../observability/logger');
|
|
14
14
|
|
|
15
15
|
class FiveWhysAnalyzer {
|
|
16
|
-
constructor(sessionState, logger) {
|
|
16
|
+
constructor(sessionState, logger, factChecker = null) {
|
|
17
17
|
this.sessionState = sessionState;
|
|
18
18
|
this.logger = logger || new Logger({ logDir: 'logs', logFile: 'five-whys.log' });
|
|
19
|
+
this.factChecker = factChecker;
|
|
19
20
|
|
|
20
21
|
this.depth = 0;
|
|
21
22
|
this.maxDepth = 5;
|
|
@@ -123,9 +124,17 @@ class FiveWhysAnalyzer {
|
|
|
123
124
|
depth: this.depth,
|
|
124
125
|
question: `Why #${this.depth}`,
|
|
125
126
|
answer: answer.trim(),
|
|
126
|
-
timestamp: new Date().toISOString()
|
|
127
|
+
timestamp: new Date().toISOString(),
|
|
128
|
+
assertionType: 'HYPOTHESIS'
|
|
127
129
|
});
|
|
128
130
|
|
|
131
|
+
if (this.factChecker) {
|
|
132
|
+
const claims = this.factChecker.parse(answer);
|
|
133
|
+
if (claims.length > 0) {
|
|
134
|
+
this.logger.info('Fact-check triggered on WHY answer', { depth: this.depth, patterns: claims.map(c => c.matched) });
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
129
138
|
const rootCauseAnalysis = this._analyzeForRootCause(answer);
|
|
130
139
|
|
|
131
140
|
if (rootCauseAnalysis.isRootCause && this.depth >= 3) {
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* challenge-evaluator.js — Decides HOW to challenge a claim based on ELO context
|
|
3
|
+
*
|
|
4
|
+
* Does NOT decide if a claim is true or false (that's the LLM's job).
|
|
5
|
+
* Returns challenge configuration: scaffold level, style, flags, LLM instructions.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const config = require('./domain-config');
|
|
9
|
+
|
|
10
|
+
class ChallengeEvaluator {
  /**
   * Build the challenge context for a given domain/rating.
   * Called BEFORE the LLM evaluates the claim.
   *
   * @param {string} domain
   * @param {object} [domainProfile] - from EloStore.getDomain(); a missing
   *   profile is treated as an empty one, so configured defaults apply
   * @returns {object} Challenge context: rating, rd, scaffold data, style,
   *   flags (shouldSoftChallenge, firstBlood, inDeadZone, tiltDetected) and
   *   the assembled `promptInstructions` string
   */
  evaluateContext(domain, domainProfile) {
    // Tolerate a null/undefined profile instead of throwing on property access.
    const profile = domainProfile ?? {};
    const rating = profile.rating ?? config.INITIAL_RATING;
    const scaffold = config.getScaffoldLevel(rating);
    const style = config.getChallengeStyle(rating);
    const inDeadZone = config.isInDeadZone(rating);
    const firstBlood = !profile.first_claim_made;
    const tiltDetected = (profile.blocked_streak ?? 0) >= config.TILT_THRESHOLD;
    const shouldSoftChallenge = rating < 500 || inDeadZone;

    return {
      domain,
      rating,
      rd: profile.rd ?? config.INITIAL_RD,
      scaffoldLevel: scaffold.level,
      scaffoldIncludes: scaffold.includes,
      challengeStyle: style,
      shouldSoftChallenge,
      firstBlood,
      inDeadZone,
      tiltDetected,
      promptInstructions: this._buildPromptInstructions({
        scaffoldLevel: scaffold.level,
        scaffoldIncludes: scaffold.includes,
        style,
        shouldSoftChallenge,
        firstBlood,
        inDeadZone,
        tiltDetected,
        domain,
        rating,
        tiltThreshold: config.TILT_THRESHOLD
      })
    };
  }

  /**
   * Map a raw LLM evaluation outcome to a canonical result.
   *
   * Matching ignores case and surrounding whitespace. Canonical names are
   * accepted as-is, so the function is idempotent. Anything unrecognised,
   * including non-string input, maps to 'SOFT_CHALLENGED'.
   *
   * @param {'validated'|'blocked'|'partial'|'soft'} outcome
   * @returns {'VALIDATED'|'BLOCKED'|'PARTIALLY_VALID'|'SOFT_CHALLENGED'}
   */
  normalizeResult(outcome) {
    if (typeof outcome !== 'string') return 'SOFT_CHALLENGED';

    // A Map avoids resolving inherited object keys such as "constructor".
    const canonical = new Map([
      ['validated', 'VALIDATED'],
      ['blocked', 'BLOCKED'],
      ['partial', 'PARTIALLY_VALID'],
      ['partially_valid', 'PARTIALLY_VALID'],
      ['soft', 'SOFT_CHALLENGED'],
      ['soft_challenged', 'SOFT_CHALLENGED']
    ]);
    return canonical.get(outcome.trim().toLowerCase()) ?? 'SOFT_CHALLENGED';
  }

  /**
   * Convert a canonical result to a numeric Glicko score.
   * @param {'VALIDATED'|'BLOCKED'|'PARTIALLY_VALID'|'SOFT_CHALLENGED'} result
   * @returns {number} 0 | 0.5 | 1
   */
  resultToScore(result) {
    switch (result) {
      case 'VALIDATED': return 1;
      case 'PARTIALLY_VALID': return 0.5;
      case 'BLOCKED': return 0;
      default: return 0.5; // neutral for soft challenges
    }
  }

  // --- Private ---

  /**
   * Assemble the newline-separated instruction block handed to the LLM.
   *
   * Flag lines (tilt, first blood, soft challenge, dead zone) come first,
   * then style and scaffold, an optional tone line, and the tone invariant.
   *
   * @param {object} ctx - Flags and values computed by evaluateContext
   * @param {number} [ctx.tiltThreshold] - Streak length quoted in the tilt
   *   line; defaults to the configured TILT_THRESHOLD
   * @returns {string}
   */
  _buildPromptInstructions({ scaffoldLevel, scaffoldIncludes, style, shouldSoftChallenge,
                             firstBlood, inDeadZone, tiltDetected, domain, rating,
                             tiltThreshold = config.TILT_THRESHOLD }) {
    const lines = [];

    if (tiltDetected) {
      // Quote the configured threshold so the message tracks the config value.
      lines.push(`TILT_DETECTED: User has ${tiltThreshold}+ consecutive BLOCKED in ${domain}. Use maximum empathy. Suggest a pause before continuing.`);
    }

    if (firstBlood) {
      lines.push(`FIRST_BLOOD: This is the user's first claim in ${domain}. Always challenge regardless of global ELO. Zero Trust per domain.`);
    }

    if (shouldSoftChallenge) {
      lines.push(`SOFT_CHALLENGE: Do not BLOCK immediately. Ask "what led you to this conclusion?" first. Only BLOCK if the explanation reveals a real gap.`);
    }

    if (inDeadZone) {
      lines.push(`DEAD_ZONE: User is at ${rating} (450–550 Dunning-Kruger peak). Maximum nuance required. Challenge precisely.`);
    }

    lines.push(`CHALLENGE_STYLE: ${style.toUpperCase()}`);
    lines.push(`SCAFFOLD_LEVEL: ${scaffoldLevel} — include: [${scaffoldIncludes.join(', ')}]`);

    if (style === 'guide') {
      lines.push('TONE: Encourage and simplify. User is significantly below benchmark.');
    } else if (style === 'learner') {
      lines.push('TONE: Adopt learner mode. User may exceed your benchmark — ask questions.');
    } else if (style === 'peer') {
      lines.push('TONE: Peer-to-peer. Be technically precise and nuanced.');
    }

    lines.push('INVARIANT: Tone is always curious, never accusatory. No "That\'s wrong." Use "What led you to think...?" or "You\'re on the right track — the nuance is..."');

    return lines.join('\n');
  }
}
|
|
120
|
+
|
|
121
|
+
module.exports = ChallengeEvaluator;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* domain-config.js — Static configuration for the ELO Trust System
|
|
3
|
+
*
|
|
4
|
+
* All thresholds, multipliers and routing rules live here.
|
|
5
|
+
* Pure data — no side effects.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// K-factor multipliers per domain.
// Base K = 32. Final K = round(BASE_K × multiplier).
const BASE_K = 32;

const K_FACTOR_MULTIPLIERS = {
  security:    1.5,  // high stakes — errors are costly
  compliance:  1.5,
  performance: 1.2,
  general:     1.0,  // neutral reference, also the fallback for unknown domains
  javascript:  1.0,
  typescript:  1.0,
  nodejs:      1.0,
  algorithms:  0.8,  // fundamentals are stable — mistakes are normal
  cryptography:1.2,
  devops:      1.0,
  react:       1.0
};

// Dunning-Kruger dead zone: maximum challenge intensity here (bounds inclusive)
const DEAD_ZONE = { min: 450, max: 550 };

// LLM routing thresholds (max rating across active domains)
const LLM_ROUTING = [
  { maxRating: 200,      model: 'claude-opus-4.5',   label: 'Apprenti'  },
  { maxRating: 600,      model: 'claude-sonnet-4.5', label: 'Praticien' },
  { maxRating: Infinity, model: 'claude-haiku-4.5',  label: 'Expert'    }
];

// Scaffold levels: how much help is provided with a challenge.
// Entries are ordered by ascending maxRating; the first match wins.
const SCAFFOLD_LEVELS = [
  { maxRating: 200,      level: 3, label: 'full',     includes: ['challenge', 'hint', 'analogy', 'concept_link'] },
  { maxRating: 500,      level: 2, label: 'guided',   includes: ['challenge', 'hint'] },
  { maxRating: 700,      level: 1, label: 'standard', includes: ['challenge'] },
  { maxRating: Infinity, level: 0, label: 'adversarial', includes: ['challenge'] }
];

// Challenge style based on ELO gap (user rating - domain baseline 500).
// Each range is half-open: minGap <= gap < maxGap.
const CHALLENGE_STYLES = [
  { minGap: -Infinity, maxGap: -400, style: 'guide'    }, // beginner vs expert
  { minGap: -400,      maxGap: -100, style: 'standard' },
  { minGap: -100,      maxGap:  100, style: 'peer'     },
  { minGap:  100,      maxGap: Infinity, style: 'learner' } // user exceeds BYAN benchmark
];

// The 6 root causes of a BLOCKED claim
const BLOCKED_REASONS = {
  TERMINOLOGY_GAP:    'terminology_gap',
  PREREQUISITE_GAP:   'prerequisite_gap',
  CONTEXT_MISMATCH:   'context_mismatch',
  OUTDATED_KNOWLEDGE: 'outdated_knowledge',
  LAZY_CLAIM:         'lazy_claim',
  OVERCONFIDENCE:     'overconfidence'
};

// Adapted label for BLOCKED result based on user rating
const BLOCKED_LABELS = [
  { maxRating: 300,      label: "Moment d'apprentissage" },
  { maxRating: 600,      label: 'Point de precision' },
  { maxRating: Infinity, label: 'Claim non valide' }
];

// Tilt threshold: consecutive BLOCKs in same domain triggering soft intervention
const TILT_THRESHOLD = 3;

// Intervention mode threshold: total rating at 0 across N sessions
const INTERVENTION_RATING = 0;

// Initial rating for a new domain
const INITIAL_RATING = 0;
const INITIAL_RD = 200; // high uncertainty at start

// Provisional ELO when user self-declares expertise level.
// NOTE(review): the CLI documentation lists junior|mid|senior|lead|expert as
// accepted levels — confirm how those names map onto these keys.
const DECLARED_EXPERTISE_RATINGS = {
  beginner:     100,
  intermediate: 400,
  advanced:     650,
  expert:       800,
  principal:    900
};

/**
 * Compute the K-factor for a domain.
 *
 * Only the table's own keys count as known domains. Inherited object keys
 * such as "constructor" or "toString" fall back to the `general`
 * multiplier instead of yielding NaN.
 *
 * @param {string} domain
 * @returns {number} BASE_K × multiplier, rounded to the nearest integer
 */
function getKFactor(domain) {
  const isKnown = Object.prototype.hasOwnProperty.call(K_FACTOR_MULTIPLIERS, domain);
  const multiplier = (isKnown ? K_FACTOR_MULTIPLIERS[domain] : undefined)
    ?? K_FACTOR_MULTIPLIERS.general;
  return Math.round(BASE_K * multiplier);
}

/**
 * Pick the scaffold entry for a rating: the first entry whose maxRating is
 * >= rating, or the last entry when none matches.
 *
 * @param {number} rating
 * @returns {{ maxRating: number, level: number, label: string, includes: string[] }}
 */
function getScaffoldLevel(rating) {
  return SCAFFOLD_LEVELS.find(s => rating <= s.maxRating) ?? SCAFFOLD_LEVELS[SCAFFOLD_LEVELS.length - 1];
}

/**
 * Pick the challenge style from the gap between the user's rating and the
 * baseline. Falls back to 'standard' when no range matches.
 *
 * @param {number} rating
 * @param {number} [byanBaseline=500]
 * @returns {string} 'guide' | 'standard' | 'peer' | 'learner'
 */
function getChallengeStyle(rating, byanBaseline = 500) {
  const gap = rating - byanBaseline;
  return CHALLENGE_STYLES.find(s => gap >= s.minGap && gap < s.maxGap)?.style ?? 'standard';
}

/**
 * Pick the user-facing label for a BLOCKED result based on rating.
 *
 * @param {number} rating
 * @returns {string}
 */
function getBlockedLabel(rating) {
  return BLOCKED_LABELS.find(b => rating <= b.maxRating)?.label ?? 'Claim non valide';
}

/**
 * Tell whether a rating lies inside the dead zone (both bounds inclusive).
 *
 * @param {number} rating
 * @returns {boolean}
 */
function isInDeadZone(rating) {
  return rating >= DEAD_ZONE.min && rating <= DEAD_ZONE.max;
}
|
|
109
|
+
|
|
110
|
+
module.exports = {
|
|
111
|
+
BASE_K,
|
|
112
|
+
K_FACTOR_MULTIPLIERS,
|
|
113
|
+
DEAD_ZONE,
|
|
114
|
+
LLM_ROUTING,
|
|
115
|
+
SCAFFOLD_LEVELS,
|
|
116
|
+
CHALLENGE_STYLES,
|
|
117
|
+
BLOCKED_REASONS,
|
|
118
|
+
BLOCKED_LABELS,
|
|
119
|
+
TILT_THRESHOLD,
|
|
120
|
+
INTERVENTION_RATING,
|
|
121
|
+
INITIAL_RATING,
|
|
122
|
+
INITIAL_RD,
|
|
123
|
+
DECLARED_EXPERTISE_RATINGS,
|
|
124
|
+
getKFactor,
|
|
125
|
+
getScaffoldLevel,
|
|
126
|
+
getChallengeStyle,
|
|
127
|
+
getBlockedLabel,
|
|
128
|
+
isInDeadZone
|
|
129
|
+
};
|