akm-cli 0.9.0-beta.54 → 0.9.0-beta.55
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +5 -3
- package/dist/commands/agent/contribute-cli.js +2 -3
- package/dist/commands/env/env-cli.js +187 -202
- package/dist/commands/env/secret-cli.js +109 -121
- package/dist/commands/feedback-cli.js +152 -155
- package/dist/commands/health/advisories.js +151 -0
- package/dist/commands/health/improve-metrics.js +754 -0
- package/dist/commands/health/llm-usage.js +65 -0
- package/dist/commands/health/md-report.js +103 -0
- package/dist/commands/health/metrics.js +278 -0
- package/dist/commands/health/task-runs.js +135 -0
- package/dist/commands/health/types.js +18 -0
- package/dist/commands/health/windows.js +196 -0
- package/dist/commands/health.js +14 -1624
- package/dist/commands/improve/anti-collapse.js +170 -0
- package/dist/commands/improve/collapse-detector.js +3 -2
- package/dist/commands/improve/consolidate.js +636 -633
- package/dist/commands/improve/dedup.js +1 -1
- package/dist/commands/improve/distill/content-repair.js +202 -0
- package/dist/commands/improve/distill/promote-memory.js +228 -0
- package/dist/commands/improve/distill/quality-gate.js +233 -0
- package/dist/commands/improve/distill-guards.js +127 -0
- package/dist/commands/improve/distill.js +49 -575
- package/dist/commands/improve/extract-cli.js +74 -76
- package/dist/commands/improve/extract.js +6 -4
- package/dist/commands/improve/hot-probation.js +45 -0
- package/dist/commands/improve/improve-auto-accept.js +3 -2
- package/dist/commands/improve/improve-cli.js +14 -13
- package/dist/commands/improve/improve-result-file.js +2 -1
- package/dist/commands/improve/improve.js +6 -5
- package/dist/commands/improve/loop-stages.js +19 -21
- package/dist/commands/improve/preparation.js +4 -2
- package/dist/commands/improve/procedural.js +10 -31
- package/dist/commands/improve/recombine.js +19 -43
- package/dist/commands/improve/reflect.js +1 -1
- package/dist/commands/improve/schema-similarity-gate.js +168 -0
- package/dist/commands/improve/shared.js +48 -0
- package/dist/commands/observability-cli.js +4 -4
- package/dist/commands/proposal/drain-policies.js +2 -2
- package/dist/commands/proposal/drain.js +1 -1
- package/dist/commands/proposal/legacy-import.js +115 -0
- package/dist/commands/proposal/proposal-cli.js +3 -3
- package/dist/commands/proposal/proposal.js +2 -1
- package/dist/commands/proposal/propose.js +1 -1
- package/dist/commands/proposal/repository.js +829 -0
- package/dist/commands/proposal/validators/proposals.js +5 -920
- package/dist/commands/read/remember-cli.js +132 -137
- package/dist/commands/read/search-cli.js +1 -1
- package/dist/commands/registry-cli.js +76 -87
- package/dist/commands/sources/add-cli.js +90 -94
- package/dist/commands/sources/history.js +1 -1
- package/dist/commands/sources/schema-repair.js +1 -1
- package/dist/commands/sources/sources-cli.js +3 -3
- package/dist/commands/sources/stash-cli.js +1 -1
- package/dist/commands/tasks/tasks-cli.js +1 -2
- package/dist/commands/wiki-cli.js +2 -3
- package/dist/core/common.js +3 -3
- package/dist/core/config/config-schema.js +6 -0
- package/dist/core/deep-merge.js +38 -0
- package/dist/core/events.js +2 -1
- package/dist/core/logs-db.js +8 -13
- package/dist/core/paths.js +14 -14
- package/dist/core/state-db.js +13 -1140
- package/dist/indexer/db/db.js +66 -709
- package/dist/indexer/db/entry-mapper.js +41 -0
- package/dist/indexer/db/schema.js +516 -0
- package/dist/indexer/feedback/utility-policy.js +85 -0
- package/dist/indexer/graph/graph-extraction.js +2 -1
- package/dist/indexer/index-writer-lock.js +9 -0
- package/dist/indexer/indexer.js +78 -23
- package/dist/indexer/search/fts-query.js +51 -0
- package/dist/integrations/agent/spawn.js +15 -66
- package/dist/output/text/helpers.js +13 -0
- package/dist/scripts/migrate-storage.js +6891 -7436
- package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +44 -43
- package/dist/setup/legacy-config.js +106 -0
- package/dist/setup/prompt.js +57 -0
- package/dist/setup/providers.js +14 -0
- package/dist/setup/semantic-assets.js +124 -0
- package/dist/setup/setup.js +24 -1607
- package/dist/setup/steps/connection.js +734 -0
- package/dist/setup/steps/output.js +31 -0
- package/dist/setup/steps/platforms.js +124 -0
- package/dist/setup/steps/semantic.js +27 -0
- package/dist/setup/steps/sources.js +222 -0
- package/dist/setup/steps/stashdir.js +42 -0
- package/dist/setup/steps/tasks.js +152 -0
- package/dist/storage/repositories/canaries-repository.js +107 -0
- package/dist/storage/repositories/consolidation-repository.js +38 -0
- package/dist/storage/repositories/embeddings-repository.js +72 -0
- package/dist/storage/repositories/events-repository.js +187 -0
- package/dist/storage/repositories/extract-sessions-repository.js +96 -0
- package/dist/storage/repositories/improve-runs-repository.js +130 -0
- package/dist/storage/repositories/index-db.js +4 -7
- package/dist/storage/repositories/proposals-repository.js +220 -0
- package/dist/storage/repositories/recombine-repository.js +213 -0
- package/dist/storage/repositories/task-history-repository.js +93 -0
- package/dist/storage/sqlite-pragmas.js +3 -3
- package/dist/tasks/runner.js +2 -1
- package/package.json +1 -1
- package/dist/commands/improve/homeostatic.js +0 -497
|
@@ -1,497 +0,0 @@
|
|
|
1
|
-
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
2
|
-
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
|
-
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
4
|
-
/**
|
|
5
|
-
* WS-3b Step 0 — Intake + homeostatic tier.
|
|
6
|
-
*
|
|
7
|
-
* Sub-features (0b is default-ON for extract since R3; the rest default-OFF):
|
|
8
|
-
*
|
|
9
|
-
* (The former **0a homeostatic demotion** pass was removed (R4,
|
|
10
|
-
* docs/design/improve-self-learning-analysis.md G3): it was default-off and
|
|
11
|
-
* self-undoing — the next `upsertAssetSalience` recompute unconditionally
|
|
12
|
-
* overwrote the demoted values. SHY-style continuous downscaling now lives in
|
|
13
|
-
* `computeSalience`'s always-applied recency decay, whose 0.1 floor itself
|
|
14
|
-
* decays on a long half-life so unreviewed-forever assets keep drifting down.)
|
|
15
|
-
*
|
|
16
|
-
* **0b Schema-similarity gate**
|
|
17
|
-
* At intake, if a new candidate's body embedding is within ε of an existing
|
|
18
|
-
* derived-layer lesson/knowledge node, mark `schema-consistent` and lower
|
|
19
|
-
* its priority; only schema-inconsistent/contradicting candidates get full
|
|
20
|
-
* `encodingSalience`. One embedding lookup via body_embeddings cache; relieves
|
|
21
|
-
* dedup pressure before it accumulates.
|
|
22
|
-
*
|
|
23
|
-
* **0c Hot-probation intake buffer (#604)**
|
|
24
|
-
* New system-generated extractions enter `captureMode: hot-probation` and
|
|
25
|
-
* spend ONE consolidation cycle in probation before promotion to the main
|
|
26
|
-
* stash; dedup + quality second-pass runs against them. Stops noisy
|
|
27
|
-
* extractions from polluting the stash at the source. Reuses shared
|
|
28
|
-
* dedupHash + body_embeddings. Default OFF.
|
|
29
|
-
*
|
|
30
|
-
* **Anti-collapse guards (step 8)**
|
|
31
|
-
* (a) Generation counter: merged.generation = max(sources)+1; refuse merge
|
|
32
|
-
* of two assets both above generation N (default 2); merges cite sources.
|
|
33
|
-
* (b) Lexical-diversity check: low n-gram diversity ⇒ raise merge threshold.
|
|
34
|
-
* (c) Occasional random non-similar cluster in the pool.
|
|
35
|
-
*
|
|
36
|
-
* **CLS interleaving (step 9)**
|
|
37
|
-
* distill/memoryInference prompts include embedding-retrieved adjacent
|
|
38
|
-
* lessons/knowledge so the pipeline doesn't overwrite prior generalizations.
|
|
39
|
-
*
|
|
40
|
-
* **Distill→source fidelity (step 10)**
|
|
41
|
-
* After a distill proposal, check it against cited source memories; a
|
|
42
|
-
* contradiction flag routes to human review.
|
|
43
|
-
*
|
|
44
|
-
* @module homeostatic
|
|
45
|
-
*/
|
|
46
|
-
import { warn } from "../../core/warn.js";
|
|
47
|
-
import { closeDatabase, openExistingDatabase } from "../../indexer/db/db.js";
|
|
48
|
-
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
49
|
-
/**
 * Cosine-similarity threshold (ε) used by the schema-similarity gate when the
 * config does not supply one. Looser than dedup's 0.97.
 */
export const DEFAULT_SCHEMA_SIMILARITY_EPSILON = 0.85;

/**
 * Multiplier applied to the confidence of a schema-consistent candidate when
 * the config does not supply a penalty.
 */
export const DEFAULT_SCHEMA_CONFIDENCE_PENALTY = 0.5;

/** Generation depth above which the anti-collapse guard starts refusing merges. */
export const DEFAULT_MAX_GENERATION = 2;

/** Fraction of the pool to fill with random (non-similar) clusters by default. */
export const DEFAULT_RANDOM_CLUSTER_FRACTION = 0.05;

/** How many adjacent lessons/knowledge entries CLS interleaving uses by default. */
export const DEFAULT_CLS_ADJACENT_COUNT = 3;
|
|
59
|
-
/**
 * Decide whether a candidate body embedding is schema-consistent with the
 * existing derived layer, i.e. whether its cosine similarity to ANY existing
 * derived lesson/knowledge embedding reaches ε.
 *
 * Pure computation over the supplied vectors; no LLM call and no I/O.
 *
 * Stored embeddings whose dimension differs from the candidate's are skipped:
 * cosine similarity is only meaningful between vectors of the same dimension,
 * and comparing a truncated prefix would inflate the score and could produce a
 * spurious match.
 *
 * @param candidateEmbedding - Embedding vector for the candidate body.
 * @param existingDerivedEmbeddings - `{ ref, embedding }` entries for existing derived assets.
 * @param config - Gate config; reads `enabled` and the optional `epsilon`.
 * @returns `{ consistent: true, matchedRef, similarity }` for the best match
 *   when it reaches ε, otherwise `{ consistent: false }` (also when the gate is
 *   disabled or there is nothing to compare against).
 */
export function isSchemaConsistent(candidateEmbedding, existingDerivedEmbeddings, config) {
    if (!config.enabled || existingDerivedEmbeddings.length === 0) {
        return { consistent: false };
    }
    const epsilon = config.epsilon ?? DEFAULT_SCHEMA_SIMILARITY_EPSILON;
    const dim = candidateEmbedding.length;
    // The candidate's squared magnitude does not depend on the entry being
    // compared, so compute it once instead of once per stored embedding.
    let magA = 0;
    for (let i = 0; i < dim; i++) {
        const a = candidateEmbedding[i] ?? 0;
        magA += a * a;
    }
    let bestSim = -Infinity;
    let bestRef;
    for (const { ref, embedding } of existingDerivedEmbeddings) {
        // Skip missing or mixed-dimension vectors rather than comparing a prefix.
        if (!embedding || embedding.length !== dim) {
            continue;
        }
        // cosine similarity: dot(a,b) / (|a| * |b|)
        let dot = 0;
        let magB = 0;
        for (let i = 0; i < dim; i++) {
            const a = candidateEmbedding[i] ?? 0;
            const b = embedding[i] ?? 0;
            dot += a * b;
            magB += b * b;
        }
        // A zero vector has no direction; treat it as similarity 0.
        const sim = magA === 0 || magB === 0 ? 0 : dot / (Math.sqrt(magA) * Math.sqrt(magB));
        if (sim > bestSim) {
            bestSim = sim;
            bestRef = ref;
        }
    }
    if (bestSim >= epsilon) {
        return { consistent: true, matchedRef: bestRef, similarity: bestSim };
    }
    return { consistent: false };
}
|
|
103
|
-
/**
 * WS-3b Step-0b: run the schema-similarity intake gate for a single extract
 * candidate and report the confidence that should be used for it.
 *
 * Outcomes:
 *   - `ctx === null` or `ctx.derivedEmbeddings` empty → confidence unchanged,
 *     `embedText` is never called.
 *   - candidate type is neither "lesson" nor "knowledge" → confidence
 *     unchanged, `embedText` is never called.
 *   - candidate is schema-consistent → confidence (or 1.0 when the candidate
 *     carries no numeric confidence) multiplied by the configured penalty,
 *     `penalised: true`, plus a human-readable `warning`.
 *   - anything inside the gate throws → fail open: confidence unchanged,
 *     `penalised: false`, plus a `warning` carrying the error message.
 *
 * @param candidate - Extract candidate; reads `confidence`, `type`, `name`, `body`.
 * @param ctx - Gate context (`derivedEmbeddings`, `config`) or `null` when the gate is off.
 * @param embedText - Async function turning the candidate body into an embedding.
 * @returns `{ effectiveConfidence, penalised, warning? }`.
 */
export async function applySchemaSimilarityPenalty(candidate, ctx, embedText) {
    const baseConfidence = typeof candidate.confidence === "number" ? candidate.confidence : undefined;
    const untouched = { effectiveConfidence: baseConfidence, penalised: false };
    const gateInactive = ctx === null || ctx.derivedEmbeddings.length === 0;
    if (gateInactive || (candidate.type !== "lesson" && candidate.type !== "knowledge")) {
        return untouched;
    }
    try {
        const candidateVec = await embedText(candidate.body);
        const check = isSchemaConsistent(candidateVec, ctx.derivedEmbeddings, ctx.config);
        if (!check.consistent) {
            return untouched;
        }
        const penalty = ctx.config.confidencePenalty ?? DEFAULT_SCHEMA_CONFIDENCE_PENALTY;
        const warning = `[extract] schema-consistent candidate ${candidate.type}:${candidate.name} ` +
            `(sim=${check.similarity?.toFixed(3)} vs ${check.matchedRef}) — confidence penalised ×${penalty}`;
        return {
            effectiveConfidence: (baseConfidence ?? 1.0) * penalty,
            penalised: true,
            warning,
        };
    }
    catch (embedErr) {
        // Fail open: a gate failure must never abort extraction.
        const detail = embedErr instanceof Error ? embedErr.message : String(embedErr);
        return {
            ...untouched,
            warning: `[extract] schema-similarity embed failed for ${candidate.type}:${candidate.name} — skipping gate: ` +
                detail,
        };
    }
}
|
|
147
|
-
/**
 * Read the stored embeddings of every indexed derived-layer entry
 * (`entry_type` "lesson" or "knowledge") from index.db.
 *
 * Each result is `{ ref, embedding }` where `ref` is the row's `entry_key`
 * and `embedding` is a plain number array decoded from the BLOB as float32
 * values. Rows are dropped when the BLOB is missing, empty, not a multiple of
 * four bytes, or has a different dimension than the first usable row.
 *
 * FAIL-OPEN: any error is reported through `warn` and yields an empty array,
 * which callers treat as "gate inactive". The database handle is closed in
 * all cases; errors while closing are ignored.
 *
 * @param dbPath - Optional path override for index.db (for testing).
 * @returns The decoded embeddings, or `[]` when nothing usable is available.
 */
export function loadDerivedLayerEmbeddings(dbPath) {
    const BYTES_PER_FLOAT32 = 4;
    let db;
    try {
        db = openExistingDatabase(dbPath);
        const statement = db.prepare(`SELECT e.entry_key, emb.embedding
           FROM entries e
           JOIN embeddings emb ON emb.id = e.id
           WHERE e.entry_type IN ('lesson', 'knowledge')`);
        const rows = statement.all();
        const loaded = [];
        let expectedDim;
        for (const row of rows) {
            const blob = row.embedding;
            const usable = Boolean(blob) && blob.byteLength > 0 && blob.byteLength % BYTES_PER_FLOAT32 === 0;
            if (!usable) {
                continue;
            }
            const dim = blob.byteLength / BYTES_PER_FLOAT32;
            // The first usable row fixes the dimension every later row must share.
            expectedDim ??= dim;
            if (dim !== expectedDim) {
                continue;
            }
            // Copy into a fresh ArrayBuffer so the Float32Array view starts at an
            // aligned offset regardless of where the BLOB sits in its own buffer.
            const aligned = new ArrayBuffer(blob.byteLength);
            new Uint8Array(aligned).set(blob);
            loaded.push({ ref: row.entry_key, embedding: [...new Float32Array(aligned)] });
        }
        return loaded;
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        warn("[homeostatic] loadDerivedLayerEmbeddings: failed to load from index.db — gate inactive:", detail);
        return [];
    }
    finally {
        if (db) {
            try {
                closeDatabase(db);
            }
            catch {
                // ignore close errors
            }
        }
    }
}
|
|
207
|
-
/**
 * Extract the `generation` counter from an asset's frontmatter.
 *
 * @param frontmatterData - Parsed frontmatter object.
 * @returns The generation rounded down to an integer, or 0 when the field is
 *   absent or is not a finite, non-negative number (no generation metadata =
 *   original asset).
 */
export function readAssetGeneration(frontmatterData) {
    const { generation } = frontmatterData;
    const isUsable = typeof generation === "number" && Number.isFinite(generation) && generation >= 0;
    return isUsable ? Math.floor(generation) : 0;
}
|
|
218
|
-
/**
 * Derive the generation of a merged asset from its sources.
 * Rule: `merged.generation = max(source generations) + 1`.
 *
 * @param sourceGenerations - Generation values of the merge participants.
 * @returns One more than the highest source generation, or 1 when there are
 *   no sources.
 */
export function computeMergedGeneration(sourceGenerations) {
    if (sourceGenerations.length === 0) {
        return 1;
    }
    let highest = -Infinity;
    for (const generation of sourceGenerations) {
        highest = Math.max(highest, generation);
    }
    return highest + 1;
}
|
|
227
|
-
/**
 * Anti-collapse generation guard: decide whether a merge must be refused
 * because too many of its participants are already heavily consolidated.
 *
 * The guard is on by default (R5); only `config.enabled === false` turns it
 * off. A merge is refused when at least two participants have a generation
 * strictly greater than the configured maximum.
 *
 * @param sourceGenerations - Generation values for all merge participants.
 * @param config - Anti-collapse config; reads `enabled` and the optional `maxGeneration`.
 * @returns `{ refused: true, reason }` when the merge is refused, otherwise
 *   `{ refused: false }`.
 */
export function checkGenerationGuard(sourceGenerations, config) {
    const allowed = { refused: false };
    if (config.enabled === false) {
        return allowed;
    }
    const maxGen = config.maxGeneration ?? DEFAULT_MAX_GENERATION;
    let highGenCount = 0;
    for (const generation of sourceGenerations) {
        if (generation > maxGen) {
            highGenCount += 1;
        }
    }
    if (highGenCount < 2) {
        return allowed;
    }
    return {
        refused: true,
        reason: `Anti-collapse: ${highGenCount} merge participants have generation > ${maxGen} (${sourceGenerations.join(", ")}); refusing to merge over-consolidated assets.`,
    };
}
|
|
251
|
-
/** Default floor for distinct-token retention through a merge (R5 §4.2). */
export const DEFAULT_MIN_SPECIFICITY_RETENTION = 0.6;

/**
 * Collect the distinct lowercase, whitespace-delimited tokens of a text.
 * Uses the same lowercase/whitespace tokenization as `computeBigramDiversity`.
 *
 * @param text - Text to tokenize.
 * @returns A Set of the non-empty tokens.
 */
function distinctTokens(text) {
    const tokens = new Set();
    for (const piece of text.toLowerCase().split(/\s+/)) {
        // Leading/trailing whitespace yields empty pieces; they are not tokens.
        if (piece.length > 0) {
            tokens.add(piece);
        }
    }
    return tokens;
}
|
|
260
|
-
/**
 * Merge information floor (R5 §4.2): a merge must not lose information.
 *
 *   1. Provenance — `mergedSourceRefs` must contain every participant's own
 *      `ref` and every ref the participants already cited in `sourceRefs`.
 *   2. Specificity — the number of distinct tokens in `mergedBody`, divided by
 *      the number of distinct tokens across all participant bodies (clamped
 *      to at most 1), must reach the retention floor.
 *
 * Pure and deterministic. ADVISORY in v1: per the original notes the caller
 * counts violations rather than refusing the merge. Passes immediately when
 * the anti-collapse suite or the floor itself is opted out, or when there are
 * no participants.
 *
 * @param mergedBody - Body text of the merged asset.
 * @param mergedSourceRefs - `source_refs` of the merged asset.
 * @param participants - Merge participants, each with `ref`, `sourceRefs`, `body`.
 * @param config - Reads `enabled`, `mergeInformationFloor`, and the optional `minSpecificityRetention`.
 * @returns `{ passed, provenanceBefore, provenanceAfter, specificityRetention, reason? }`.
 */
export function checkMergeInformationFloor(mergedBody, mergedSourceRefs, participants, config) {
    const optedOut = config.enabled === false || config.mergeInformationFloor === false;
    if (optedOut || participants.length === 0) {
        return { passed: true, provenanceBefore: 0, provenanceAfter: 0, specificityRetention: 1 };
    }
    // Gather, in one pass, the refs the merge has to keep and the pooled
    // vocabulary of the participant bodies.
    const requiredRefs = new Set();
    const participantTokens = new Set();
    for (const participant of participants) {
        requiredRefs.add(participant.ref);
        for (const cited of participant.sourceRefs) {
            requiredRefs.add(cited);
        }
        for (const token of distinctTokens(participant.body)) {
            participantTokens.add(token);
        }
    }
    const mergedRefs = new Set(mergedSourceRefs);
    const missingRefs = [];
    for (const ref of requiredRefs) {
        if (!mergedRefs.has(ref)) {
            missingRefs.push(ref);
        }
    }
    const mergedTokenCount = distinctTokens(mergedBody).size;
    const rawRetention = participantTokens.size === 0 ? 1 : mergedTokenCount / participantTokens.size;
    // Clamp once, up front, so the decision, the reason text and the reported
    // field all refer to the same number.
    const specificityRetention = Math.min(1, rawRetention);
    const minRetention = config.minSpecificityRetention ?? DEFAULT_MIN_SPECIFICITY_RETENTION;
    const provenanceOk = missingRefs.length === 0;
    const specificityOk = specificityRetention >= minRetention;
    const reasons = [];
    if (!provenanceOk) {
        reasons.push(`provenance shrank: merged source_refs missing ${missingRefs.length} ref(s) (e.g. ${missingRefs[0]})`);
    }
    if (!specificityOk) {
        reasons.push(`specificity retention ${specificityRetention.toFixed(2)} < ${minRetention} (merge genericized/shortened)`);
    }
    const verdict = {
        passed: provenanceOk && specificityOk,
        provenanceBefore: requiredRefs.size,
        provenanceAfter: mergedRefs.size,
        specificityRetention,
    };
    if (reasons.length > 0) {
        verdict.reason = reasons.join("; ");
    }
    return verdict;
}
|
|
314
|
-
/**
 * Measure the bigram diversity of a text: the number of distinct adjacent
 * word pairs divided by the total number of adjacent word pairs, after
 * lowercasing and splitting on whitespace.
 *
 * Used by the lexical-diversity check to detect correlated-extraction
 * artifacts.
 *
 * @param text - Text to measure.
 * @returns A ratio in (0, 1] (1 = every bigram unique); 1 when the text has
 *   fewer than two words, since there are no bigrams to repeat.
 */
export function computeBigramDiversity(text) {
    const words = text.toLowerCase().split(/\s+/).filter((word) => word.length > 0);
    const bigramCount = words.length - 1;
    if (bigramCount < 1) {
        // too short to have bigrams; treat as diverse
        return 1;
    }
    // Pair every word (from the second on) with its predecessor; a tab cannot
    // occur inside a token, so it is an unambiguous separator.
    const bigrams = words.slice(1).map((word, index) => `${words[index]}\t${word}`);
    return new Set(bigrams).size / bigramCount;
}
|
|
333
|
-
/**
 * Flag clusters whose members have suspiciously low lexical diversity. Such a
 * cluster is likely a correlated-extraction artifact, and the merge threshold
 * should be raised for it.
 *
 * The check is on by default (R5); it is skipped only when `config.enabled`
 * or `config.lexicalDiversityCheck` is explicitly `false`. The score is the
 * mean bigram diversity of the bodies, compared against a fixed floor of 0.3.
 *
 * @param bodies - The stripped body texts of the cluster members.
 * @param config - Anti-collapse config.
 * @returns `{ lowDiversity: true, diversity }` when the mean is below 0.3;
 *   `{ lowDiversity: false }` otherwise (including for an empty cluster).
 */
export function checkLexicalDiversity(bodies, config) {
    const DIVERSITY_FLOOR = 0.3;
    const checkDisabled = config.enabled === false || config.lexicalDiversityCheck === false;
    if (checkDisabled || bodies.length === 0) {
        return { lowDiversity: false };
    }
    // Mean bigram diversity over every body in the cluster.
    let total = 0;
    for (const body of bodies) {
        total += computeBigramDiversity(body);
    }
    const diversity = total / bodies.length;
    return diversity < DIVERSITY_FLOOR ? { lowDiversity: true, diversity } : { lowDiversity: false };
}
|
|
358
|
-
/**
 * Render the CLS (Complementary Learning System) context section that gets
 * appended to distill/memoryInference prompts, so the LLM sees related
 * existing lessons/knowledge and does not overwrite prior generalizations.
 *
 * Each adjacent item becomes a `### <ref>` heading followed by its trimmed
 * content, cut to 400 characters to keep the prompt size reasonable.
 *
 * @param adjacentItems - Adjacent lessons/knowledge, each with `ref` and `content`.
 * @param config - CLS config; reads `enabled`.
 * @returns The markdown section, or an empty string when CLS is disabled or
 *   there are no adjacent items.
 */
export function buildClsContext(adjacentItems, config) {
    const MAX_CONTENT_CHARS = 400;
    if (!config.enabled || adjacentItems.length === 0) {
        return "";
    }
    const header = [
        "",
        "## Existing adjacent lessons / knowledge (CLS context)",
        "The following are semantically related entries already in the stash.",
        "Your proposal MUST NOT contradict or silently overwrite these — if you",
        "disagree with one, flag it as contradicted (do not ignore it).",
        "",
    ];
    const sections = adjacentItems.flatMap((item) => [
        `### ${item.ref}`,
        item.content.trim().slice(0, MAX_CONTENT_CHARS),
        "",
    ]);
    return [...header, ...sections].join("\n");
}
|
|
390
|
-
/**
 * Cheap distill→source fidelity check: look for a strong claim in the
 * proposal whose opposite polarity shows up in one of the cited sources.
 *
 * This is a heuristic, not an LLM call. Strong claims ("always"/"must" vs
 * "never"/"must not"/"should not") are extracted from the lowercased proposal;
 * each cited source is then scanned, in order, for the opposing polarity of
 * the same term. The first hit is reported so it can be routed to human
 * review. The heuristic is meant to be conservative: a missed contradiction
 * is preferred over flagging a valid proposal.
 *
 * Returns `{ contradictionDetected: false }` without doing any work when the
 * check is disabled or no source bodies are supplied.
 *
 * @param proposalBody - The stripped body of the distill proposal.
 * @param sourceBodies - The stripped bodies of the cited source memories.
 * @param config - Fidelity check config; reads `enabled`.
 * @returns `{ contradictionDetected: true, reason }` on the first conflict,
 *   otherwise `{ contradictionDetected: false }`.
 */
export function checkDistillFidelity(proposalBody, sourceBodies, config) {
    const noContradiction = { contradictionDetected: false };
    if (!config.enabled || sourceBodies.length === 0) {
        return noContradiction;
    }
    const strongClaims = extractStrongClaims(proposalBody.toLowerCase());
    if (strongClaims.length === 0) {
        return noContradiction;
    }
    for (const sourceBody of sourceBodies) {
        const sourceLow = sourceBody.toLowerCase();
        // First proposal claim this source opposes, if any.
        const conflict = strongClaims.find(({ polarity, term }) => {
            const oppositePolarity = polarity === "positive" ? "negative" : "positive";
            return hasStrongClaim(sourceLow, term, oppositePolarity);
        });
        if (conflict !== undefined) {
            return {
                contradictionDetected: true,
                reason: `Proposal makes a ${conflict.polarity} strong claim about "${conflict.term}" that conflicts with an opposing claim in a cited source. Route to human review.`,
            };
        }
    }
    return noContradiction;
}
|
|
441
|
-
/**
 * Extract strong claims from a (lowercased) text.
 *
 * A strong claim is a polarity marker followed by a word:
 *   - positive: "always <term>", "must <term>"
 *   - negative: "never <term>", "must not <term>", "should not <term>"
 *
 * "must not <term>" is a negative claim only. The positive pattern excludes a
 * "must" that is followed by "not", which would otherwise be recorded as a
 * bogus positive claim about the term "not". Terms of two characters or fewer
 * are ignored as too generic to compare.
 *
 * @param text - Text to scan; callers pass it already lowercased.
 * @returns `{ polarity, term }` entries: all positive claims in order of
 *   appearance, followed by all negative claims in order of appearance.
 */
function extractStrongClaims(text) {
    const patterns = [
        { polarity: "positive", re: /\b(?:always|must(?!\s+not\b))\s+(\w+)/g },
        { polarity: "negative", re: /\b(?:never|must\s+not|should\s+not)\s+(\w+)/g },
    ];
    const claims = [];
    for (const { polarity, re } of patterns) {
        // matchAll iterates over a clone of the regex, so no lastIndex state
        // is carried between calls.
        for (const match of text.matchAll(re)) {
            const term = match[1];
            if (term && term.length > 2) {
                claims.push({ polarity, term });
            }
        }
    }
    return claims;
}
|
|
460
|
-
/**
 * Report whether a (lowercased) text contains a strong-claim marker of the
 * given polarity and also mentions the given term somewhere.
 *
 * The marker and the term are not required to be adjacent; this is a loose
 * co-occurrence heuristic. A "must" that is followed by "not" is a
 * prohibition and does not count as a positive marker. Otherwise a source
 * saying "must not X" would be read as making a positive claim and would be
 * flagged as contradicting a proposal that also says "never X".
 *
 * @param text - Text to scan; callers pass it already lowercased.
 * @param term - Term the claim is about.
 * @param polarity - "positive" for always/must; any other value is treated as
 *   negative (never / must not / should not).
 * @returns `true` when both the marker and the term occur in the text.
 */
function hasStrongClaim(text, term, polarity) {
    const marker = polarity === "positive"
        ? /\b(?:always|must(?!\s+not\b))\s/
        : /\b(?:never|must\s+not|should\s+not)\s/;
    return marker.test(text) && text.includes(term);
}
|
|
466
|
-
// ── captureMode: hot-probation helpers ───────────────────────────────────────
|
|
467
|
-
/**
 * `captureMode` value carried by system-generated extractions that are still
 * in probation. Automatic counterpart to the user-explicit `captureMode: hot`.
 */
export const CAPTURE_MODE_HOT_PROBATION = "hot-probation";

/**
 * Tell whether a `captureMode` value marks an asset as hot-probation
 * (system-generated, not yet graduated from the intake dedup+quality pass).
 *
 * @param captureModeValue - The asset's `captureMode` value, if any.
 * @returns `true` only for the exact hot-probation value.
 */
export function isHotProbation(captureModeValue) {
    const inProbation = captureModeValue === CAPTURE_MODE_HOT_PROBATION;
    return inProbation;
}

/**
 * Tell whether the consolidation LLM should skip an asset because it is still
 * in hot-probation.
 *
 * Per the original notes, hot-probation assets are handled by the
 * consolidation dedup pre-pass (runDeterministicDedup) but kept out of the LLM
 * merge clustering, so noisy extractions can't pollute the LLM context.
 *
 * @param frontmatterData - Parsed frontmatter; reads `captureMode`.
 * @returns `true` when the asset's `captureMode` is hot-probation.
 */
export function shouldSkipHotProbationInLlm(frontmatterData) {
    const { captureMode } = frontmatterData;
    return isHotProbation(captureMode);
}

/**
 * Produce the frontmatter fields to inject into a new hot-probation proposal,
 * so downstream logic knows to run the intake dedup pass before graduating it.
 *
 * @returns A fresh `{ captureMode: "hot-probation" }` object.
 */
export function buildHotProbationFrontmatter() {
    const frontmatter = { captureMode: CAPTURE_MODE_HOT_PROBATION };
    return frontmatter;
}
|