@totalreclaw/totalreclaw 3.3.1-rc.2 → 3.3.1-rc.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +330 -0
- package/SKILL.md +50 -83
- package/api-client.ts +18 -11
- package/config.ts +117 -3
- package/crypto.ts +10 -2
- package/dist/api-client.js +226 -0
- package/dist/billing-cache.js +100 -0
- package/dist/claims-helper.js +606 -0
- package/dist/config.js +280 -0
- package/dist/consolidation.js +258 -0
- package/dist/contradiction-sync.js +1034 -0
- package/dist/crypto.js +138 -0
- package/dist/digest-sync.js +361 -0
- package/dist/download-ux.js +63 -0
- package/dist/embedding.js +86 -0
- package/dist/extractor.js +1225 -0
- package/dist/first-run.js +103 -0
- package/dist/fs-helpers.js +563 -0
- package/dist/gateway-url.js +197 -0
- package/dist/generate-mnemonic.js +13 -0
- package/dist/hot-cache-wrapper.js +101 -0
- package/dist/import-adapters/base-adapter.js +64 -0
- package/dist/import-adapters/chatgpt-adapter.js +238 -0
- package/dist/import-adapters/claude-adapter.js +114 -0
- package/dist/import-adapters/gemini-adapter.js +201 -0
- package/dist/import-adapters/index.js +26 -0
- package/dist/import-adapters/mcp-memory-adapter.js +219 -0
- package/dist/import-adapters/mem0-adapter.js +158 -0
- package/dist/import-adapters/types.js +1 -0
- package/dist/index.js +5348 -0
- package/dist/llm-client.js +686 -0
- package/dist/llm-profile-reader.js +346 -0
- package/dist/lsh.js +62 -0
- package/dist/onboarding-cli.js +750 -0
- package/dist/pair-cli.js +344 -0
- package/dist/pair-crypto.js +359 -0
- package/dist/pair-http.js +404 -0
- package/dist/pair-page.js +826 -0
- package/dist/pair-qr.js +107 -0
- package/dist/pair-remote-client.js +410 -0
- package/dist/pair-session-store.js +566 -0
- package/dist/pin.js +542 -0
- package/dist/qa-bug-report.js +301 -0
- package/dist/relay-headers.js +44 -0
- package/dist/reranker.js +442 -0
- package/dist/retype-setscope.js +348 -0
- package/dist/semantic-dedup.js +75 -0
- package/dist/subgraph-search.js +289 -0
- package/dist/subgraph-store.js +694 -0
- package/dist/tool-gating.js +58 -0
- package/download-ux.ts +91 -0
- package/embedding.ts +32 -9
- package/fs-helpers.ts +124 -0
- package/gateway-url.ts +57 -9
- package/index.ts +586 -357
- package/llm-client.ts +211 -23
- package/lsh.ts +7 -2
- package/onboarding-cli.ts +114 -1
- package/package.json +19 -5
- package/pair-cli.ts +76 -8
- package/pair-crypto.ts +34 -24
- package/pair-page.ts +28 -17
- package/pair-qr.ts +152 -0
- package/pair-remote-client.ts +540 -0
- package/qa-bug-report.ts +381 -0
- package/relay-headers.ts +50 -0
- package/reranker.ts +73 -0
- package/retype-setscope.ts +12 -0
- package/subgraph-search.ts +4 -3
- package/subgraph-store.ts +109 -16
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* retype / set_scope pure operations for OpenClaw plugin — v1.1 taxonomy.
|
|
3
|
+
*
|
|
4
|
+
* Agents need to be able to reclassify an existing memory's `type`
|
|
5
|
+
* (claim ↔ preference, etc.) or its `scope` (work ↔ personal ↔ health, ...)
|
|
6
|
+
* without destroying the underlying text. The subgraph is append-only,
|
|
7
|
+
* so, like pin/unpin, both operations tombstone the existing fact and
|
|
8
|
+
* write a fresh v1.1 blob with the changed field. The new fact's
|
|
9
|
+
* `superseded_by` points to the old fact id so cross-device readers see
|
|
10
|
+
* the correct resolution.
|
|
11
|
+
*
|
|
12
|
+
* Why this module is separate from pin.ts
|
|
13
|
+
* ---------------------------------------
|
|
14
|
+
* `executePinOperation` is tightly coupled to `pin_status` handling
|
|
15
|
+
* (idempotent short-circuit on matching status, decision-log recovery
|
|
16
|
+
* for auto-supersede victims, feedback wiring into the tuning loop).
|
|
17
|
+
* retype and set_scope are simpler — they don't short-circuit when the
|
|
18
|
+
* new value equals the old (the user might be confirming a prior
|
|
19
|
+
* auto-extraction's label) and they don't write feedback rows. Sharing
|
|
20
|
+
* the transport / crypto deps with pin is still useful; callers pass
|
|
21
|
+
* the same `RetypeSetScopeDeps` object.
|
|
22
|
+
*
|
|
23
|
+
* Scope and scanner surface
|
|
24
|
+
* -------------------------
|
|
25
|
+
* - No env-var reads — config is centralized in config.ts.
|
|
26
|
+
* - No outbound HTTP — all network work happens inside the injected
|
|
27
|
+
* `submitBatch` dep (callers wire it to subgraph-store).
|
|
28
|
+
* - No disk reads — callers supply an in-memory pre-loaded fact.
|
|
29
|
+
*/
|
|
30
|
+
import crypto from 'node:crypto';
|
|
31
|
+
import { createRequire } from 'node:module';
|
|
32
|
+
import { buildV1ClaimBlob, mapTypeToCategory, readV1Blob, } from './claims-helper.js';
|
|
33
|
+
import { isValidMemoryType, VALID_MEMORY_SCOPES, V0_TO_V1_TYPE, } from './extractor.js';
|
|
34
|
+
import { PROTOBUF_VERSION_V4 } from './subgraph-store.js';
|
|
35
|
+
// Lazy-load WASM core — mirrors pin.ts pattern.
|
|
36
|
+
const requireWasm = createRequire(import.meta.url);
|
|
37
|
+
let _wasm = null;
|
|
38
|
+
/**
 * Return the `@totalreclaw/core` module, loading it on first use.
 *
 * The module is resolved through `requireWasm` and memoised in the
 * module-level `_wasm` slot, so every later call reuses the same value.
 */
function getWasm() {
    if (_wasm) {
        return _wasm;
    }
    _wasm = requireWasm('@totalreclaw/core');
    return _wasm;
}
|
|
43
|
+
/**
 * Encode a fact payload through the WASM core's `encodeFactProtobuf`.
 *
 * The camelCase payload is re-keyed into the snake_case JSON document the
 * encoder is given, and the encoder's result is wrapped in a Buffer.
 *
 * @param fact - Payload carrying id, timestamp, owner, encryptedBlob,
 *   blindIndices, decayScore, source, contentFp, agentId and, optionally,
 *   encryptedEmbedding.
 * @param version - Value written to the `version` field of the document.
 * @returns Buffer built from the encoder output.
 */
function encodeFactProtobufLocal(fact, version) {
    // A missing (or otherwise falsy) embedding is sent as an explicit null.
    const embedding = fact.encryptedEmbedding || null;
    const wireShape = {
        id: fact.id,
        timestamp: fact.timestamp,
        owner: fact.owner,
        encrypted_blob_hex: fact.encryptedBlob,
        blind_indices: fact.blindIndices,
        decay_score: fact.decayScore,
        source: fact.source,
        content_fp: fact.contentFp,
        agent_id: fact.agentId,
        encrypted_embedding: embedding,
        version,
    };
    const encoded = getWasm().encodeFactProtobuf(JSON.stringify(wireShape));
    return Buffer.from(encoded);
}
|
|
59
|
+
/**
 * Project a decrypted blob (JSON text) onto the v1 field set used by the
 * retype / set_scope rewrite.
 *
 * Two shapes are recognised:
 *  - v1 blobs: string `text` and `type` plus a `schema_version` that starts
 *    with "1."; parsing is delegated to `readV1Blob`. If `readV1Blob`
 *    rejects the blob, the v0 check below is still attempted.
 *  - v0 short-key blobs: string `t` (text) and `c` (category). Missing
 *    fields get defaults (importance 5, confidence 0.85, source "user",
 *    createdAt = now); importance is rounded and clamped to 1..10 and
 *    confidence is clamped to 0..1.
 *
 * @param decrypted - Decrypted blob contents as a JSON string.
 * @returns The projected fields, or `null` when the input is not valid
 *   JSON, is not a JSON object, or matches neither shape.
 */
function projectFromDecrypted(decrypted) {
    let obj;
    try {
        obj = JSON.parse(decrypted);
    }
    catch {
        return null;
    }
    // JSON.parse('null') yields null; without this guard the property reads
    // below would throw instead of reporting "unrecognised shape".
    if (obj === null || typeof obj !== 'object') {
        return null;
    }
    // v1 blob (schema_version "1.x")
    if (typeof obj.text === 'string' &&
        typeof obj.type === 'string' &&
        typeof obj.schema_version === 'string' &&
        obj.schema_version.startsWith('1.')) {
        const v1 = readV1Blob(decrypted);
        if (v1) {
            return {
                text: v1.text,
                type: v1.type,
                source: v1.source,
                scope: v1.scope,
                volatility: v1.volatility,
                reasoning: v1.reasoning,
                entities: v1.entities,
                importance: v1.importance,
                confidence: v1.confidence,
                createdAt: v1.createdAt,
                expiresAt: v1.expiresAt,
                pinStatus: v1.pinStatus,
            };
        }
    }
    // v0 short-key blob — upgrade to v1 shape.
    if (typeof obj.t === 'string' && typeof obj.c === 'string') {
        // Own-property lookup only: a category such as "constructor" must not
        // resolve to a member inherited from Object.prototype.
        const mappedType = Object.prototype.hasOwnProperty.call(V0_TO_V1_TYPE, obj.c)
            ? V0_TO_V1_TYPE[obj.c]
            : undefined;
        const v1Type = mappedType ?? 'claim';
        const imp = typeof obj.i === 'number' ? obj.i : 5;
        const conf = typeof obj.cf === 'number' ? obj.cf : 0.85;
        const sa = typeof obj.sa === 'string' ? obj.sa : 'user';
        const validSource = ['user', 'user-inferred', 'assistant', 'external', 'derived'].includes(sa)
            ? sa
            : 'user';
        const ea = typeof obj.ea === 'string' ? obj.ea : new Date().toISOString();
        // Entities without a usable name are dropped; a missing entity type
        // defaults to "concept".
        const entities = Array.isArray(obj.e)
            ? obj.e
                .map((e) => {
                    if (!e || typeof e !== 'object')
                        return null;
                    const name = typeof e.n === 'string' ? e.n : '';
                    if (!name)
                        return null;
                    const entType = typeof e.tp === 'string' ? e.tp : 'concept';
                    const role = typeof e.r === 'string' ? e.r : undefined;
                    return { name, type: entType, role };
                })
                .filter((e) => e !== null)
            : undefined;
        return {
            text: obj.t,
            type: v1Type,
            source: validSource,
            scope: undefined,
            volatility: undefined,
            reasoning: undefined,
            entities,
            importance: Math.max(1, Math.min(10, Math.round(imp))),
            confidence: Math.max(0, Math.min(1, conf)),
            createdAt: ea,
        };
    }
    return null;
}
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
// Core: retrieve existing fact, decrypt, rewrite with mutated field
|
|
133
|
+
// ---------------------------------------------------------------------------
|
|
134
|
+
/**
 * Shared engine behind retype and set_scope.
 *
 * Steps, in order:
 *  1. Fetch the fact via `deps.fetchFactById` and strip an optional "0x"
 *     prefix from its encrypted blob.
 *  2. Decrypt with `deps.decryptBlob` and project the plaintext with
 *     `projectFromDecrypted`.
 *  3. Apply `mutate` to the projection and build a new v1 claim blob under
 *     a fresh UUID whose `supersededBy` is the old fact id.
 *  4. Encrypt the blob and regenerate indices via `deps.generateIndices`;
 *     if index generation throws, the new fact is written with no blind
 *     indices rather than aborting.
 *  5. Submit a tombstone for the old id together with the new fact in one
 *     `deps.submitBatch` call.
 *
 * Failures in the fetch/decrypt/project/build/encrypt/submit steps are
 * returned as `{ success: false, fact_id, error }`; an exception thrown by
 * `deps.fetchFactById` or by `mutate` itself is not caught here.
 *
 * NOTE(review): the projection's `expiresAt` is not forwarded to
 * `buildV1ClaimBlob`, and `createdAt` is set to the rewrite time — confirm
 * both are intended.
 *
 * @param factId - Id of the fact being rewritten.
 * @param deps - Injected fetch / crypto / index / submit helpers plus
 *   `owner` and `sourceAgent`.
 * @param mutate - Maps the current projection to the desired one.
 * @returns Result object describing the outcome.
 */
async function rewriteWithMutation(factId, deps, mutate) {
    // Every failure shares one envelope; only the message (and, for a
    // rejected batch, the tx hash) differs.
    const reasonOf = (err) => (err instanceof Error ? err.message : String(err));
    const failure = (error, extra = {}) => ({
        success: false,
        fact_id: factId,
        error,
        ...extra,
    });

    const existing = await deps.fetchFactById(factId);
    if (!existing) {
        return failure(`Fact not found: ${factId}`);
    }

    const storedBlob = existing.encryptedBlob;
    const blobHex = storedBlob.startsWith('0x') ? storedBlob.slice(2) : storedBlob;

    let plaintext;
    try {
        plaintext = deps.decryptBlob(blobHex);
    }
    catch (err) {
        return failure(`Failed to decrypt fact: ${reasonOf(err)}`);
    }

    const current = projectFromDecrypted(plaintext);
    if (!current) {
        return failure(`Unrecognized blob shape for fact ${factId} — cannot retype/rescope`);
    }

    const next = mutate(current);
    const newFactId = crypto.randomUUID();

    let canonicalJson;
    try {
        const claimInput = {
            id: newFactId,
            text: next.text,
            type: next.type,
            source: next.source,
            scope: next.scope,
            volatility: next.volatility,
            reasoning: next.reasoning,
            entities: next.entities,
            importance: next.importance,
            confidence: next.confidence,
            createdAt: new Date().toISOString(),
            supersededBy: factId,
            // Issue #117 follow-up: carry pin_status across so editing the
            // metadata of a pinned fact does not silently un-pin it.
            pinStatus: next.pinStatus,
        };
        canonicalJson = buildV1ClaimBlob(claimInput);
    }
    catch (err) {
        return failure(`Failed to build v1 claim blob: ${reasonOf(err)}`);
    }

    let newBlobHex;
    try {
        newBlobHex = deps.encryptBlob(canonicalJson);
    }
    catch (err) {
        return failure(`Failed to encrypt updated claim: ${reasonOf(err)}`);
    }

    let entityNames = [];
    if (next.entities) {
        entityNames = next.entities
            .map((entity) => entity.name)
            .filter((label) => typeof label === 'string' && label.length > 0);
    }

    let regenerated;
    try {
        regenerated = await deps.generateIndices(next.text, entityNames);
    }
    catch {
        // Best effort: fall back to an index-less write.
        regenerated = { blindIndices: [] };
    }

    // Tombstone for the old id: placeholder blob, no indices, zero decay.
    const tombstonePayload = {
        id: factId,
        timestamp: new Date().toISOString(),
        owner: deps.owner,
        encryptedBlob: '00',
        blindIndices: [],
        decayScore: 0,
        source: 'tombstone',
        contentFp: '',
        agentId: deps.sourceAgent,
    };
    const newPayload = {
        id: newFactId,
        timestamp: new Date().toISOString(),
        owner: deps.owner,
        encryptedBlob: newBlobHex,
        blindIndices: regenerated.blindIndices,
        decayScore: 1.0,
        source: 'openclaw-plugin-retype',
        contentFp: '',
        agentId: deps.sourceAgent,
        encryptedEmbedding: regenerated.encryptedEmbedding,
    };

    // Tombstone goes first and is encoded as legacy v3; the new fact is v4.
    const encodedTombstone = encodeFactProtobufLocal(tombstonePayload, 3);
    const encodedFact = encodeFactProtobufLocal(newPayload, PROTOBUF_VERSION_V4);
    const payloads = [encodedTombstone, encodedFact];

    try {
        const receipt = await deps.submitBatch(payloads);
        const { txHash, success } = receipt;
        if (!success) {
            return failure('On-chain batch submission failed', { tx_hash: txHash });
        }
        return {
            success: true,
            fact_id: factId,
            new_fact_id: newFactId,
            previous_type: current.type,
            new_type: next.type,
            previous_scope: current.scope,
            new_scope: next.scope,
            tx_hash: txHash,
        };
    }
    catch (err) {
        return failure(`Failed to submit retype/rescope batch: ${reasonOf(err)}`);
    }
}
|
|
270
|
+
// ---------------------------------------------------------------------------
|
|
271
|
+
// Public entry points
|
|
272
|
+
// ---------------------------------------------------------------------------
|
|
273
|
+
/**
 * Change the `type` of an existing memory.
 *
 * The requested type is checked with `isValidMemoryType` first; an invalid
 * value is reported as a failure result without touching the fact. A valid
 * one is handed to `rewriteWithMutation`, which tombstones the old fact and
 * writes a replacement whose `superseded_by` points at the old id.
 *
 * @param factId - Id of the fact to retype.
 * @param newType - Requested memory type.
 * @param deps - Dependencies forwarded unchanged to `rewriteWithMutation`.
 * @returns Promise of the operation result.
 */
export async function executeRetype(factId, newType, deps) {
    if (isValidMemoryType(newType)) {
        const withNewType = (current) => ({ ...current, type: newType });
        return rewriteWithMutation(factId, deps, withNewType);
    }
    const error = `Invalid new type "${newType}". Must be one of: claim, preference, directive, commitment, episode, summary.`;
    return { success: false, fact_id: factId, error };
}
|
|
291
|
+
/**
 * Change the `scope` of an existing memory.
 *
 * A scope that is not listed in `VALID_MEMORY_SCOPES` is reported as a
 * failure result without touching the fact. Otherwise the work is delegated
 * to `rewriteWithMutation`, which tombstones the old fact and writes a
 * replacement carrying the new scope.
 *
 * @param factId - Id of the fact to re-scope.
 * @param newScope - Requested scope.
 * @param deps - Dependencies forwarded unchanged to `rewriteWithMutation`.
 * @returns Promise of the operation result.
 */
export async function executeSetScope(factId, newScope, deps) {
    if (VALID_MEMORY_SCOPES.includes(newScope)) {
        const withNewScope = (current) => ({ ...current, scope: newScope });
        return rewriteWithMutation(factId, deps, withNewScope);
    }
    const error = `Invalid new scope "${newScope}". Must be one of: ${VALID_MEMORY_SCOPES.join(', ')}.`;
    return { success: false, fact_id: factId, error };
}
|
|
308
|
+
/**
 * Validate the raw argument object of the `totalreclaw_retype` tool.
 *
 * Accepted spellings: `fact_id` / `factId` for the id and `new_type` /
 * `newType` / `type` for the target type (the first non-nullish one wins).
 *
 * @param args - Untrusted tool arguments.
 * @returns `{ ok: true, factId, newType }` with the id trimmed, or
 *   `{ ok: false, error }` describing the first problem found.
 */
export function validateRetypeArgs(args) {
    if (args === null || typeof args !== 'object') {
        return { ok: false, error: 'totalreclaw_retype requires an object argument.' };
    }

    const rawId = args.fact_id ?? args.factId;
    const idIsUsable = typeof rawId === 'string' && rawId.trim().length > 0;
    if (!idIsUsable) {
        return { ok: false, error: 'fact_id is required and must be a non-empty string.' };
    }

    const requested = args.new_type ?? args.newType ?? args.type;
    if (typeof requested === 'string' && isValidMemoryType(requested)) {
        return { ok: true, factId: rawId.trim(), newType: requested };
    }

    const allowed = ['claim', 'preference', 'directive', 'commitment', 'episode', 'summary'];
    return { ok: false, error: `new_type must be one of: ${allowed.join(', ')}` };
}
|
|
326
|
+
/**
 * Validate the raw argument object of the `totalreclaw_set_scope` tool.
 *
 * Accepted spellings: `fact_id` / `factId` for the id and `new_scope` /
 * `newScope` / `scope` for the target scope (the first non-nullish one
 * wins). The scope must be a member of `VALID_MEMORY_SCOPES`.
 *
 * @param args - Untrusted tool arguments.
 * @returns `{ ok: true, factId, newScope }` with the id trimmed, or
 *   `{ ok: false, error }` describing the first problem found.
 */
export function validateSetScopeArgs(args) {
    if (args === null || typeof args !== 'object') {
        return { ok: false, error: 'totalreclaw_set_scope requires an object argument.' };
    }

    const rawId = args.fact_id ?? args.factId;
    const idIsUsable = typeof rawId === 'string' && rawId.trim().length > 0;
    if (!idIsUsable) {
        return { ok: false, error: 'fact_id is required and must be a non-empty string.' };
    }

    const requested = args.new_scope ?? args.newScope ?? args.scope;
    if (typeof requested === 'string' && VALID_MEMORY_SCOPES.includes(requested)) {
        return { ok: true, factId: rawId.trim(), newScope: requested };
    }

    return { ok: false, error: `new_scope must be one of: ${VALID_MEMORY_SCOPES.join(', ')}` };
}
|
|
344
|
+
// ---------------------------------------------------------------------------
|
|
345
|
+
// Re-export mapTypeToCategory so callers (index.ts) don't need
|
|
346
|
+
// a separate import path.
|
|
347
|
+
// ---------------------------------------------------------------------------
|
|
348
|
+
export { mapTypeToCategory };
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TotalReclaw Plugin - Semantic Near-Duplicate Detection (T330)
|
|
3
|
+
*
|
|
4
|
+
* Provides batch-level deduplication of extracted facts using cosine
|
|
5
|
+
* similarity on their embeddings. Facts within the same extraction batch
|
|
6
|
+
* that are semantically near-duplicates (cosine >= threshold) are reduced
|
|
7
|
+
* to keep only the first occurrence.
|
|
8
|
+
*
|
|
9
|
+
* This module intentionally has minimal dependencies (only reranker for
|
|
10
|
+
* cosineSimilarity and extractor for the ExtractedFact type) so it can
|
|
11
|
+
* be tested without pulling in the full plugin dependency graph.
|
|
12
|
+
*/
|
|
13
|
+
import { cosineSimilarity } from './reranker.js';
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Configuration
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
/**
 * Get the cosine similarity threshold for semantic dedup.
 *
 * Configurable via the TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD env var. The
 * value (surrounding whitespace ignored) must be a plain decimal number in
 * [0, 1]. Anything else — unset, blank, out of range, or a number followed
 * by stray characters such as "0.8abc" — falls back to 0.9.
 *
 * @returns The threshold to compare cosine similarities against.
 */
export function getSemanticDedupThreshold() {
    const fallback = 0.9;
    const envVal = process.env.TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD;
    if (envVal === undefined) {
        return fallback;
    }
    const candidate = envVal.trim();
    // parseFloat alone stops at the first bad character, so "0.8abc" would be
    // accepted as 0.8 and "0x1" as 0. Require the whole value to be a
    // decimal literal before converting it.
    const decimalLiteral = /^[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?$/;
    if (!decimalLiteral.test(candidate)) {
        return fallback;
    }
    const parsed = Number.parseFloat(candidate);
    if (Number.isFinite(parsed) && parsed >= 0 && parsed <= 1) {
        return parsed;
    }
    return fallback;
}
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Batch deduplication
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
/**
 * Drop semantic near-duplicates from one extraction batch.
 *
 * Facts are visited in input order. A fact whose text has no entry in
 * `embeddings` is always kept (fail-open). Otherwise its vector is compared
 * with the vector of every fact kept so far; on the first comparison whose
 * cosine similarity reaches the threshold from `getSemanticDedupThreshold`,
 * the fact is logged via `logger.info` and skipped.
 *
 * @param facts - Array of extracted facts to deduplicate
 * @param embeddings - Map from fact text to its embedding vector
 * @param logger - Logger for reporting skipped duplicates
 * @returns - Deduplicated array (subset of input, preserving order)
 */
export function deduplicateBatch(facts, embeddings, logger) {
    const limit = getSemanticDedupThreshold();
    const survivors = [];
    scan: for (const candidate of facts) {
        const candidateVec = embeddings.get(candidate.text);
        if (candidateVec) {
            for (const earlier of survivors) {
                const earlierVec = embeddings.get(earlier.text);
                if (!earlierVec) {
                    // Kept without an embedding: nothing to compare against.
                    continue;
                }
                const score = cosineSimilarity(candidateVec, earlierVec);
                if (score >= limit) {
                    logger.info(`Semantic dedup: skipping "${candidate.text}" (cosine=${score.toFixed(3)} >= ${limit} with "${earlier.text}")`);
                    continue scan;
                }
            }
        }
        survivors.push(candidate);
    }
    return survivors;
}
|