@totalreclaw/totalreclaw 1.6.0 → 3.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAWHUB.md +134 -0
- package/README.md +407 -64
- package/SKILL.md +1032 -0
- package/api-client.ts +5 -5
- package/claims-helper.ts +686 -0
- package/config.ts +211 -0
- package/consolidation.ts +141 -33
- package/contradiction-sync.ts +1389 -0
- package/crypto.ts +63 -261
- package/digest-sync.ts +516 -0
- package/embedding.ts +69 -46
- package/extractor.ts +1307 -84
- package/hot-cache-wrapper.ts +1 -1
- package/import-adapters/gemini-adapter.ts +243 -0
- package/import-adapters/index.ts +3 -0
- package/import-adapters/types.ts +1 -1
- package/index.ts +1887 -323
- package/llm-client.ts +106 -53
- package/lsh.ts +21 -210
- package/package.json +20 -7
- package/pin.ts +502 -0
- package/reranker.ts +96 -124
- package/skill.json +213 -0
- package/subgraph-search.ts +112 -5
- package/subgraph-store.ts +559 -275
- package/consolidation.test.ts +0 -356
- package/extractor-dedup.test.ts +0 -168
- package/import-adapters/import-adapters.test.ts +0 -1123
- package/lsh.test.ts +0 -463
- package/pocv2-e2e-test.ts +0 -917
- package/porter-stemmer.d.ts +0 -4
- package/reranker.test.ts +0 -594
- package/semantic-dedup.test.ts +0 -392
- package/setup.sh +0 -19
- package/store-dedup-wiring.test.ts +0 -186
package/config.ts
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin configuration — centralized env var reads.
|
|
3
|
+
* This file ONLY reads process.env. No network calls, no I/O.
|
|
4
|
+
* Other modules import config values from here.
|
|
5
|
+
*
|
|
6
|
+
* OpenClaw's security scanner flags files that contain BOTH process.env reads
|
|
7
|
+
* AND network calls. By centralizing all env reads here, no other file needs
|
|
8
|
+
* to touch process.env directly.
|
|
9
|
+
*
|
|
10
|
+
* v1 env var cleanup — see `docs/guides/env-vars-reference.md`.
|
|
11
|
+
* Removed user-facing vars: TOTALRECLAW_CHAIN_ID, TOTALRECLAW_EMBEDDING_MODEL,
|
|
12
|
+
* TOTALRECLAW_STORE_DEDUP, TOTALRECLAW_LLM_MODEL, TOTALRECLAW_SESSION_ID,
|
|
13
|
+
* TOTALRECLAW_TAXONOMY_VERSION.
|
|
14
|
+
* Removed legacy gates: TOTALRECLAW_CLAIM_FORMAT, TOTALRECLAW_DIGEST_MODE,
|
|
15
|
+
* TOTALRECLAW_AUTO_RESOLVE_MODE (the last one moved to an internal debug
|
|
16
|
+
* module; see `contradiction-sync.ts`).
|
|
17
|
+
*
|
|
18
|
+
* Tuning knobs (cosine threshold, min importance, cache TTL, etc.) are now
|
|
19
|
+
* delivered via the relay billing response. Env-var fallbacks are kept only
|
|
20
|
+
* for self-hosted deployments where the server may not surface those values.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import path from 'node:path';
|
|
24
|
+
|
|
25
|
+
// Base directory for the default credentials, cache and workspace paths built
// in CONFIG. Read from HOME; falls back to '/home/node' when HOME is unset.
const home = process.env.HOME ?? '/home/node';
|
|
26
|
+
|
|
27
|
+
/**
 * Names of env vars that were removed and are now ignored by the plugin.
 * If any of them is still set, the operator's setting is a no-op, so we tell
 * them. The list mirrors `docs/guides/env-vars-reference.md`.
 */
const REMOVED_ENV_VARS = [
  'TOTALRECLAW_CHAIN_ID',
  'TOTALRECLAW_EMBEDDING_MODEL',
  'TOTALRECLAW_STORE_DEDUP',
  'TOTALRECLAW_LLM_MODEL',
  'TOTALRECLAW_SESSION_ID',
  'TOTALRECLAW_TAXONOMY_VERSION',
  'TOTALRECLAW_CLAIM_FORMAT',
  'TOTALRECLAW_DIGEST_MODE',
] as const;

/**
 * Report every removed env var that is still defined in `process.env`.
 *
 * Produces at most one message, naming the offending variables in the order
 * they appear in `REMOVED_ENV_VARS`. Nothing is emitted when none are set.
 * A variable set to the empty string still counts as set.
 *
 * @param warn - Receives the message; defaults to `console.warn`.
 */
function warnRemovedEnvVars(warn: (msg: string) => void = console.warn): void {
  const stillSet: string[] = [];
  for (const name of REMOVED_ENV_VARS) {
    if (process.env[name] !== undefined) {
      stillSet.push(name);
    }
  }

  if (stillSet.length === 0) return;

  const names = stillSet.join(', ');
  warn(
    `TotalReclaw: ignoring removed env var(s): ${names}. ` +
      `See docs/guides/env-vars-reference.md for the v1 env var surface.`,
  );
}

// Run the check as a side effect of importing this module. This relies on the
// module being evaluated once per process, so the warning appears once.
warnRemovedEnvVars();
|
|
54
|
+
|
|
55
|
+
/**
 * Recovery phrase installed at runtime (set by hot-reload after setup).
 * `null` means no override is active and the env var is consulted instead.
 */
let _recoveryPhraseOverride: string | null = null;

/**
 * Install a runtime recovery phrase. Once set, it takes precedence over
 * `TOTALRECLAW_RECOVERY_PHRASE` for every later `getRecoveryPhrase()` call.
 *
 * @param phrase - The phrase to use from now on.
 */
export function setRecoveryPhraseOverride(phrase: string): void {
  _recoveryPhraseOverride = phrase;
}

/**
 * Resolve the recovery phrase currently in effect.
 *
 * @returns The runtime override if one was installed, otherwise the value of
 *   `TOTALRECLAW_RECOVERY_PHRASE`, otherwise the empty string.
 */
export function getRecoveryPhrase(): string {
  if (_recoveryPhraseOverride != null) {
    return _recoveryPhraseOverride;
  }
  const fromEnv = process.env.TOTALRECLAW_RECOVERY_PHRASE;
  return fromEnv ?? '';
}
|
|
65
|
+
|
|
66
|
+
/**
 * Chain ID selected at runtime, once the relay billing response has been read.
 * `null` means "not decided yet"; readers then use the free-tier default.
 *
 * Free tier stays on 84532 (Base Sepolia); Pro tier switches to 100 (Gnosis
 * mainnet). The relay routes Pro writes to Gnosis, so Pro-tier UserOps MUST be
 * signed against chain 100 — otherwise the bundler rejects the signature with
 * AA23.
 *
 * See index.ts: after the billing lookup completes it calls
 * `setChainIdOverride(100)` for Pro users. Free users can leave the override
 * unset.
 */
let _chainIdOverride: number | null = null;

/**
 * Record the chain ID that subsequent `CONFIG.chainId` reads should return.
 *
 * @param chainId - Chain ID to use from now on (stored as given, unvalidated).
 */
export function setChainIdOverride(chainId: number): void {
  _chainIdOverride = chainId;
}

/** Test hook: clear the chain override so the default applies again. */
export function __resetChainIdOverrideForTests(): void {
  _chainIdOverride = null;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Parse a floating-point setting from a raw env-var string.
 *
 * `parseFloat` yields `NaN` for empty or malformed input (e.g. `VAR=` in a
 * compose file); a `NaN` threshold would make every comparison against it
 * false. Anything that is not a finite number is replaced by `fallback`.
 */
function floatFromEnv(raw: string | undefined, fallback: number): number {
  const parsed = parseFloat(raw ?? '');
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Parse a base-10 integer setting from a raw env-var string, using `fallback`
 * when the variable is unset, empty, or does not parse to a finite number.
 */
function intFromEnv(raw: string | undefined, fallback: number): number {
  const parsed = parseInt(raw ?? '', 10);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Resolved plugin configuration.
 *
 * Plain properties are read from `process.env` once, when this module is
 * evaluated. `recoveryPhrase` and `chainId` are getters so that they reflect
 * runtime overrides installed after import.
 */
export const CONFIG = {
  // Core — recoveryPhrase reads from override first, then env var.
  // Use getRecoveryPhrase() for dynamic access; this property is for
  // backward-compat with code that reads CONFIG.recoveryPhrase at init time.
  get recoveryPhrase(): string {
    return getRecoveryPhrase();
  },
  // Trailing slashes are stripped so callers can append paths directly.
  serverUrl: (process.env.TOTALRECLAW_SERVER_URL || 'https://api.totalreclaw.xyz').replace(/\/+$/, ''),
  selfHosted: process.env.TOTALRECLAW_SELF_HOSTED === 'true',
  credentialsPath: process.env.TOTALRECLAW_CREDENTIALS_PATH || path.join(home, '.totalreclaw', 'credentials.json'),

  // Chain — chainId is no longer user-configurable. It is auto-detected from
  // the relay billing response (free = Base Sepolia / 84532, Pro = Gnosis /
  // 100). The default here is used only before the first billing lookup
  // completes. Self-hosted users can still point at a custom DataEdge via
  // TOTALRECLAW_DATA_EDGE_ADDRESS / TOTALRECLAW_ENTRYPOINT_ADDRESS /
  // TOTALRECLAW_RPC_URL (undocumented; internal knobs).
  //
  // Reads the runtime override set by the billing auto-detect in index.ts.
  // Falls back to 84532 (free tier / pre-billing-lookup). Must be a getter,
  // not a literal — a literal would freeze all Pro-tier UserOps to the
  // wrong chainId and AA23 at the bundler.
  get chainId(): number {
    return _chainIdOverride ?? 84532;
  },
  dataEdgeAddress: process.env.TOTALRECLAW_DATA_EDGE_ADDRESS || '',
  entryPointAddress: process.env.TOTALRECLAW_ENTRYPOINT_ADDRESS || '',
  rpcUrl: process.env.TOTALRECLAW_RPC_URL || '',

  // Tuning knobs — default values used only as local fallback for
  // self-hosted mode. Managed-service clients override these from the relay
  // billing response via `resolveTuning(...)`.
  // See: docs/specs/totalreclaw/client-consistency.md
  //
  // Every knob falls back to its default when the env var is unset, empty or
  // unparsable, so a bad value can never surface as NaN.
  cosineThreshold: floatFromEnv(process.env.TOTALRECLAW_COSINE_THRESHOLD, 0.15),
  // TOTALRECLAW_EXTRACT_EVERY_TURNS is consulted only when
  // TOTALRECLAW_EXTRACT_INTERVAL is not set.
  extractInterval: intFromEnv(
    process.env.TOTALRECLAW_EXTRACT_INTERVAL ?? process.env.TOTALRECLAW_EXTRACT_EVERY_TURNS,
    3,
  ),
  relevanceThreshold: floatFromEnv(process.env.TOTALRECLAW_RELEVANCE_THRESHOLD, 0.3),
  semanticSkipThreshold: floatFromEnv(process.env.TOTALRECLAW_SEMANTIC_SKIP_THRESHOLD, 0.85),
  cacheTtlMs: intFromEnv(process.env.TOTALRECLAW_CACHE_TTL_MS, 5 * 60 * 1000),
  // Clamped to [1, 10]; unset, zero or unparsable values become 6.
  minImportance: Math.max(1, Math.min(10, Number(process.env.TOTALRECLAW_MIN_IMPORTANCE) || 6)),
  trapdoorBatchSize: intFromEnv(process.env.TOTALRECLAW_TRAPDOOR_BATCH_SIZE, 5),
  pageSize: intFromEnv(process.env.TOTALRECLAW_SUBGRAPH_PAGE_SIZE, 1000),

  // Store-time dedup is always ON. TOTALRECLAW_STORE_DEDUP was removed in v1.
  storeDedupEnabled: true,

  // LLM provider API keys (read once, passed to llm-client). Model selection
  // is entirely automatic via `deriveCheapModel(provider)` — the
  // TOTALRECLAW_LLM_MODEL override was removed in v1.
  llmApiKeys: {
    zai: process.env.ZAI_API_KEY || '',
    anthropic: process.env.ANTHROPIC_API_KEY || '',
    openai: process.env.OPENAI_API_KEY || '',
    gemini: process.env.GEMINI_API_KEY || '',
    google: process.env.GOOGLE_API_KEY || '',
    mistral: process.env.MISTRAL_API_KEY || '',
    groq: process.env.GROQ_API_KEY || '',
    deepseek: process.env.DEEPSEEK_API_KEY || '',
    openrouter: process.env.OPENROUTER_API_KEY || '',
    xai: process.env.XAI_API_KEY || '',
    together: process.env.TOGETHER_API_KEY || '',
    cerebras: process.env.CEREBRAS_API_KEY || '',
  } as Record<string, string>,

  // Paths
  home,
  billingCachePath: path.join(home, '.totalreclaw', 'billing-cache.json'),
  cachePath: process.env.TOTALRECLAW_CACHE_PATH || path.join(home, '.totalreclaw', 'cache.enc'),
  openclawWorkspace: path.join(home, '.openclaw', 'workspace'),
} as const;
|
|
157
|
+
|
|
158
|
+
// ---------------------------------------------------------------------------
|
|
159
|
+
// Server-side tuning resolution
|
|
160
|
+
// ---------------------------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
/**
 * Tuning values the relay may deliver inside the billing response.
 *
 * The relay may populate these in `features` (the same cache consumed by
 * `isLlmDedupEnabled`, `getExtractInterval`, etc.). Every field is optional:
 * a field that is present overrides the matching `CONFIG` value in
 * `resolveTuning`, and a field that is absent (self-hosted or pre-rollout
 * relay) leaves the `CONFIG` value in effect.
 */
export interface BillingTuning {
  /** Overrides `CONFIG.cosineThreshold`. */
  cosine_threshold?: number;
  /** Overrides `CONFIG.relevanceThreshold`. */
  relevance_threshold?: number;
  /** Overrides `CONFIG.semanticSkipThreshold`. */
  semantic_skip_threshold?: number;
  /** Overrides `CONFIG.minImportance`. */
  min_importance?: number;
  /** Overrides `CONFIG.cacheTtlMs`. */
  cache_ttl_ms?: number;
  /** Overrides `CONFIG.trapdoorBatchSize`. */
  trapdoor_batch_size?: number;
  /** Overrides `CONFIG.pageSize`. */
  subgraph_page_size?: number;
}
|
|
179
|
+
|
|
180
|
+
/**
 * Combine relay-delivered tuning with the local `CONFIG` fallbacks.
 *
 * Call this where a threshold is needed, handing in the features blob taken
 * from the billing cache. The function performs no I/O itself. For each knob
 * the relay value is used when it is neither `null` nor `undefined`;
 * otherwise the `CONFIG` value applies.
 *
 * @param features - Tuning block from the billing response, or `null` /
 *   omitted when none is available.
 * @returns The effective value of every tuning knob.
 */
export function resolveTuning(features?: BillingTuning | null): {
  cosineThreshold: number;
  relevanceThreshold: number;
  semanticSkipThreshold: number;
  minImportance: number;
  cacheTtlMs: number;
  trapdoorBatchSize: number;
  pageSize: number;
} {
  // Treat a missing blob as an empty one so each knob is resolved the same way.
  const remote: BillingTuning = features ?? {};

  const cosineThreshold = remote.cosine_threshold ?? CONFIG.cosineThreshold;
  const relevanceThreshold = remote.relevance_threshold ?? CONFIG.relevanceThreshold;
  const semanticSkipThreshold = remote.semantic_skip_threshold ?? CONFIG.semanticSkipThreshold;
  const minImportance = remote.min_importance ?? CONFIG.minImportance;
  const cacheTtlMs = remote.cache_ttl_ms ?? CONFIG.cacheTtlMs;
  const trapdoorBatchSize = remote.trapdoor_batch_size ?? CONFIG.trapdoorBatchSize;
  const pageSize = remote.subgraph_page_size ?? CONFIG.pageSize;

  return {
    cosineThreshold,
    relevanceThreshold,
    semanticSkipThreshold,
    minImportance,
    cacheTtlMs,
    trapdoorBatchSize,
    pageSize,
  };
}
|
|
206
|
+
|
|
207
|
+
/**
 * Test-only handle on the otherwise private removed-env-var machinery, so
 * tests can assert the warning behaviour.
 */
export const __internal = {
  REMOVED_ENV_VARS: REMOVED_ENV_VARS,
  warnRemovedEnvVars: warnRemovedEnvVars,
};
|
package/consolidation.ts
CHANGED
|
@@ -12,13 +12,30 @@
|
|
|
12
12
|
* 3. Bulk consolidation — cluster all facts in the vault and identify
|
|
13
13
|
* groups of near-duplicates for cleanup (clusterFacts).
|
|
14
14
|
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
15
|
+
* Delegates core computation to `@totalreclaw/core` Rust WASM module where
|
|
16
|
+
* bindings are available. `shouldSupersede` uses the core directly.
|
|
17
|
+
* `findNearDuplicate` and `clusterFacts` use the core's `findBestNearDuplicate`
|
|
18
|
+
* and `clusterFacts` WASM functions when available, falling back to local
|
|
19
|
+
* implementations that use WASM-backed `cosineSimilarity`.
|
|
20
|
+
*
|
|
21
|
+
* Threshold helpers remain local (they read process.env).
|
|
18
22
|
*/
|
|
19
23
|
|
|
24
|
+
import { createRequire } from 'node:module';
|
|
20
25
|
import { cosineSimilarity } from './reranker.js';
|
|
21
26
|
|
|
27
|
+
// ---------------------------------------------------------------------------
// WASM core, loaded on first use. Same pattern as claims-helper.ts and
// contradiction-sync.ts: a CommonJS-style require built from this module's
// URL, which works under both the OpenClaw runtime (CJS-ish tsx) and the bare
// Node ESM used by tests.
// ---------------------------------------------------------------------------
const requireWasm = createRequire(import.meta.url);

// Cached module instance; null until the first getWasm() call.
let _wasm: typeof import('@totalreclaw/core') | null = null;

/** Return the `@totalreclaw/core` module, requiring it on the first call only. */
function getWasm(): typeof import('@totalreclaw/core') {
  if (_wasm) return _wasm;

  const loaded: typeof import('@totalreclaw/core') = requireWasm('@totalreclaw/core');
  _wasm = loaded;
  return loaded;
}
|
|
38
|
+
|
|
22
39
|
// ---------------------------------------------------------------------------
|
|
23
40
|
// Configuration
|
|
24
41
|
// ---------------------------------------------------------------------------
|
|
@@ -106,6 +123,36 @@ export function findNearDuplicate(
|
|
|
106
123
|
candidates: DecryptedCandidate[],
|
|
107
124
|
threshold: number,
|
|
108
125
|
): NearDuplicateMatch | null {
|
|
126
|
+
const wasm = getWasm();
|
|
127
|
+
|
|
128
|
+
// Use core's findBestNearDuplicate if available (added in core >=1.5.0;
|
|
129
|
+
// guaranteed present in core >=2.0.0 which this plugin depends on).
|
|
130
|
+
if (typeof (wasm as any).findBestNearDuplicate === 'function') {
|
|
131
|
+
const existing = candidates
|
|
132
|
+
.filter((c) => c.embedding && c.embedding.length > 0)
|
|
133
|
+
.map((c) => ({ id: c.id, embedding: c.embedding! }));
|
|
134
|
+
|
|
135
|
+
if (existing.length === 0) return null;
|
|
136
|
+
|
|
137
|
+
const resultJs = (wasm as any).findBestNearDuplicate(
|
|
138
|
+
JSON.stringify(newFactEmbedding),
|
|
139
|
+
JSON.stringify(existing),
|
|
140
|
+
threshold,
|
|
141
|
+
);
|
|
142
|
+
|
|
143
|
+
if (resultJs == null) return null;
|
|
144
|
+
|
|
145
|
+
const result: { fact_id: string; similarity: number } =
|
|
146
|
+
typeof resultJs === 'string' ? JSON.parse(resultJs) : resultJs;
|
|
147
|
+
|
|
148
|
+
const matched = candidates.find((c) => c.id === result.fact_id);
|
|
149
|
+
if (!matched) return null;
|
|
150
|
+
|
|
151
|
+
return { existingFact: matched, similarity: result.similarity };
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Fallback: local loop using WASM-backed cosineSimilarity. Defensive only
|
|
155
|
+
// — core >=2.0.0 always exposes findBestNearDuplicate.
|
|
109
156
|
let bestMatch: NearDuplicateMatch | null = null;
|
|
110
157
|
|
|
111
158
|
for (const candidate of candidates) {
|
|
@@ -132,6 +179,8 @@ export function findNearDuplicate(
|
|
|
132
179
|
* - Higher importance wins.
|
|
133
180
|
* - Equal importance: new fact supersedes (newer is preferred).
|
|
134
181
|
*
|
|
182
|
+
* Delegates to `@totalreclaw/core` WASM `shouldSupersede`.
|
|
183
|
+
*
|
|
135
184
|
* @param newImportance - Importance score of the new fact
|
|
136
185
|
* @param existingFact - The existing near-duplicate candidate
|
|
137
186
|
* @returns - 'supersede' if new fact should replace, 'skip' otherwise
|
|
@@ -140,43 +189,21 @@ export function shouldSupersede(
|
|
|
140
189
|
newImportance: number,
|
|
141
190
|
existingFact: DecryptedCandidate,
|
|
142
191
|
): 'supersede' | 'skip' {
|
|
143
|
-
|
|
144
|
-
return 'skip';
|
|
192
|
+
const wasm = getWasm();
|
|
193
|
+
return wasm.shouldSupersede(newImportance, existingFact.importance) ? 'supersede' : 'skip';
|
|
145
194
|
}
|
|
146
195
|
|
|
147
196
|
// ---------------------------------------------------------------------------
|
|
148
197
|
// Bulk consolidation
|
|
149
198
|
// ---------------------------------------------------------------------------
|
|
150
199
|
|
|
151
|
-
/**
|
|
152
|
-
* Pick the best representative from a group of near-duplicate facts.
|
|
153
|
-
*
|
|
154
|
-
* Tiebreak order:
|
|
155
|
-
* 1. Highest decayScore
|
|
156
|
-
* 2. Most recent (highest createdAt)
|
|
157
|
-
* 3. Longest text
|
|
158
|
-
*/
|
|
159
|
-
function pickRepresentative(facts: DecryptedCandidate[]): DecryptedCandidate {
|
|
160
|
-
let best = facts[0];
|
|
161
|
-
for (let i = 1; i < facts.length; i++) {
|
|
162
|
-
const f = facts[i];
|
|
163
|
-
if (
|
|
164
|
-
f.decayScore > best.decayScore ||
|
|
165
|
-
(f.decayScore === best.decayScore && f.createdAt > best.createdAt) ||
|
|
166
|
-
(f.decayScore === best.decayScore && f.createdAt === best.createdAt && f.text.length > best.text.length)
|
|
167
|
-
) {
|
|
168
|
-
best = f;
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
return best;
|
|
172
|
-
}
|
|
173
|
-
|
|
174
200
|
/**
|
|
175
201
|
* Cluster facts by semantic similarity using greedy single-pass clustering.
|
|
176
202
|
*
|
|
177
|
-
*
|
|
178
|
-
*
|
|
179
|
-
*
|
|
203
|
+
* Delegates to `@totalreclaw/core` WASM `clusterFacts` which performs the
|
|
204
|
+
* same greedy single-pass algorithm and representative selection. The WASM
|
|
205
|
+
* function returns ID-only clusters; this wrapper maps IDs back to full
|
|
206
|
+
* `DecryptedCandidate` objects for callers.
|
|
180
207
|
*
|
|
181
208
|
* Only returns clusters that have duplicates (i.e. more than one member).
|
|
182
209
|
* Facts without embeddings are not clustered.
|
|
@@ -189,6 +216,62 @@ export function clusterFacts(
|
|
|
189
216
|
facts: DecryptedCandidate[],
|
|
190
217
|
threshold: number,
|
|
191
218
|
): ConsolidationCluster[] {
|
|
219
|
+
const wasm = getWasm();
|
|
220
|
+
|
|
221
|
+
// Use core's clusterFacts if available (added in core >=1.5.0;
|
|
222
|
+
// guaranteed present in core >=2.0.0 which this plugin depends on).
|
|
223
|
+
if (typeof (wasm as any).clusterFacts === 'function') {
|
|
224
|
+
// Build ConsolidationCandidate JSON for WASM (snake_case fields).
|
|
225
|
+
const wasmCandidates = facts
|
|
226
|
+
.filter((f) => f.embedding && f.embedding.length > 0)
|
|
227
|
+
.map((f) => ({
|
|
228
|
+
id: f.id,
|
|
229
|
+
text: f.text,
|
|
230
|
+
embedding: f.embedding!,
|
|
231
|
+
importance: f.importance,
|
|
232
|
+
decay_score: f.decayScore,
|
|
233
|
+
created_at: f.createdAt,
|
|
234
|
+
version: f.version,
|
|
235
|
+
}));
|
|
236
|
+
|
|
237
|
+
if (wasmCandidates.length === 0) return [];
|
|
238
|
+
|
|
239
|
+
const resultJs = (wasm as any).clusterFacts(
|
|
240
|
+
JSON.stringify(wasmCandidates),
|
|
241
|
+
threshold,
|
|
242
|
+
);
|
|
243
|
+
|
|
244
|
+
// WASM returns a JSON string: [{ representative: string, duplicates: string[] }]
|
|
245
|
+
const wasmClusters: { representative: string; duplicates: string[] }[] =
|
|
246
|
+
typeof resultJs === 'string' ? JSON.parse(resultJs) : resultJs;
|
|
247
|
+
|
|
248
|
+
// Build a lookup map for fast ID -> DecryptedCandidate resolution.
|
|
249
|
+
const byId = new Map<string, DecryptedCandidate>();
|
|
250
|
+
for (const f of facts) byId.set(f.id, f);
|
|
251
|
+
|
|
252
|
+
// Map ID-only clusters back to full DecryptedCandidate objects.
|
|
253
|
+
// Filter out singleton clusters (no duplicates) to match the pre-WASM
|
|
254
|
+
// plugin contract — callers rely on `clusters.length === 0` when nothing
|
|
255
|
+
// duplicates anything.
|
|
256
|
+
const result: ConsolidationCluster[] = [];
|
|
257
|
+
for (const wc of wasmClusters) {
|
|
258
|
+
const rep = byId.get(wc.representative);
|
|
259
|
+
if (!rep) continue;
|
|
260
|
+
|
|
261
|
+
const dups = wc.duplicates
|
|
262
|
+
.map((id) => byId.get(id))
|
|
263
|
+
.filter((d): d is DecryptedCandidate => d !== undefined);
|
|
264
|
+
|
|
265
|
+
if (dups.length > 0) {
|
|
266
|
+
result.push({ representative: rep, duplicates: dups });
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
return result;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
// Fallback: local greedy single-pass clustering using WASM-backed
|
|
274
|
+
// cosineSimilarity. Defensive only — core >=2.0.0 always exposes clusterFacts.
|
|
192
275
|
const clusters: { members: DecryptedCandidate[] }[] = [];
|
|
193
276
|
|
|
194
277
|
for (const fact of facts) {
|
|
@@ -196,7 +279,6 @@ export function clusterFacts(
|
|
|
196
279
|
|
|
197
280
|
let assigned = false;
|
|
198
281
|
for (const cluster of clusters) {
|
|
199
|
-
// Compare against the first member's embedding (cluster seed)
|
|
200
282
|
const seed = cluster.members[0];
|
|
201
283
|
if (!seed.embedding) continue;
|
|
202
284
|
|
|
@@ -213,7 +295,6 @@ export function clusterFacts(
|
|
|
213
295
|
}
|
|
214
296
|
}
|
|
215
297
|
|
|
216
|
-
// Only return clusters with duplicates, pick representative for each
|
|
217
298
|
const result: ConsolidationCluster[] = [];
|
|
218
299
|
for (const cluster of clusters) {
|
|
219
300
|
if (cluster.members.length < 2) continue;
|
|
@@ -225,3 +306,30 @@ export function clusterFacts(
|
|
|
225
306
|
|
|
226
307
|
return result;
|
|
227
308
|
}
|
|
309
|
+
|
|
310
|
+
// ---------------------------------------------------------------------------
|
|
311
|
+
// Local helpers (used only in fallback paths)
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
|
|
314
|
+
/**
 * Choose which fact of a near-duplicate group should represent the group.
 *
 * Facts are ranked by, in order:
 *   1. higher decayScore
 *   2. later createdAt
 *   3. longer text
 * When two facts tie on all three, the one that appears earlier in `facts`
 * is kept.
 *
 * @param facts - Members of one near-duplicate group.
 * @returns The winning fact; `facts[0]` when nothing outranks it.
 */
function pickRepresentative(facts: DecryptedCandidate[]): DecryptedCandidate {
  // True only when `challenger` strictly beats `incumbent`; each criterion is
  // consulted only if all earlier ones compare equal.
  const outranks = (challenger: DecryptedCandidate, incumbent: DecryptedCandidate): boolean => {
    if (challenger.decayScore !== incumbent.decayScore) {
      return challenger.decayScore > incumbent.decayScore;
    }
    if (challenger.createdAt !== incumbent.createdAt) {
      return challenger.createdAt > incumbent.createdAt;
    }
    return challenger.text.length > incumbent.text.length;
  };

  return facts.reduce(
    (leader, fact) => (outranks(fact, leader) ? fact : leader),
    facts[0],
  );
}
|