@feelingmindful/thinking-graph 1.15.1 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config.d.ts +16 -0
- package/dist/config.js +39 -0
- package/dist/engine/dedup.d.ts +10 -0
- package/dist/engine/dedup.js +19 -0
- package/dist/engine/fusion.d.ts +75 -0
- package/dist/engine/fusion.js +144 -0
- package/dist/engine/graph.d.ts +35 -1
- package/dist/engine/graph.js +147 -1
- package/dist/engine/intent.d.ts +14 -0
- package/dist/engine/intent.js +19 -0
- package/dist/engine/types.d.ts +4 -0
- package/dist/storage/adapter.d.ts +30 -1
- package/dist/storage/jsonl.d.ts +30 -1
- package/dist/storage/jsonl.js +196 -5
- package/dist/storage/memory.d.ts +15 -2
- package/dist/storage/memory.js +54 -1
- package/dist/storage/sqlite.d.ts +1 -0
- package/dist/storage/sqlite.js +26 -0
- package/dist/storage/vector-index.d.ts +8 -1
- package/dist/storage/vector-index.js +10 -1
- package/dist/tools/execute-skills.d.ts +6 -6
- package/dist/tools/learn.d.ts +2 -2
- package/dist/tools/learn.js +25 -8
- package/dist/tools/recall.js +67 -14
- package/dist/tools/research.js +23 -3
- package/dist/tools/think.d.ts +1 -1
- package/dist/vault/bridge.d.ts +5 -4
- package/dist/vault/bridge.js +7 -5
- package/package.json +1 -1
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Centralized, env-overridable tuning for recall scoring. Flat schema so every
|
|
3
|
+
* weight/half-life lives in one place instead of scattered module constants.
|
|
4
|
+
* Values are read once at process start; override via environment variables.
|
|
5
|
+
*/
|
|
6
|
+
export declare const config: {
|
|
7
|
+
readonly denseWeight: number;
|
|
8
|
+
readonly lexicalWeight: number;
|
|
9
|
+
readonly recencyHalfLifeDays: number;
|
|
10
|
+
readonly recencyReward: number;
|
|
11
|
+
readonly importanceWeight: number;
|
|
12
|
+
readonly importanceHalfMerges: number;
|
|
13
|
+
readonly graphSeedCount: number;
|
|
14
|
+
readonly graphExpandLimit: number;
|
|
15
|
+
};
|
|
16
|
+
export type ScoringConfig = typeof config;
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Centralized, env-overridable tuning for recall scoring. Flat schema so every
|
|
3
|
+
* weight/half-life lives in one place instead of scattered module constants.
|
|
4
|
+
* Values are read once at process start; override via environment variables.
|
|
5
|
+
*/
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
// Coercing parsers that fall back to the default on any invalid input (missing,
// blank, non-numeric, NaN/Infinity, or out of range) rather than throwing at
// startup.
//
// `z.coerce.number()` applies `Number(value)`, and `Number('')` / `Number('  ')`
// evaluate to 0. Without the blank check below, an exported-but-empty variable
// (e.g. `THINKING_GRAPH_DENSE_WEIGHT=`) would pass the `nonnegative()` check and
// silently zero the weight instead of using its default. Blank strings are
// therefore mapped to `undefined`, which coerces to NaN, fails validation and
// lands on `.catch(def)`.
const blankToUndefined = (value) => typeof value === 'string' && value.trim() === '' ? undefined : value;
const finiteNumber = () => z.coerce.number().finite();
// Weights: finite and >= 0.
const nonneg = (def) => z.preprocess(blankToUndefined, finiteNumber().nonnegative()).catch(def);
// Half-life / half-merges: finite and > 0 (they are denominators / decay
// constants).
const positive = (def) => z.preprocess(blankToUndefined, finiteNumber().positive()).catch(def);
// Counts: whole numbers > 0. A fractional count is treated as invalid input and
// falls back to the default like any other bad value.
const positiveInt = (def) => z.preprocess(blankToUndefined, finiteNumber().int().positive()).catch(def);
const schema = z.object({
    // Hybrid recall channel weights.
    THINKING_GRAPH_DENSE_WEIGHT: nonneg(0.6),
    THINKING_GRAPH_LEXICAL_WEIGHT: nonneg(0.4),
    // Recency reward: score *= 1 + reward * exp(-age*ln2/halfLife).
    THINKING_GRAPH_RECENCY_HALF_LIFE_DAYS: positive(14),
    THINKING_GRAPH_RECENCY_REWARD: nonneg(0.2),
    // Importance reward from reinforcement: score *= 1 + weight * m/(m+halfMerges),
    // where m is the node's mergeCount (how often a duplicate reinforced it).
    THINKING_GRAPH_IMPORTANCE_WEIGHT: nonneg(0.3),
    THINKING_GRAPH_IMPORTANCE_HALF_MERGES: positive(3),
    // Graph expansion (multi-hop recall): how many top content hits to seed the
    // 1-hop typed-edge walk from, and how many neighbours to pull in.
    THINKING_GRAPH_GRAPH_SEED_COUNT: positiveInt(3),
    THINKING_GRAPH_GRAPH_EXPAND_LIMIT: positiveInt(10),
});
// Parsed once at module load; unknown environment keys are ignored by the
// object schema and every known key always resolves to a usable number.
const env = schema.parse(process.env);
export const config = {
    denseWeight: env.THINKING_GRAPH_DENSE_WEIGHT,
    lexicalWeight: env.THINKING_GRAPH_LEXICAL_WEIGHT,
    recencyHalfLifeDays: env.THINKING_GRAPH_RECENCY_HALF_LIFE_DAYS,
    recencyReward: env.THINKING_GRAPH_RECENCY_REWARD,
    importanceWeight: env.THINKING_GRAPH_IMPORTANCE_WEIGHT,
    importanceHalfMerges: env.THINKING_GRAPH_IMPORTANCE_HALF_MERGES,
    graphSeedCount: env.THINKING_GRAPH_GRAPH_SEED_COUNT,
    graphExpandLimit: env.THINKING_GRAPH_GRAPH_EXPAND_LIMIT,
};
|
package/dist/engine/dedup.d.ts
CHANGED
|
@@ -2,4 +2,14 @@
|
|
|
2
2
|
* Content deduplication using Jaccard similarity on word tokens.
|
|
3
3
|
*/
|
|
4
4
|
export declare function tokenize(content: string): string[];
|
|
5
|
+
export declare const DUP_COSINE_HIGH = 0.92;
|
|
6
|
+
export declare const DUP_COSINE_BORDERLINE = 0.8;
|
|
7
|
+
export declare const DUP_JACCARD_HIGH = 0.9;
|
|
8
|
+
export declare const DUP_JACCARD_BORDERLINE = 0.75;
|
|
9
|
+
/**
|
|
10
|
+
* Canonicalize content for exact-match dedup: casefold, unify unicode
|
|
11
|
+
* dashes/hyphens to '-', and collapse all whitespace runs. So trivially
|
|
12
|
+
* different renderings of the same fact compare equal.
|
|
13
|
+
*/
|
|
14
|
+
export declare function normalizeForExactMatch(content: string): string;
|
|
5
15
|
export declare function similarity(a: string, b: string): number;
|
package/dist/engine/dedup.js
CHANGED
|
@@ -4,6 +4,25 @@
|
|
|
4
4
|
/**
 * Split content into lower-cased word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks, digits or
 * underscores; everything else is a separator. The split is Unicode-aware: an
 * ASCII-only `\W+` split treats every non-ASCII character as a separator, so
 * "café" became "caf" and CJK / Cyrillic text produced no tokens at all.
 * For pure-ASCII input the result is the same as the `\W+` split.
 *
 * @param content Text to tokenize.
 * @returns Tokens in order of appearance (duplicates kept, empties removed).
 */
export function tokenize(content) {
    return content.toLowerCase().split(/[^\p{L}\p{M}\p{N}_]+/u).filter(Boolean);
}
|
|
7
|
+
// Thresholds that sort a similarity score into duplicate bands. There are two
// pairs because the two measures are not on the same scale: the cosine pair is
// for when an embedding index is available, the Jaccard pair is the offline
// fallback (token overlap runs hotter than cosine, hence different cut-offs).

/** Cosine score at or above which a candidate is in the high band. */
export const DUP_COSINE_HIGH = 0.92;
/** Cosine score at or above which a candidate is at least borderline. */
export const DUP_COSINE_BORDERLINE = 0.8;
/** Jaccard score at or above which a candidate is in the high band. */
export const DUP_JACCARD_HIGH = 0.9;
/** Jaccard score at or above which a candidate is at least borderline. */
export const DUP_JACCARD_BORDERLINE = 0.75;
|
|
14
|
+
/**
 * Canonicalize content for exact-match dedup so that trivially different
 * renderings of the same fact compare equal:
 *   - Unicode NFC normalization (composed and decomposed accents agree),
 *   - lower-casing,
 *   - unicode hyphens / dashes / minus signs unified to ASCII '-',
 *   - every whitespace run collapsed to one space, ends trimmed.
 *
 * @param content Raw node content.
 * @returns The canonical comparison key (may be the empty string).
 */
export function normalizeForExactMatch(content) {
    return content
        .normalize('NFC')
        .toLowerCase()
        // U+2010..U+2015 hyphen through horizontal bar, U+2212 minus sign,
        // U+FE58 / U+FE63 small em dash / small hyphen-minus, U+FF0D fullwidth
        // hyphen-minus. Written as escapes so the class survives re-encoding.
        .replace(/[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D]/g, '-')
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
7
26
|
export function similarity(a, b) {
|
|
8
27
|
const tokensA = new Set(tokenize(a));
|
|
9
28
|
const tokensB = new Set(tokenize(b));
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Score fusion primitives for hybrid recall.
|
|
3
|
+
*
|
|
4
|
+
* Recall blends several ranking channels — dense vector similarity, lexical
|
|
5
|
+
* token coverage, and (later) graph signals. Each channel produces raw scores
|
|
6
|
+
* on its own scale, so we min-max normalize per channel, combine with weights,
|
|
7
|
+
* and apply a mild recency reward. These are pure functions with no I/O so the
|
|
8
|
+
* ranking math is unit-testable in isolation from storage and embeddings.
|
|
9
|
+
*/
|
|
10
|
+
import type { Node, ScoredNode } from './types.js';
|
|
11
|
+
export interface FusionChannel {
|
|
12
|
+
scores: Map<string, number>;
|
|
13
|
+
weight: number;
|
|
14
|
+
}
|
|
15
|
+
export interface FuseHybridArgs {
|
|
16
|
+
/** Candidate nodes by id. Only ids present here are emitted. */
|
|
17
|
+
nodes: Map<string, Node>;
|
|
18
|
+
/** Raw (un-normalized) channels; each is min-max normalized internally. */
|
|
19
|
+
channels: FusionChannel[];
|
|
20
|
+
/** Recency reward coefficient applied as base * (1 + reward * recency). */
|
|
21
|
+
recencyReward?: number;
|
|
22
|
+
/** Recency half-life in days. */
|
|
23
|
+
halfLifeDays?: number;
|
|
24
|
+
/** Importance reward coefficient (reinforcement boost from mergeCount). */
|
|
25
|
+
importanceWeight?: number;
|
|
26
|
+
/** mergeCount at which the importance reward reaches half its maximum. */
|
|
27
|
+
importanceHalfMerges?: number;
|
|
28
|
+
/** Injectable clock for deterministic tests. */
|
|
29
|
+
now?: Date;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Reinforcement importance multiplier in [1, 1+weight). A node merged into more
|
|
33
|
+
* often (higher mergeCount) ranks higher, with saturating returns:
|
|
34
|
+
* m=0 → 1, m=halfMerges → 1 + weight/2, m→∞ → 1 + weight.
|
|
35
|
+
*/
|
|
36
|
+
export declare function importanceFactor(mergeCount: number, weight: number, halfMerges: number): number;
|
|
37
|
+
/**
|
|
38
|
+
* Scale a map of raw scores to [0,1] via min-max. All-equal (or single) inputs
|
|
39
|
+
* map to 1 — they are equally relevant, normalized to the top of the channel.
|
|
40
|
+
*/
|
|
41
|
+
export declare function minMaxNormalize(scores: Map<string, number>): Map<string, number>;
|
|
42
|
+
/**
|
|
43
|
+
* Weighted sum across channels over the union of ids. An id absent from a
|
|
44
|
+
* channel contributes 0 for that channel. Weights need not sum to 1.
|
|
45
|
+
*/
|
|
46
|
+
export declare function weightedFuse(channels: FusionChannel[]): Map<string, number>;
|
|
47
|
+
/**
|
|
48
|
+
* Reciprocal Rank Fusion over ranked id lists. Score for an id is the sum of
|
|
49
|
+
* 1/(k + rank) across the lists it appears in, with 1-based rank. Rank-based,
|
|
50
|
+
* so it fuses lists whose raw scores are not comparable (e.g. cosine vs Jaccard).
|
|
51
|
+
*/
|
|
52
|
+
export declare function rrfFuse(rankedLists: string[][], k?: number): Map<string, number>;
|
|
53
|
+
/**
|
|
54
|
+
* Exponential time-decay weight in (0,1]. 1 at age 0, 0.5 at one half-life.
|
|
55
|
+
* Future timestamps (negative age) are capped at 1.
|
|
56
|
+
*/
|
|
57
|
+
export declare function recencyWeight(createdAt: string, halfLifeDays?: number, now?: Date): number;
|
|
58
|
+
/**
|
|
59
|
+
* Lexical relevance from a precomputed query-token set: the fraction of those
|
|
60
|
+
* tokens present in the content. Lets callers tokenize the query once and reuse
|
|
61
|
+
* the set across many candidates. 0 when the set is empty.
|
|
62
|
+
*/
|
|
63
|
+
export declare function lexicalScoreTokens(queryTokens: Set<string>, content: string): number;
|
|
64
|
+
/**
|
|
65
|
+
* Lexical relevance as the fraction of distinct query tokens present in the
|
|
66
|
+
* content. 0 when nothing overlaps or the query is empty.
|
|
67
|
+
*/
|
|
68
|
+
export declare function lexicalScore(query: string, content: string): number;
|
|
69
|
+
/**
|
|
70
|
+
* Fuse weighted channels into a single ranked list of scored nodes. Channels
|
|
71
|
+
* are min-max normalized, weighted-summed, then multiplied by a mild recency
|
|
72
|
+
* reward and a mergeCount-based importance factor. Only ids present in `nodes` are emitted. Returns the full ranked list
|
|
73
|
+
* (caller paginates).
|
|
74
|
+
*/
|
|
75
|
+
export declare function fuseHybrid(args: FuseHybridArgs): ScoredNode[];
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Score fusion primitives for hybrid recall.
|
|
3
|
+
*
|
|
4
|
+
* Recall blends several ranking channels — dense vector similarity, lexical
|
|
5
|
+
* token coverage, and (later) graph signals. Each channel produces raw scores
|
|
6
|
+
* on its own scale, so we min-max normalize per channel, combine with weights,
|
|
7
|
+
* and apply a mild recency reward. These are pure functions with no I/O so the
|
|
8
|
+
* ranking math is unit-testable in isolation from storage and embeddings.
|
|
9
|
+
*/
|
|
10
|
+
import { tokenize } from './dedup.js';
|
|
11
|
+
import { config } from '../config.js';
|
|
12
|
+
/** Milliseconds in one day; turns a timestamp difference into an age in days. */
const MS_PER_DAY = 24 * 60 * 60 * 1000;
/** Half-life, in days, that recencyWeight uses when the caller supplies none. */
const DEFAULT_HALF_LIFE_DAYS = 14;
|
|
14
|
+
/**
 * Multiplier in [1, 1 + weight) that rewards nodes which have been reinforced
 * by duplicates. The boost saturates as mergeCount grows:
 *   mergeCount = 0          → 1
 *   mergeCount = halfMerges → 1 + weight / 2
 *   mergeCount → ∞          → approaches 1 + weight
 *
 * Any argument that is not a finite number greater than zero (bad config or
 * corrupted metadata) disables the boost: the function returns exactly 1
 * instead of NaN or a division by zero.
 */
export function importanceFactor(mergeCount, weight, halfMerges) {
    const usable = (value) => Number.isFinite(value) && value > 0;
    if (!(usable(mergeCount) && usable(weight) && usable(halfMerges))) {
        return 1;
    }
    const saturation = mergeCount / (mergeCount + halfMerges);
    return 1 + weight * saturation;
}
|
|
30
|
+
/**
 * Scale a map of raw scores to [0,1] via min-max over the finite values.
 *
 * - All-equal (or single) finite inputs map to 1 — they are equally relevant,
 *   normalized to the top of the channel.
 * - Non-finite raw scores never take part in the min/max and never produce
 *   NaN: +Infinity maps to 1, NaN and -Infinity map to 0. (Left unguarded, a
 *   single NaN or Infinity would turn into a NaN output score.)
 *
 * @param scores Raw scores keyed by node id; not mutated.
 * @returns A new map with the same keys, in the same order, valued in [0,1].
 */
export function minMaxNormalize(scores) {
    const out = new Map();
    if (scores.size === 0)
        return out;
    let min = Infinity;
    let max = -Infinity;
    for (const v of scores.values()) {
        if (!Number.isFinite(v))
            continue;
        if (v < min)
            min = v;
        if (v > max)
            max = v;
    }
    // Only consulted for finite values, so it is meaningful whenever it is read.
    const range = max - min;
    for (const [id, v] of scores) {
        if (!Number.isFinite(v)) {
            out.set(id, v === Infinity ? 1 : 0);
        }
        else {
            out.set(id, range === 0 ? 1 : (v - min) / range);
        }
    }
    return out;
}
|
|
52
|
+
/**
 * Combine channels into one score per id: for every id seen in any channel,
 * add up `weight * score` over the channels that contain it. A channel that
 * lacks an id simply adds nothing for it. Weights are used as given — they do
 * not have to sum to 1.
 */
export function weightedFuse(channels) {
    const totals = new Map();
    for (const channel of channels) {
        channel.scores.forEach((score, id) => {
            const previous = totals.get(id) ?? 0;
            totals.set(id, previous + channel.weight * score);
        });
    }
    return totals;
}
|
|
65
|
+
/**
 * Reciprocal Rank Fusion over ranked id lists. Score for an id is the sum of
 * 1/(k + rank) across the lists it appears in, with 1-based rank. Rank-based,
 * so it fuses lists whose raw scores are not comparable (e.g. cosine vs Jaccard).
 *
 * Each list contributes at most once per id: if an id is repeated inside a
 * list, only its first (best) position counts and ranks are taken over the
 * distinct ids. Lists without repeats are scored exactly by position.
 *
 * @param rankedLists Ranked id lists, best first.
 * @param k Smoothing constant added to every rank (default 60).
 * @returns Fused score per id, in first-seen order.
 */
export function rrfFuse(rankedLists, k = 60) {
    const out = new Map();
    for (const list of rankedLists) {
        // Ids already credited for this list; its size is the current 1-based rank.
        const counted = new Set();
        for (const id of list) {
            if (counted.has(id))
                continue;
            counted.add(id);
            out.set(id, (out.get(id) ?? 0) + 1 / (k + counted.size));
        }
    }
    return out;
}
|
|
79
|
+
/**
 * Exponential time-decay weight in (0,1]: 1 for a brand-new node, 0.5 once the
 * node is one half-life old, halving again every further half-life.
 *
 * Returns 1 (no decay) when the half-life is not a finite number above zero,
 * and also when the computed age is not a finite positive number — which
 * covers timestamps in the future as well as ones that do not parse.
 */
export function recencyWeight(createdAt, halfLifeDays = DEFAULT_HALF_LIFE_DAYS, now = new Date()) {
    // A zero, negative or non-finite half-life would invert or blow up the
    // exponent, so it is treated as "no decay".
    const halfLifeUsable = Number.isFinite(halfLifeDays) && halfLifeDays > 0;
    if (!halfLifeUsable) {
        return 1;
    }
    const nowMs = now.getTime();
    const createdMs = new Date(createdAt).getTime();
    const ageDays = (nowMs - createdMs) / MS_PER_DAY;
    const hasAged = Number.isFinite(ageDays) && ageDays > 0;
    if (!hasAged) {
        return 1;
    }
    return Math.exp((-ageDays * Math.LN2) / halfLifeDays);
}
|
|
93
|
+
/**
 * Lexical relevance against an already-tokenized query: the share of the
 * query-token set that also occurs among the content's tokens. Taking the set
 * as input lets a caller tokenize the query once and score many candidates.
 * An empty set scores 0.
 */
export function lexicalScoreTokens(queryTokens, content) {
    const total = queryTokens.size;
    if (total === 0) {
        return 0;
    }
    const present = new Set(tokenize(content));
    const matched = [...queryTokens].filter((token) => present.has(token)).length;
    return matched / total;
}
|
|
109
|
+
/**
 * Lexical relevance of `content` for a raw query string: the query is
 * tokenized, de-duplicated, and scored as the fraction of its distinct tokens
 * found in the content. Yields 0 when there is no overlap or the query has no
 * tokens.
 */
export function lexicalScore(query, content) {
    const distinctQueryTokens = new Set(tokenize(query));
    return lexicalScoreTokens(distinctQueryTokens, content);
}
|
|
116
|
+
/**
 * Turn weighted raw channels into one ranked list of scored nodes.
 *
 * Steps: min-max normalize every channel, take the weighted sum per id, then
 * scale each base score by a recency factor `(1 + reward * recencyWeight)` and
 * by the reinforcement importance factor derived from `metadata.mergeCount`.
 * Tuning values missing from `args` are read from the shared `config`.
 *
 * Ids that are not in `args.nodes` are dropped. The whole ranked list is
 * returned, highest score first; pagination is left to the caller.
 */
export function fuseHybrid(args) {
    const { nodes, channels, now } = args;
    const fused = weightedFuse(channels.map(({ scores, weight }) => ({
        scores: minMaxNormalize(scores),
        weight,
    })));
    const recencyCoeff = args.recencyReward ?? config.recencyReward;
    const halfLife = args.halfLifeDays ?? config.recencyHalfLifeDays;
    const importanceCoeff = args.importanceWeight ?? config.importanceWeight;
    const halfMerges = args.importanceHalfMerges ?? config.importanceHalfMerges;
    const ranked = [];
    fused.forEach((base, id) => {
        const node = nodes.get(id);
        if (!node) {
            return;
        }
        const recency = recencyWeight(node.createdAt, halfLife, now);
        // Only a numeric mergeCount counts; anything else means "never merged".
        const rawMerges = node.metadata?.mergeCount;
        const mergeCount = typeof rawMerges === 'number' ? rawMerges : 0;
        const importance = importanceFactor(mergeCount, importanceCoeff, halfMerges);
        ranked.push({ ...node, score: base * (1 + recencyCoeff * recency) * importance });
    });
    return ranked.sort((a, b) => b.score - a.score);
}
|
package/dist/engine/graph.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import type { StorageAdapter } from '../storage/adapter.js';
|
|
1
|
+
import type { StorageAdapter, HybridSearchOpts } from '../storage/adapter.js';
|
|
2
2
|
import type { Node, Edge, Session, NodeType, EdgeType, NodeQuery, PaginatedResult, NodeWithEdges, ExportOpts, GraphExport, HealthReport, SkillRegistryEntry } from './types.js';
|
|
3
|
+
export type DuplicateBand = 'exact' | 'high' | 'borderline';
|
|
3
4
|
export declare class ThinkingGraph {
|
|
4
5
|
storage: StorageAdapter;
|
|
5
6
|
private currentSessionId;
|
|
@@ -10,7 +11,34 @@ export declare class ThinkingGraph {
|
|
|
10
11
|
getNode(id: string): Promise<Node | null>;
|
|
11
12
|
findNodes(query: NodeQuery): Promise<PaginatedResult<Node>>;
|
|
12
13
|
searchNodes(text: string, limit?: number): Promise<Node[]>;
|
|
14
|
+
/**
|
|
15
|
+
* Ranked hybrid recall: fuses semantic (dense vector) + lexical channels with
|
|
16
|
+
* a recency reward. Falls back to substring queryNodes when the storage
|
|
17
|
+
* adapter has no hybrid index. Items are ScoredNode (carry a `score`).
|
|
18
|
+
*/
|
|
19
|
+
recallHybrid(opts: HybridSearchOpts & {
|
|
20
|
+
limit?: number;
|
|
21
|
+
offset?: number;
|
|
22
|
+
}): Promise<PaginatedResult<Node>>;
|
|
13
23
|
findSimilar(content: string, type: NodeType, projectId?: string, threshold?: number): Promise<Node | null>;
|
|
24
|
+
/**
|
|
25
|
+
* Banded duplicate detection for learn-time dedup:
|
|
26
|
+
* L1 — normalized exact match (casefold/dash/whitespace) → band 'exact'
|
|
27
|
+
* L2 — semantic cosine via the embedding index (Jaccard fallback offline):
|
|
28
|
+
* >= HIGH → 'high' (auto-merge), >= BORDERLINE → 'borderline' (near-dup)
|
|
29
|
+
* Returns the best match and its band, or null when nothing is similar enough.
|
|
30
|
+
*/
|
|
31
|
+
findDuplicate(content: string, type: NodeType, projectId?: string): Promise<{
|
|
32
|
+
node: Node;
|
|
33
|
+
band: DuplicateBand;
|
|
34
|
+
score: number;
|
|
35
|
+
} | null>;
|
|
36
|
+
/**
|
|
37
|
+
* Reinforce a node after a duplicate observation (non-silent merge): bump
|
|
38
|
+
* mergeCount/lastMergedAt and preserve a distinct merged variant (capped), so
|
|
39
|
+
* the merge is recorded and reversible rather than dropping the new content.
|
|
40
|
+
*/
|
|
41
|
+
reinforceNode(id: string, variant?: string): Promise<void>;
|
|
14
42
|
addEdge(input: Omit<Edge, 'id' | 'createdAt' | 'weight'> & {
|
|
15
43
|
id?: string;
|
|
16
44
|
weight?: number;
|
|
@@ -19,6 +47,12 @@ export declare class ThinkingGraph {
|
|
|
19
47
|
}>;
|
|
20
48
|
getEdges(nodeId: string, direction?: 'outgoing' | 'incoming' | 'both', type?: EdgeType): Promise<Edge[]>;
|
|
21
49
|
traverse(startId: string, edgeType: EdgeType, depth?: number): Promise<Node[]>;
|
|
50
|
+
/**
|
|
51
|
+
* 1-hop typed-edge neighbours of the seed nodes, across all edge types and
|
|
52
|
+
* both directions, excluding the seeds themselves and deduped. Used to pull
|
|
53
|
+
* structurally-related context into multi-hop recall.
|
|
54
|
+
*/
|
|
55
|
+
expandNeighbors(seedIds: string[], limit?: number): Promise<Node[]>;
|
|
22
56
|
findContradictions(sessionId?: string): Promise<{
|
|
23
57
|
a: Node;
|
|
24
58
|
b: Node;
|
package/dist/engine/graph.js
CHANGED
|
@@ -1,6 +1,20 @@
|
|
|
1
1
|
import { v4 as uuid } from 'uuid';
|
|
2
|
-
import { similarity } from './dedup.js';
|
|
2
|
+
import { similarity, normalizeForExactMatch, tokenize, DUP_COSINE_HIGH, DUP_COSINE_BORDERLINE, DUP_JACCARD_HIGH, DUP_JACCARD_BORDERLINE, } from './dedup.js';
|
|
3
3
|
import { seedAll } from './seed.js';
|
|
4
|
+
/** Most distinct merged variants reinforceNode will keep on a single node. */
const MAX_MERGED_VARIANTS = 5;
/**
 * How many same-type candidates findDuplicate loads for its normalized-exact
 * scan and its Jaccard fallback. The findNearest (cosine) lookup is not given
 * this cap. Consequence: once one type holds more nodes than this, an exact or
 * lexical duplicate of a node outside the loaded page can go undetected.
 */
const DUP_CANDIDATE_LIMIT = 1000;
|
|
10
|
+
/**
 * Classify a similarity score against two thresholds: 'high' when it reaches
 * `high`, otherwise 'borderline' when it reaches `borderline`, otherwise null.
 */
function bandFor(score, high, borderline) {
    if (score >= high) {
        return 'high';
    }
    return score >= borderline ? 'borderline' : null;
}
|
|
4
18
|
export class ThinkingGraph {
|
|
5
19
|
storage;
|
|
6
20
|
currentSessionId = null;
|
|
@@ -41,6 +55,33 @@ export class ThinkingGraph {
|
|
|
41
55
|
async searchNodes(text, limit = 20) {
|
|
42
56
|
return this.storage.searchContent(text, limit);
|
|
43
57
|
}
|
|
58
|
+
/**
|
|
59
|
+
* Ranked hybrid recall: fuses semantic (dense vector) + lexical channels with
|
|
60
|
+
* a recency reward. Falls back to substring queryNodes when the storage
|
|
61
|
+
* adapter has no hybrid index. Items are ScoredNode (carry a `score`).
|
|
62
|
+
*/
|
|
63
|
+
async recallHybrid(opts) {
|
|
64
|
+
if (!this.storage.searchHybrid) {
|
|
65
|
+
return this.findNodes({
|
|
66
|
+
query: opts.query,
|
|
67
|
+
type: opts.type,
|
|
68
|
+
sessionId: opts.sessionId,
|
|
69
|
+
projectId: opts.projectId,
|
|
70
|
+
crossProject: opts.crossProject,
|
|
71
|
+
since: opts.since,
|
|
72
|
+
limit: opts.limit,
|
|
73
|
+
offset: opts.offset,
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
const scored = await this.storage.searchHybrid(opts);
|
|
77
|
+
const offset = opts.offset ?? 0;
|
|
78
|
+
const limit = opts.limit ?? 20;
|
|
79
|
+
return {
|
|
80
|
+
items: scored.slice(offset, offset + limit),
|
|
81
|
+
totalCount: scored.length,
|
|
82
|
+
hasMore: offset + limit < scored.length,
|
|
83
|
+
};
|
|
84
|
+
}
|
|
44
85
|
async findSimilar(content, type, projectId, threshold = 0.9) {
|
|
45
86
|
// Get candidates of the same type
|
|
46
87
|
const candidates = await this.storage.queryNodes({
|
|
@@ -55,6 +96,78 @@ export class ThinkingGraph {
|
|
|
55
96
|
}
|
|
56
97
|
return null;
|
|
57
98
|
}
|
|
99
|
+
/**
|
|
100
|
+
* Banded duplicate detection for learn-time dedup:
|
|
101
|
+
* L1 — normalized exact match (casefold/dash/whitespace) → band 'exact'
|
|
102
|
+
* L2 — semantic cosine via the embedding index (Jaccard fallback offline):
|
|
103
|
+
* >= HIGH → 'high' (auto-merge), >= BORDERLINE → 'borderline' (near-dup)
|
|
104
|
+
* Returns the best match and its band, or null when nothing is similar enough.
|
|
105
|
+
*/
|
|
106
|
+
async findDuplicate(content, type, projectId) {
|
|
107
|
+
// Degenerate content (empty / whitespace / punctuation-only) has no signal
|
|
108
|
+
// to dedup on — never match it. (Without this, two token-empty strings score
|
|
109
|
+
// similarity 1.0 and would auto-merge into an unrelated node.)
|
|
110
|
+
const norm = normalizeForExactMatch(content);
|
|
111
|
+
if (norm === '' || tokenize(content).length === 0)
|
|
112
|
+
return null;
|
|
113
|
+
const candidates = (await this.storage.queryNodes({ type, projectId, limit: DUP_CANDIDATE_LIMIT })).items;
|
|
114
|
+
if (candidates.length === 0)
|
|
115
|
+
return null;
|
|
116
|
+
// L1: normalized exact match
|
|
117
|
+
for (const c of candidates) {
|
|
118
|
+
if (normalizeForExactMatch(c.content) === norm) {
|
|
119
|
+
return { node: c, band: 'exact', score: 1 };
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
// L2a: semantic cosine when the embedding index can place this content.
|
|
123
|
+
if (this.storage.findNearest) {
|
|
124
|
+
const near = await this.storage.findNearest(content, { type, projectId });
|
|
125
|
+
if (near) {
|
|
126
|
+
const band = bandFor(near.score, DUP_COSINE_HIGH, DUP_COSINE_BORDERLINE);
|
|
127
|
+
if (band) {
|
|
128
|
+
const node = await this.storage.getNode(near.id);
|
|
129
|
+
if (node)
|
|
130
|
+
return { node, band, score: near.score };
|
|
131
|
+
}
|
|
132
|
+
// cosine below borderline → fall through; Jaccard may still catch a
|
|
133
|
+
// lexical duplicate the embedder under-scored.
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
// L2b: Jaccard fallback — runs whenever cosine did not yield a confident
|
|
137
|
+
// band (embeddings disabled, candidate not embedded, or cosine too low).
|
|
138
|
+
let best = null;
|
|
139
|
+
for (const c of candidates) {
|
|
140
|
+
const s = similarity(content, c.content);
|
|
141
|
+
if (!best || s > best.score)
|
|
142
|
+
best = { node: c, score: s };
|
|
143
|
+
}
|
|
144
|
+
if (!best)
|
|
145
|
+
return null;
|
|
146
|
+
const band = bandFor(best.score, DUP_JACCARD_HIGH, DUP_JACCARD_BORDERLINE);
|
|
147
|
+
return band ? { node: best.node, band, score: best.score } : null;
|
|
148
|
+
}
|
|
149
|
+
/**
|
|
150
|
+
* Reinforce a node after a duplicate observation (non-silent merge): bump
|
|
151
|
+
* mergeCount/lastMergedAt and preserve a distinct merged variant (capped), so
|
|
152
|
+
* the merge is recorded and reversible rather than dropping the new content.
|
|
153
|
+
*/
|
|
154
|
+
async reinforceNode(id, variant) {
|
|
155
|
+
const node = await this.storage.getNode(id);
|
|
156
|
+
if (!node)
|
|
157
|
+
return;
|
|
158
|
+
const meta = { ...node.metadata };
|
|
159
|
+
meta.mergeCount = (typeof meta.mergeCount === 'number' ? meta.mergeCount : 0) + 1;
|
|
160
|
+
meta.lastMergedAt = new Date().toISOString();
|
|
161
|
+
if (variant && normalizeForExactMatch(variant) !== normalizeForExactMatch(node.content)) {
|
|
162
|
+
const variants = Array.isArray(meta.mergedVariants) ? meta.mergedVariants : [];
|
|
163
|
+
const seen = normalizeForExactMatch(variant);
|
|
164
|
+
const known = variants.some(v => normalizeForExactMatch(v) === seen);
|
|
165
|
+
if (!known && variants.length < MAX_MERGED_VARIANTS) {
|
|
166
|
+
meta.mergedVariants = [...variants, variant];
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
await this.storage.updateNode(id, { metadata: meta, updatedAt: new Date().toISOString() });
|
|
170
|
+
}
|
|
58
171
|
// ─── Edge operations ─────────────────────────────────
|
|
59
172
|
async addEdge(input) {
|
|
60
173
|
const edge = {
|
|
@@ -82,6 +195,39 @@ export class ThinkingGraph {
|
|
|
82
195
|
async traverse(startId, edgeType, depth = 1) {
|
|
83
196
|
return this.storage.traverseEdges(startId, edgeType, depth);
|
|
84
197
|
}
|
|
198
|
+
/**
|
|
199
|
+
* 1-hop typed-edge neighbours of the seed nodes, across all edge types and
|
|
200
|
+
* both directions, excluding the seeds themselves and deduped. Used to pull
|
|
201
|
+
* structurally-related context into multi-hop recall.
|
|
202
|
+
*/
|
|
203
|
+
async expandNeighbors(seedIds, limit = 10) {
|
|
204
|
+
const seen = new Set(seedIds);
|
|
205
|
+
const out = [];
|
|
206
|
+
const collect = async (edges, id) => {
|
|
207
|
+
for (const e of edges) {
|
|
208
|
+
const neighborId = e.sourceId === id ? e.targetId : e.sourceId;
|
|
209
|
+
if (seen.has(neighborId))
|
|
210
|
+
continue;
|
|
211
|
+
seen.add(neighborId);
|
|
212
|
+
const node = await this.storage.getNode(neighborId);
|
|
213
|
+
if (node)
|
|
214
|
+
out.push(node);
|
|
215
|
+
if (out.length >= limit)
|
|
216
|
+
return;
|
|
217
|
+
}
|
|
218
|
+
};
|
|
219
|
+
// Fetch outgoing first, then incoming, checking the cap between each — so a
|
|
220
|
+
// high-degree node's incoming edges aren't fetched once the limit is met.
|
|
221
|
+
for (const id of seedIds) {
|
|
222
|
+
if (out.length >= limit)
|
|
223
|
+
break;
|
|
224
|
+
await collect(await this.storage.getEdgesFrom(id), id);
|
|
225
|
+
if (out.length >= limit)
|
|
226
|
+
break;
|
|
227
|
+
await collect(await this.storage.getEdgesTo(id), id);
|
|
228
|
+
}
|
|
229
|
+
return out;
|
|
230
|
+
}
|
|
85
231
|
async findContradictions(sessionId) {
|
|
86
232
|
const query = sessionId ? { sessionId } : {};
|
|
87
233
|
const allNodes = await this.storage.queryNodes({ ...query, limit: 1000 });
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight, pure-regex query intent classification. Used to gate expensive
|
|
3
|
+
* recall stages (e.g. the graph-walk expansion) so single-entity lookups stay
|
|
4
|
+
* cheap and clean, while relationship/traversal queries get structural context.
|
|
5
|
+
*/
|
|
6
|
+
export interface QueryIntent {
|
|
7
|
+
/** Asks about relationships, causation, or traversal across nodes. */
|
|
8
|
+
multiHop: boolean;
|
|
9
|
+
/** Asks about history / sequence / evolution over time. */
|
|
10
|
+
narrative: boolean;
|
|
11
|
+
/** First-person framing ("my", "our", "we"). */
|
|
12
|
+
personal: boolean;
|
|
13
|
+
}
|
|
14
|
+
export declare function classifyIntent(query: string): QueryIntent;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight, pure-regex query intent classification. Used to gate expensive
|
|
3
|
+
* recall stages (e.g. the graph-walk expansion) so single-entity lookups stay
|
|
4
|
+
* cheap and clean, while relationship/traversal queries get structural context.
|
|
5
|
+
*/
|
|
6
|
+
// Cue words for relationship, causation and traversal questions.
const MULTIHOP_RE = /\b(relate[ds]?|related to|relationship|depend(s|ed|ency|encies)?|because|cause[ds]?|causes|lead(s|ing)? to|result(s|ed)? in|connect(s|ed|ion|ions)?|chain|affect(s|ed)?|impact(s|ed)?|influence[ds]?|between|downstream|upstream|trace|tied to|linked)\b/i;
// Cue words for history, sequence and evolution questions.
const NARRATIVE_RE = /\b(history|timeline|evolution|evolved?|over time|how did|journey|progress(ion)?|sequence|chronolog\w*|previously|originally)\b/i;
// First-person cue words; \b anchors keep them from matching inside longer words.
const PERSONAL_RE = /\b(i|i'm|i've|me|my|mine|we|we're|we've|us|our|ours)\b/i;
/**
 * Classify a query by testing it against the three cue-word patterns above.
 * A null or undefined query is treated as the empty string, so every flag is
 * false for it.
 *
 * @returns `{ multiHop, narrative, personal }`, each true when its pattern
 *          matches somewhere in the query (case-insensitive).
 */
export function classifyIntent(query) {
    const text = query ?? '';
    const matches = (pattern) => pattern.test(text);
    return {
        multiHop: matches(MULTIHOP_RE),
        narrative: matches(NARRATIVE_RE),
        personal: matches(PERSONAL_RE),
    };
}
|
package/dist/engine/types.d.ts
CHANGED
|
@@ -69,6 +69,10 @@ export interface PaginatedResult<T> {
|
|
|
69
69
|
totalCount: number;
|
|
70
70
|
hasMore: boolean;
|
|
71
71
|
}
|
|
72
|
+
export interface ScoredNode extends Node {
|
|
73
|
+
/** Fused relevance score from hybrid recall (higher is more relevant). */
|
|
74
|
+
score: number;
|
|
75
|
+
}
|
|
72
76
|
export interface NodeWithEdges extends Node {
|
|
73
77
|
edges: {
|
|
74
78
|
type: EdgeType;
|
|
@@ -1,11 +1,40 @@
|
|
|
1
|
-
import type { Node, Edge, Session, SkillRegistryEntry, NodeQuery, PaginatedResult, EdgeType, ExportOpts, GraphExport, GraphStats } from '../engine/types.js';
|
|
1
|
+
import type { Node, Edge, Session, SkillRegistryEntry, NodeQuery, PaginatedResult, EdgeType, NodeType, ScoredNode, ExportOpts, GraphExport, GraphStats } from '../engine/types.js';
|
|
2
|
+
/** Attribute filters for a hybrid (semantic + lexical) recall. */
|
|
3
|
+
export interface HybridSearchOpts {
|
|
4
|
+
query: string;
|
|
5
|
+
type?: NodeType | NodeType[];
|
|
6
|
+
sessionId?: string;
|
|
7
|
+
projectId?: string;
|
|
8
|
+
crossProject?: boolean;
|
|
9
|
+
since?: string;
|
|
10
|
+
}
|
|
2
11
|
export interface StorageAdapter {
|
|
3
12
|
initialize(): Promise<void>;
|
|
4
13
|
close(): Promise<void>;
|
|
5
14
|
insertNode(node: Node): Promise<void>;
|
|
15
|
+
updateNode(id: string, fields: Partial<Node>): Promise<void>;
|
|
6
16
|
getNode(id: string): Promise<Node | null>;
|
|
7
17
|
queryNodes(query: NodeQuery): Promise<PaginatedResult<Node>>;
|
|
8
18
|
searchContent(text: string, limit?: number): Promise<Node[]>;
|
|
19
|
+
/**
|
|
20
|
+
* Highest-cosine same-type (and same-project, when given) node for `content`.
|
|
21
|
+
* Optional: adapters without an embedding index omit it (callers fall back to
|
|
22
|
+
* Jaccard similarity). Returns null when no embedding is available or no node
|
|
23
|
+
* matches the filter.
|
|
24
|
+
*/
|
|
25
|
+
findNearest?(content: string, opts: {
|
|
26
|
+
type: NodeType;
|
|
27
|
+
projectId?: string;
|
|
28
|
+
}): Promise<{
|
|
29
|
+
id: string;
|
|
30
|
+
score: number;
|
|
31
|
+
} | null>;
|
|
32
|
+
/**
|
|
33
|
+
* Ranked hybrid recall fusing semantic (dense vector) and lexical channels
|
|
34
|
+
* with a recency reward. Optional: adapters without an embedding index may
|
|
35
|
+
* omit it (callers fall back to queryNodes substring search).
|
|
36
|
+
*/
|
|
37
|
+
searchHybrid?(opts: HybridSearchOpts): Promise<ScoredNode[]>;
|
|
9
38
|
insertEdge(edge: Edge): Promise<boolean>;
|
|
10
39
|
getEdgesFrom(nodeId: string, type?: EdgeType): Promise<Edge[]>;
|
|
11
40
|
getEdgesTo(nodeId: string, type?: EdgeType): Promise<Edge[]>;
|