opencode-diane 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +180 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/WIKI.md +1430 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +1632 -0
- package/dist/ingest/adaptive.d.ts +47 -0
- package/dist/ingest/adaptive.js +182 -0
- package/dist/ingest/code-health.d.ts +58 -0
- package/dist/ingest/code-health.js +202 -0
- package/dist/ingest/code-map.d.ts +71 -0
- package/dist/ingest/code-map.js +670 -0
- package/dist/ingest/cross-refs.d.ts +59 -0
- package/dist/ingest/cross-refs.js +1207 -0
- package/dist/ingest/docs.d.ts +49 -0
- package/dist/ingest/docs.js +325 -0
- package/dist/ingest/git.d.ts +77 -0
- package/dist/ingest/git.js +390 -0
- package/dist/ingest/live-session.d.ts +101 -0
- package/dist/ingest/live-session.js +173 -0
- package/dist/ingest/project-notes.d.ts +28 -0
- package/dist/ingest/project-notes.js +102 -0
- package/dist/ingest/project.d.ts +35 -0
- package/dist/ingest/project.js +430 -0
- package/dist/ingest/session-snapshot.d.ts +63 -0
- package/dist/ingest/session-snapshot.js +94 -0
- package/dist/ingest/sessions.d.ts +29 -0
- package/dist/ingest/sessions.js +164 -0
- package/dist/ingest/tables.d.ts +52 -0
- package/dist/ingest/tables.js +360 -0
- package/dist/mining/skill-miner.d.ts +53 -0
- package/dist/mining/skill-miner.js +234 -0
- package/dist/search/bm25.d.ts +81 -0
- package/dist/search/bm25.js +334 -0
- package/dist/search/e5-embedder.d.ts +30 -0
- package/dist/search/e5-embedder.js +91 -0
- package/dist/search/embed-pass.d.ts +26 -0
- package/dist/search/embed-pass.js +43 -0
- package/dist/search/embedder.d.ts +58 -0
- package/dist/search/embedder.js +85 -0
- package/dist/search/inverted-index.d.ts +51 -0
- package/dist/search/inverted-index.js +139 -0
- package/dist/search/ppr.d.ts +44 -0
- package/dist/search/ppr.js +118 -0
- package/dist/search/tokenize.d.ts +26 -0
- package/dist/search/tokenize.js +98 -0
- package/dist/store/eviction.d.ts +16 -0
- package/dist/store/eviction.js +37 -0
- package/dist/store/repository.d.ts +222 -0
- package/dist/store/repository.js +420 -0
- package/dist/store/sqlite-store.d.ts +89 -0
- package/dist/store/sqlite-store.js +252 -0
- package/dist/store/vector-store.d.ts +66 -0
- package/dist/store/vector-store.js +160 -0
- package/dist/types.d.ts +385 -0
- package/dist/types.js +9 -0
- package/dist/utils/file-log.d.ts +87 -0
- package/dist/utils/file-log.js +215 -0
- package/dist/utils/peer-detection.d.ts +45 -0
- package/dist/utils/peer-detection.js +90 -0
- package/dist/utils/shell.d.ts +43 -0
- package/dist/utils/shell.js +110 -0
- package/dist/utils/usage-skill.d.ts +42 -0
- package/dist/utils/usage-skill.js +129 -0
- package/dist/utils/xlsx.d.ts +36 -0
- package/dist/utils/xlsx.js +270 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-css.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-html.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-json.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +80 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory inverted index plus auxiliary indexes used for hierarchical
|
|
3
|
+
* filtering. Rebuilt from the store on plugin startup; kept consistent
|
|
4
|
+
* by the repository on every CRUD.
|
|
5
|
+
*/
|
|
6
|
+
import type { Category, Memory } from "../types.js";
|
|
7
|
+
/** Per-memory bookkeeping used by BM25 scoring; the index keeps one entry per memory id. */
|
|
8
|
+
interface DocInfo {
|
|
9
|
+
/** Token → number of occurrences in this memory's tokenized text. */
|
|
10
|
+
tf: Map<string, number>;
|
|
11
|
+
/** Total token count, repeats included (used as |d| in BM25). */
|
|
12
|
+
length: number;
|
|
13
|
+
}
|
|
14
|
+
export declare class InvertedIndex {
|
|
15
|
+
/** token → set of memory ids that contain it */
|
|
16
|
+
readonly postings: Map<string, Set<string>>;
|
|
17
|
+
/** memory id → per-doc bookkeeping */
|
|
18
|
+
readonly docs: Map<string, DocInfo>;
|
|
19
|
+
/** category → set of memory ids */
|
|
20
|
+
readonly byCategory: Map<Category, Set<string>>;
|
|
21
|
+
/** subject → set of memory ids */
|
|
22
|
+
readonly bySubject: Map<string, Set<string>>;
|
|
23
|
+
/**
|
|
24
|
+
* Undirected file co-change graph: file path → set of file paths it
|
|
25
|
+
* was modified together with. Built from the `co-change` memories
|
|
26
|
+
* the git ingester produces (tags = ["co-change", fileA, fileB]).
|
|
27
|
+
* Used by BM25 search to propagate score to structurally-related
|
|
28
|
+
* files — the Aider PageRank idea, one hop, over edges we already
|
|
29
|
+
* compute. Survives restarts because co-change memories persist.
|
|
30
|
+
*/
|
|
31
|
+
readonly coChange: Map<string, Set<string>>;
|
|
32
|
+
/** Running sum of doc lengths — for avgdl. */
|
|
33
|
+
totalLength: number;
|
|
34
|
+
/** Number of indexed docs. */
|
|
35
|
+
docCount: number;
|
|
36
|
+
/** Clears every index and counter, then re-adds each memory in the order given. */
rebuildFromAll(memories: Memory[]): void;
|
|
37
|
+
/**
 * Indexes one memory: tokenizes subject + content + tags into postings and
 * per-doc stats, records it under its category and subject, and registers
 * its co-change edge when it carries one.
 */
add(memory: Memory): void;
|
|
38
|
+
/**
 * Removes a memory's postings and stats (when it was indexed) and drops it
 * from the category, subject and co-change indexes.
 */
remove(memory: Memory): void;
|
|
39
|
+
/** Mean token count per indexed doc; 0 when nothing is indexed. */
avgDocLength(): number;
|
|
40
|
+
/** Files structurally coupled to `file` (one hop). `undefined` when `file` has no recorded neighbours. */
|
|
41
|
+
coChangeNeighbors(file: string): ReadonlySet<string> | undefined;
|
|
42
|
+
/**
|
|
43
|
+
* A co-change memory carries tags `["co-change", fileA, fileB]`.
|
|
44
|
+
* If this memory is one, record the undirected edge. Anything else
|
|
45
|
+
* is ignored. Edges are de-duplicated by Set semantics, so the same
|
|
46
|
+
* pair appearing in multiple memories is harmless.
|
|
47
|
+
*/
|
|
48
|
+
private indexCoChangeEdge;
|
|
49
|
+
private removeCoChangeEdge;
|
|
50
|
+
}
|
|
51
|
+
export {};
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory inverted index plus auxiliary indexes used for hierarchical
|
|
3
|
+
* filtering. Rebuilt from the store on plugin startup; kept consistent
|
|
4
|
+
* by the repository on every CRUD.
|
|
5
|
+
*/
|
|
6
|
+
import { termFreq, tokenize } from "./tokenize.js";
|
|
7
|
+
export class InvertedIndex {
    /** token → set of memory ids that contain it */
    postings = new Map();
    /** memory id → per-doc bookkeeping (`{ tf, length }`) */
    docs = new Map();
    /** category → set of memory ids */
    byCategory = new Map();
    /** subject → set of memory ids */
    bySubject = new Map();
    /**
     * Undirected file co-change graph: file path → set of file paths it
     * was modified together with. Built from the `co-change` memories
     * the git ingester produces (tags = ["co-change", fileA, fileB]).
     * Used by BM25 search to propagate score to structurally-related
     * files — the Aider PageRank idea, one hop, over edges we already
     * compute. Survives restarts because co-change memories persist.
     */
    coChange = new Map();
    /**
     * Canonical edge key → number of indexed memories that carry that
     * file pair. An edge leaves `coChange` only when its last carrier is
     * removed, so the live graph always equals what `rebuildFromAll`
     * would produce from the surviving memories.
     */
    #coChangeRefs = new Map();
    /** Running sum of doc lengths — for avgdl. */
    totalLength = 0;
    /** Number of indexed docs. */
    docCount = 0;
    /**
     * Drops all indexed state and re-indexes `memories` from scratch.
     *
     * @param memories every memory that should be searchable afterwards
     */
    rebuildFromAll(memories) {
        this.postings.clear();
        this.docs.clear();
        this.byCategory.clear();
        this.bySubject.clear();
        this.coChange.clear();
        this.#coChangeRefs.clear();
        this.totalLength = 0;
        this.docCount = 0;
        for (const m of memories)
            this.add(m);
    }
    /**
     * Indexes one memory. This method does not check whether the id is
     * already indexed, so callers must `remove()` a memory before
     * re-adding it under the same id; otherwise the counters are
     * incremented twice.
     *
     * @param memory memory to index (reads id, subject, content, tags, category)
     */
    add(memory) {
        // Build searchable text from subject + content + tags
        const text = `${memory.subject} ${memory.content} ${memory.tags.join(" ")}`;
        const tokens = tokenize(text);
        const tf = termFreq(tokens);
        this.docs.set(memory.id, { tf, length: tokens.length });
        this.totalLength += tokens.length;
        this.docCount += 1;
        for (const token of tf.keys()) {
            let set = this.postings.get(token);
            if (!set) {
                set = new Set();
                this.postings.set(token, set);
            }
            set.add(memory.id);
        }
        let catSet = this.byCategory.get(memory.category);
        if (!catSet) {
            catSet = new Set();
            this.byCategory.set(memory.category, catSet);
        }
        catSet.add(memory.id);
        let subSet = this.bySubject.get(memory.subject);
        if (!subSet) {
            subSet = new Set();
            this.bySubject.set(memory.subject, subSet);
        }
        subSet.add(memory.id);
        this.indexCoChangeEdge(memory);
    }
    /**
     * Un-indexes one memory. Sets that become empty are deleted from
     * their maps so long-running sessions do not accumulate empty
     * buckets for categories/subjects that no longer exist.
     *
     * @param memory memory to remove (reads id, category, subject, tags)
     */
    remove(memory) {
        const info = this.docs.get(memory.id);
        if (info) {
            this.totalLength -= info.length;
            this.docCount -= 1;
            for (const token of info.tf.keys()) {
                const set = this.postings.get(token);
                if (set) {
                    set.delete(memory.id);
                    if (set.size === 0)
                        this.postings.delete(token);
                }
            }
            this.docs.delete(memory.id);
        }
        const catSet = this.byCategory.get(memory.category);
        if (catSet) {
            catSet.delete(memory.id);
            if (catSet.size === 0)
                this.byCategory.delete(memory.category);
        }
        const subSet = this.bySubject.get(memory.subject);
        if (subSet) {
            subSet.delete(memory.id);
            if (subSet.size === 0)
                this.bySubject.delete(memory.subject);
        }
        // Only a memory that was actually indexed holds a reference on its
        // edge; a repeated or spurious remove() must not release an edge
        // that another memory still carries.
        if (info)
            this.removeCoChangeEdge(memory);
    }
    /** Mean token count per indexed doc; 0 when nothing is indexed. */
    avgDocLength() {
        return this.docCount > 0 ? this.totalLength / this.docCount : 0;
    }
    /**
     * Files structurally coupled to `file` (one hop).
     *
     * @returns the neighbour set, or `undefined` when `file` has no
     *          recorded co-change neighbours
     */
    coChangeNeighbors(file) {
        return this.coChange.get(file);
    }
    /**
     * A co-change memory carries tags `["co-change", fileA, fileB]`.
     * If this memory is one, take a reference on the undirected edge and
     * insert it into the graph when this is its first carrier. Anything
     * else is ignored.
     */
    indexCoChangeEdge(memory) {
        const pair = coChangePair(memory);
        if (!pair)
            return;
        const [a, b] = pair;
        const key = this.#edgeKey(a, b);
        const refs = (this.#coChangeRefs.get(key) ?? 0) + 1;
        this.#coChangeRefs.set(key, refs);
        if (refs === 1) {
            addEdge(this.coChange, a, b);
            addEdge(this.coChange, b, a);
        }
    }
    /**
     * Releases this memory's reference on its co-change edge and deletes
     * the edge (plus any node left without neighbours) once no indexed
     * memory carries the pair any more.
     */
    removeCoChangeEdge(memory) {
        const pair = coChangePair(memory);
        if (!pair)
            return;
        const [a, b] = pair;
        const key = this.#edgeKey(a, b);
        const refs = this.#coChangeRefs.get(key);
        if (refs === undefined)
            return; // edge was never registered through add()
        if (refs > 1) {
            this.#coChangeRefs.set(key, refs - 1);
            return;
        }
        this.#coChangeRefs.delete(key);
        this.coChange.get(a)?.delete(b);
        this.coChange.get(b)?.delete(a);
        if (this.coChange.get(a)?.size === 0)
            this.coChange.delete(a);
        if (this.coChange.get(b)?.size === 0)
            this.coChange.delete(b);
    }
    /**
     * Order-independent key for an undirected edge, so `[a, b]` and
     * `[b, a]` share one reference count. NUL is used as the separator.
     */
    #edgeKey(a, b) {
        return a < b ? `${a}\u0000${b}` : `${b}\u0000${a}`;
    }
}
|
|
123
|
+
/**
 * Extracts the file pair from a co-change memory.
 *
 * A co-change memory is tagged `["co-change", fileA, fileB]`. The pair
 * is the first two tags that are not the marker itself.
 *
 * @param memory object with a `tags` string array
 * @returns `[fileA, fileB]`, or `null` when the marker tag is absent or
 *          fewer than two other tags exist
 */
function coChangePair(memory) {
    const MARKER = "co-change";
    const tags = memory.tags;
    if (!tags.includes(MARKER))
        return null;
    const others = [];
    for (const tag of tags) {
        if (tag !== MARKER)
            others.push(tag);
    }
    return others.length >= 2 ? [others[0], others[1]] : null;
}
|
|
132
|
+
/**
 * Records the directed edge `from → to` in an adjacency map, creating
 * the neighbour set for `from` on first use. Adding an existing edge is
 * a no-op.
 *
 * @param graph adjacency map: node → set of neighbour nodes (mutated)
 * @param from  source node
 * @param to    neighbour to add
 */
function addEdge(graph, from, to) {
    const neighbours = graph.get(from);
    if (neighbours) {
        neighbours.add(to);
        return;
    }
    const fresh = new Set();
    graph.set(from, fresh);
    fresh.add(to);
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ppr.ts — Personalized PageRank over the co-change graph.
|
|
3
|
+
*
|
|
4
|
+
* The default co-change boost (bm25.ts step 3b) is a single hop: a
|
|
5
|
+
* textual hit about file X lifts memories about X's *direct* co-change
|
|
6
|
+
* neighbours, and stops there. Personalized PageRank generalises that
|
|
7
|
+
* to the whole graph — a random walk with restart, where the restart
|
|
8
|
+
* (teleport) distribution is concentrated on the query's textual hits.
|
|
9
|
+
* Relevance then spreads across multiple hops and is graded by how
|
|
10
|
+
* reachable each file is from that seed set: a direct neighbour scores
|
|
11
|
+
* higher than a two-hop file, which still scores above an unrelated
|
|
12
|
+
* one.
|
|
13
|
+
*
|
|
14
|
+
* This is opt-in (`personalizedPageRank`, default off). When off,
|
|
15
|
+
* retrieval uses the cheaper one-hop boost and this file is never
|
|
16
|
+
* reached — so the default path keeps its O(seeds × neighbours) cost
|
|
17
|
+
* and its full inspectability.
|
|
18
|
+
*
|
|
19
|
+
* Deterministic: a fixed teleport probability, a fixed node-iteration
|
|
20
|
+
* order, and a fixed convergence tolerance with an iteration cap mean
|
|
21
|
+
* the same graph and personalization always yield the same scores.
|
|
22
|
+
*/
|
|
23
|
+
// Optional tuning knobs for personalizedPageRank; every omitted field falls back to the default noted on it.
export interface PprOptions {
|
|
24
|
+
/** Teleport / restart probability. Default 0.15 (i.e. 0.85 damping). */
|
|
25
|
+
alpha?: number;
|
|
26
|
+
/** Hard cap on power-iteration steps. Default 60. */
|
|
27
|
+
maxIterations?: number;
|
|
28
|
+
/** L1 convergence threshold — iteration stops once the summed absolute change in scores drops below it. Default 1e-7. */
|
|
29
|
+
tolerance?: number;
|
|
30
|
+
/** Safety valve: return empty (skip PPR) if the node count — graph nodes plus personalized nodes — exceeds this. Default 100000. */
|
|
31
|
+
maxNodes?: number;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Personalized PageRank over an undirected, unweighted graph.
|
|
35
|
+
*
|
|
36
|
+
* @param graph adjacency: node → set of neighbour nodes
|
|
37
|
+
* @param personalization restart distribution: node → weight. Weights
|
|
38
|
+
* need not be normalised; non-positive weights
|
|
39
|
+
* are ignored.
|
|
40
|
+
* @returns node → stationary score, summing to ~1. Empty when the
|
|
41
|
+
* personalization carries no mass, when neither the graph nor the
|
|
42
|
+
* personalization names any node, or when the node count exceeds `maxNodes`.
|
|
43
|
+
*/
|
|
44
|
+
export declare function personalizedPageRank(graph: ReadonlyMap<string, ReadonlySet<string>>, personalization: ReadonlyMap<string, number>, options?: PprOptions): Map<string, number>;
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ppr.ts — Personalized PageRank over the co-change graph.
|
|
3
|
+
*
|
|
4
|
+
* The default co-change boost (bm25.ts step 3b) is a single hop: a
|
|
5
|
+
* textual hit about file X lifts memories about X's *direct* co-change
|
|
6
|
+
* neighbours, and stops there. Personalized PageRank generalises that
|
|
7
|
+
* to the whole graph — a random walk with restart, where the restart
|
|
8
|
+
* (teleport) distribution is concentrated on the query's textual hits.
|
|
9
|
+
* Relevance then spreads across multiple hops and is graded by how
|
|
10
|
+
* reachable each file is from that seed set: a direct neighbour scores
|
|
11
|
+
* higher than a two-hop file, which still scores above an unrelated
|
|
12
|
+
* one.
|
|
13
|
+
*
|
|
14
|
+
* This is opt-in (`personalizedPageRank`, default off). When off,
|
|
15
|
+
* retrieval uses the cheaper one-hop boost and this file is never
|
|
16
|
+
* reached — so the default path keeps its O(seeds × neighbours) cost
|
|
17
|
+
* and its full inspectability.
|
|
18
|
+
*
|
|
19
|
+
* Deterministic: a fixed teleport probability, a fixed node-iteration
|
|
20
|
+
* order, and a fixed convergence tolerance with an iteration cap mean
|
|
21
|
+
* the same graph and personalization always yield the same scores.
|
|
22
|
+
*/
|
|
23
|
+
/**
 * Personalized PageRank over an undirected, unweighted graph.
 *
 * @param graph           adjacency: node → set of neighbour nodes
 * @param personalization restart distribution: node → weight. Weights
 *                        need not be normalised; non-positive and
 *                        non-finite (NaN / ±Infinity) weights are ignored.
 * @param options         optional overrides: `alpha` (default 0.15),
 *                        `maxIterations` (60), `tolerance` (1e-7),
 *                        `maxNodes` (100000)
 * @returns node → stationary score, summing to ~1, with one entry per
 *          node named by the graph or the personalization. Empty when
 *          no node is named at all, when the personalization carries no
 *          usable mass, or when the node count exceeds `maxNodes`.
 */
export function personalizedPageRank(graph, personalization, options = {}) {
    const alpha = options.alpha ?? 0.15;
    const maxIterations = options.maxIterations ?? 60;
    const tolerance = options.tolerance ?? 1e-7;
    const maxNodes = options.maxNodes ?? 100_000;
    // ── node set: every graph node, plus every personalized node ─────
    // A personalized file with no co-change history still belongs in the
    // walk — it just becomes a dangling node holding its restart mass.
    const idOf = new Map();
    const nodes = [];
    const intern = (name) => {
        let i = idOf.get(name);
        if (i === undefined) {
            i = nodes.length;
            idOf.set(name, i);
            nodes.push(name);
        }
        return i;
    };
    for (const [node, neighbours] of graph) {
        intern(node);
        for (const nb of neighbours)
            intern(nb);
        // Trip the safety valve as soon as the cap is crossed instead of
        // first building the whole node table for an oversized graph.
        if (nodes.length > maxNodes)
            return new Map();
    }
    for (const node of personalization.keys())
        intern(node);
    const n = nodes.length;
    if (n === 0 || n > maxNodes)
        return new Map();
    // ── personalization vector p, normalised to sum 1 ────────────────
    const p = new Float64Array(n);
    let pTotal = 0;
    for (const [node, weight] of personalization) {
        // An infinite weight would make pTotal infinite and turn every
        // normalised entry — and then every score — into NaN.
        if (weight > 0 && Number.isFinite(weight)) {
            p[idOf.get(node)] += weight;
            pTotal += weight;
        }
    }
    // No usable mass, or the finite weights overflowed when summed.
    if (pTotal === 0 || !Number.isFinite(pTotal))
        return new Map(); // nothing to personalize toward
    for (let i = 0; i < n; i++)
        p[i] /= pTotal;
    // ── adjacency as integer arrays (fixed, deterministic order) ─────
    const adj = nodes.map(() => []);
    for (const [node, neighbours] of graph) {
        const j = idOf.get(node);
        for (const nb of neighbours)
            adj[j].push(idOf.get(nb));
    }
    // ── power iteration with dangling-mass redistribution ────────────
    //   r_new[i] = (α + (1-α)·danglingMass)·p[i] + (1-α)·Σ_{j→i} r[j]/deg(j)
    // A dangling node (no out-edges) would otherwise leak probability;
    // sending its mass back through p keeps Σr = 1 every iteration.
    let r = Float64Array.from(p);
    let next = new Float64Array(n);
    for (let iter = 0; iter < maxIterations; iter++) {
        let dangling = 0;
        for (let i = 0; i < n; i++)
            if (adj[i].length === 0)
                dangling += r[i];
        const base = alpha + (1 - alpha) * dangling;
        for (let i = 0; i < n; i++)
            next[i] = base * p[i];
        for (let j = 0; j < n; j++) {
            const out = adj[j];
            if (out.length === 0)
                continue;
            const share = ((1 - alpha) * r[j]) / out.length;
            for (const i of out)
                next[i] += share;
        }
        // L1 distance between successive score vectors.
        let delta = 0;
        for (let i = 0; i < n; i++)
            delta += Math.abs(next[i] - r[i]);
        // Swap buffers so the next iteration reuses the old array.
        const swap = r;
        r = next;
        next = swap;
        if (delta < tolerance)
            break;
    }
    const result = new Map();
    for (let i = 0; i < n; i++)
        result.set(nodes[i], r[i]);
    return result;
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic tokenizer for memory search.
|
|
3
|
+
*
|
|
4
|
+
* Two scripts, two strategies, one pass:
|
|
5
|
+
*
|
|
6
|
+
* - Latin / digit runs are split identifier-aware, so an agent's
|
|
7
|
+
* camelCase / snake_case queries match well:
|
|
8
|
+
* "AuthService.login_user" → ["auth","service","authservice","login","user","login_user"]
|
|
9
|
+
*
|
|
10
|
+
* - CJK runs (Chinese, Japanese kana, Korean) are emitted as
|
|
11
|
+
* overlapping bigrams:
|
|
12
|
+
* "数据库连接" → ["数据","据库","库连","连接"]
|
|
13
|
+
* CJK text has no word delimiters, so a whitespace/punctuation
|
|
14
|
+
* splitter would drop it entirely. Bigrams give BM25 overlapping
|
|
15
|
+
* units to match on — a query "数据库" → ["数据","据库"] overlaps a
|
|
16
|
+
* stored "数据库连接池" — with no dictionary, model, or word
|
|
17
|
+
* segmenter. This is the same approach Lucene's CJK analyzer and
|
|
18
|
+
* SQLite FTS5 use; it is deterministic and dependency-free, which
|
|
19
|
+
* is why it's preferred here over a statistical segmenter (whose
|
|
20
|
+
* dictionary alone would blow the package size budget).
|
|
21
|
+
*
|
|
22
|
+
* Both indexing and querying call this function, so the two sides
|
|
23
|
+
* always agree regardless of script.
|
|
24
|
+
*/
|
|
25
|
+
// Returns [] for empty input, otherwise the tokens in the order they are matched in `text`.
// Latin tokens shorter than 2 or longer than 32 characters, and English stopwords, are dropped.
export declare function tokenize(text: string): string[];
|
|
26
|
+
/** Counts how many times each token occurs: token → occurrence count. An empty input yields an empty map. */
export declare function termFreq(tokens: string[]): Map<string, number>;
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic tokenizer for memory search.
|
|
3
|
+
*
|
|
4
|
+
* Two scripts, two strategies, one pass:
|
|
5
|
+
*
|
|
6
|
+
* - Latin / digit runs are split identifier-aware, so an agent's
|
|
7
|
+
* camelCase / snake_case queries match well:
|
|
8
|
+
* "AuthService.login_user" → ["auth","service","authservice","login","user","login_user"]
|
|
9
|
+
*
|
|
10
|
+
* - CJK runs (Chinese, Japanese kana, Korean) are emitted as
|
|
11
|
+
* overlapping bigrams:
|
|
12
|
+
* "数据库连接" → ["数据","据库","库连","连接"]
|
|
13
|
+
* CJK text has no word delimiters, so a whitespace/punctuation
|
|
14
|
+
* splitter would drop it entirely. Bigrams give BM25 overlapping
|
|
15
|
+
* units to match on — a query "数据库" → ["数据","据库"] overlaps a
|
|
16
|
+
* stored "数据库连接池" — with no dictionary, model, or word
|
|
17
|
+
* segmenter. This is the same approach Lucene's CJK analyzer and
|
|
18
|
+
* SQLite FTS5 use; it is deterministic and dependency-free, which
|
|
19
|
+
* is why it's preferred here over a statistical segmenter (whose
|
|
20
|
+
* dictionary alone would blow the package size budget).
|
|
21
|
+
*
|
|
22
|
+
* Both indexing and querying call this function, so the two sides
|
|
23
|
+
* always agree regardless of script.
|
|
24
|
+
*/
|
|
25
|
+
// English stopwords. Not applied to CJK bigrams — there's no reliable
// script-agnostic stopword notion for bigrams, and BM25's IDF already
// down-weights ubiquitous bigrams without a hand-maintained list.
const STOPWORDS = new Set([
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
    "from", "has", "have", "if", "in", "into", "is", "it", "its", "of",
    "on", "or", "so", "such", "that", "the", "their", "then", "there",
    "these", "they", "this", "to", "was", "were", "will", "with",
    "we", "us", "you", "your", "i", "me", "my", "do", "does", "did",
]);
// Length bounds (in UTF-16 units) for Latin / digit tokens.
const MIN_LEN = 2;
const MAX_LEN = 32;
// CJK scripts handled as bigrams: Han (Chinese, Japanese kanji),
// Hiragana + Katakana (Japanese), Hangul (Korean).
const CJK_SCRIPTS = "\\p{Script=Han}\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Hangul}";
// One pass: match either a CJK run OR an ASCII word run. Everything
// else (spaces, punctuation, CJK punctuation) is a separator.
const RUN_RE = new RegExp(`[${CJK_SCRIPTS}]+|[A-Za-z0-9_]+`, "gu");
const CJK_RE = new RegExp(`[${CJK_SCRIPTS}]`, "u");
// Identifier word boundaries, as zero-width matches:
//   - lower/digit → Upper       ("authService"  → auth | Service)
//   - ACRONYM → Capitalised     ("HTTPResponse" → HTTP | Response)
// Splitting before every capital would shred "HTTP" into single letters,
// which MIN_LEN then discards, so acronyms would never be searchable.
const CAMEL_BOUNDARY_RE = /(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])/u;
/**
 * Splits `text` into search tokens.
 *
 * - CJK runs become overlapping bigrams ("数据库" → "数据", "据库"); a
 *   lone CJK character is kept as-is.
 * - Latin / digit runs are split on camelCase and snake_case boundaries,
 *   lowercased, and filtered by length (2–32) and English stopwords. The
 *   whole lowercased run is emitted as well whenever splitting produced
 *   something different from it, once per occurrence, so term
 *   frequencies of compound identifiers stay consistent with those of
 *   their parts:
 *     "AuthService.login_user"
 *       → ["auth","service","authservice","login","user","login_user"]
 *
 * @param text raw text; empty / falsy input yields `[]`
 * @returns tokens in match order, duplicates preserved
 */
export function tokenize(text) {
    if (!text)
        return [];
    const out = [];
    for (const match of text.matchAll(RUN_RE)) {
        const run = match[0];
        // ── CJK run → overlapping bigrams ───────────────────────────────
        if (CJK_RE.test(run)) {
            // Iterate code points (not UTF-16 units) so astral-plane
            // ideographs (CJK Extension B+) bigram correctly.
            const chars = [...run];
            if (chars.length === 1) {
                // A lone ideograph — rare, but keep it so a single-character
                // query still matches. CJK tokens bypass the Latin MIN_LEN.
                out.push(chars[0]);
            }
            else {
                for (let i = 0; i < chars.length - 1; i++) {
                    out.push(chars[i] + chars[i + 1]);
                }
            }
            continue;
        }
        // ── Latin / digit run → identifier-aware splitting ──────────────
        // The boundary regex needs the original casing, so lowercase after.
        const parts = [];
        for (const camelPart of run.split(CAMEL_BOUNDARY_RE)) {
            for (const sp of camelPart.toLowerCase().split("_")) {
                if (sp.length < MIN_LEN || sp.length > MAX_LEN)
                    continue;
                if (STOPWORDS.has(sp))
                    continue;
                parts.push(sp);
                out.push(sp);
            }
        }
        // Also keep the original (lowercased) full token, so exact-string
        // queries like "authservice" still match documents with "AuthService".
        // The duplicate check looks only at this run's own parts: that keeps
        // a plain word from being counted twice without scanning the whole
        // output, and without silently dropping the full token on the second
        // occurrence of the same identifier.
        const full = run.toLowerCase();
        if (full.length >= MIN_LEN &&
            full.length <= MAX_LEN &&
            !STOPWORDS.has(full) &&
            !parts.includes(full)) {
            out.push(full);
        }
    }
    return out;
}
|
|
93
|
+
/**
 * Builds a term-frequency table for a token list.
 *
 * @param tokens tokens, duplicates included
 * @returns map of token → number of occurrences, in first-seen order
 */
export function termFreq(tokens) {
    const counts = new Map();
    for (const token of tokens) {
        const seen = counts.get(token) ?? 0;
        counts.set(token, seen + 1);
    }
    return counts;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Eviction policy: least-frequently-used first (the user's
|
|
3
|
+
* explicit requirement), with least-recently-used as tiebreaker.
|
|
4
|
+
* Pinned entries are never evicted.
|
|
5
|
+
*
|
|
6
|
+
* Called after each ingest batch and after explicit agent writes.
|
|
7
|
+
*
|
|
8
|
+
* `currentTotalBytes` is supplied by the caller (the repository
|
|
9
|
+
* already tracks it incrementally) so this function does not
|
|
10
|
+
* recompute the sum, and — importantly — it evicts against the
|
|
11
|
+
* *same* number the repository reports from `totalBytes()`,
|
|
12
|
+
* including the fixed store overhead. Otherwise the effective
|
|
13
|
+
* budget would silently drift by that constant.
|
|
14
|
+
*/
|
|
15
|
+
import type { Memory } from "../types.js";
|
|
16
|
+
// Returns the unpinned memories chosen for eviction, lowest useCount first (oldest usedAt breaks ties),
// or [] when currentTotalBytes is already within maxBytes. The input array is not modified.
export declare function evictIfOverBudget(memories: readonly Memory[], maxBytes: number, currentTotalBytes: number): Memory[];
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Eviction policy: least-frequently-used first (the user's
|
|
3
|
+
* explicit requirement), with least-recently-used as tiebreaker.
|
|
4
|
+
* Pinned entries are never evicted.
|
|
5
|
+
*
|
|
6
|
+
* Called after each ingest batch and after explicit agent writes.
|
|
7
|
+
*
|
|
8
|
+
* `currentTotalBytes` is supplied by the caller (the repository
|
|
9
|
+
* already tracks it incrementally) so this function does not
|
|
10
|
+
* recompute the sum, and — importantly — it evicts against the
|
|
11
|
+
* *same* number the repository reports from `totalBytes()`,
|
|
12
|
+
* including the fixed store overhead. Otherwise the effective
|
|
13
|
+
* budget would silently drift by that constant.
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Chooses which memories to evict so the store fits its byte budget.
 *
 * Candidates are the unpinned memories, ordered least-frequently-used
 * first with least-recently-used as the tiebreaker. They are taken in
 * that order until the running total is within budget or no candidates
 * remain. The caller performs the actual deletion.
 *
 * @param memories          all stored memories (not modified)
 * @param maxBytes          byte budget
 * @param currentTotalBytes store size as tracked by the caller
 * @returns the memories to evict, in eviction order; `[]` when the
 *          store is already within budget
 */
export function evictIfOverBudget(memories, maxBytes, currentTotalBytes) {
    if (currentTotalBytes <= maxBytes)
        return [];
    // Pinned entries are never candidates.
    const candidates = [];
    for (const memory of memories) {
        if (!memory.pinned)
            candidates.push(memory);
    }
    // Ascending by (useCount, usedAt): cheapest-to-lose first.
    candidates.sort((left, right) => left.useCount !== right.useCount
        ? left.useCount - right.useCount
        : left.usedAt - right.usedAt);
    const victims = [];
    let remaining = currentTotalBytes;
    let cursor = 0;
    while (cursor < candidates.length && !(remaining <= maxBytes)) {
        const victim = candidates[cursor];
        cursor += 1;
        victims.push(victim);
        remaining -= victim.sizeBytes;
    }
    return victims;
}
|