opencode-diane 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +180 -0
  2. package/LICENSE +21 -0
  3. package/README.md +206 -0
  4. package/WIKI.md +1430 -0
  5. package/dist/index.d.ts +28 -0
  6. package/dist/index.js +1632 -0
  7. package/dist/ingest/adaptive.d.ts +47 -0
  8. package/dist/ingest/adaptive.js +182 -0
  9. package/dist/ingest/code-health.d.ts +58 -0
  10. package/dist/ingest/code-health.js +202 -0
  11. package/dist/ingest/code-map.d.ts +71 -0
  12. package/dist/ingest/code-map.js +670 -0
  13. package/dist/ingest/cross-refs.d.ts +59 -0
  14. package/dist/ingest/cross-refs.js +1207 -0
  15. package/dist/ingest/docs.d.ts +49 -0
  16. package/dist/ingest/docs.js +325 -0
  17. package/dist/ingest/git.d.ts +77 -0
  18. package/dist/ingest/git.js +390 -0
  19. package/dist/ingest/live-session.d.ts +101 -0
  20. package/dist/ingest/live-session.js +173 -0
  21. package/dist/ingest/project-notes.d.ts +28 -0
  22. package/dist/ingest/project-notes.js +102 -0
  23. package/dist/ingest/project.d.ts +35 -0
  24. package/dist/ingest/project.js +430 -0
  25. package/dist/ingest/session-snapshot.d.ts +63 -0
  26. package/dist/ingest/session-snapshot.js +94 -0
  27. package/dist/ingest/sessions.d.ts +29 -0
  28. package/dist/ingest/sessions.js +164 -0
  29. package/dist/ingest/tables.d.ts +52 -0
  30. package/dist/ingest/tables.js +360 -0
  31. package/dist/mining/skill-miner.d.ts +53 -0
  32. package/dist/mining/skill-miner.js +234 -0
  33. package/dist/search/bm25.d.ts +81 -0
  34. package/dist/search/bm25.js +334 -0
  35. package/dist/search/e5-embedder.d.ts +30 -0
  36. package/dist/search/e5-embedder.js +91 -0
  37. package/dist/search/embed-pass.d.ts +26 -0
  38. package/dist/search/embed-pass.js +43 -0
  39. package/dist/search/embedder.d.ts +58 -0
  40. package/dist/search/embedder.js +85 -0
  41. package/dist/search/inverted-index.d.ts +51 -0
  42. package/dist/search/inverted-index.js +139 -0
  43. package/dist/search/ppr.d.ts +44 -0
  44. package/dist/search/ppr.js +118 -0
  45. package/dist/search/tokenize.d.ts +26 -0
  46. package/dist/search/tokenize.js +98 -0
  47. package/dist/store/eviction.d.ts +16 -0
  48. package/dist/store/eviction.js +37 -0
  49. package/dist/store/repository.d.ts +222 -0
  50. package/dist/store/repository.js +420 -0
  51. package/dist/store/sqlite-store.d.ts +89 -0
  52. package/dist/store/sqlite-store.js +252 -0
  53. package/dist/store/vector-store.d.ts +66 -0
  54. package/dist/store/vector-store.js +160 -0
  55. package/dist/types.d.ts +385 -0
  56. package/dist/types.js +9 -0
  57. package/dist/utils/file-log.d.ts +87 -0
  58. package/dist/utils/file-log.js +215 -0
  59. package/dist/utils/peer-detection.d.ts +45 -0
  60. package/dist/utils/peer-detection.js +90 -0
  61. package/dist/utils/shell.d.ts +43 -0
  62. package/dist/utils/shell.js +110 -0
  63. package/dist/utils/usage-skill.d.ts +42 -0
  64. package/dist/utils/usage-skill.js +129 -0
  65. package/dist/utils/xlsx.d.ts +36 -0
  66. package/dist/utils/xlsx.js +270 -0
  67. package/grammars/tree-sitter-c.wasm +0 -0
  68. package/grammars/tree-sitter-c_sharp.wasm +0 -0
  69. package/grammars/tree-sitter-cpp.wasm +0 -0
  70. package/grammars/tree-sitter-css.wasm +0 -0
  71. package/grammars/tree-sitter-go.wasm +0 -0
  72. package/grammars/tree-sitter-html.wasm +0 -0
  73. package/grammars/tree-sitter-java.wasm +0 -0
  74. package/grammars/tree-sitter-javascript.wasm +0 -0
  75. package/grammars/tree-sitter-json.wasm +0 -0
  76. package/grammars/tree-sitter-php.wasm +0 -0
  77. package/grammars/tree-sitter-python.wasm +0 -0
  78. package/grammars/tree-sitter-rust.wasm +0 -0
  79. package/grammars/tree-sitter-typescript.wasm +0 -0
  80. package/package.json +80 -0
@@ -0,0 +1,51 @@
1
+ /**
2
+ * In-memory inverted index plus auxiliary indexes used for hierarchical
3
+ * filtering. Rebuilt from the store on plugin startup; kept consistent
4
+ * by the repository on every CRUD.
5
+ */
6
+ import type { Category, Memory } from "../types.js";
7
+ /** Per-memory bookkeeping used by BM25 scoring. */
8
+ interface DocInfo {
9
+ /** Token → term frequency. */
10
+ tf: Map<string, number>;
11
+ /** Total token count (used as |d| in BM25). */
12
+ length: number;
13
+ }
14
+ export declare class InvertedIndex {
15
+ /** token → set of memory ids that contain it */
16
+ readonly postings: Map<string, Set<string>>;
17
+ /** memory id → per-doc bookkeeping */
18
+ readonly docs: Map<string, DocInfo>;
19
+ /** category → set of memory ids */
20
+ readonly byCategory: Map<Category, Set<string>>;
21
+ /** subject → set of memory ids */
22
+ readonly bySubject: Map<string, Set<string>>;
23
+ /**
24
+ * Undirected file co-change graph: file path → set of file paths it
25
+ * was modified together with. Built from the `co-change` memories
26
+ * the git ingester produces (tags = ["co-change", fileA, fileB]).
27
+ * Used by BM25 search to propagate score to structurally-related
28
+ * files — the Aider PageRank idea, one hop, over edges we already
29
+ * compute. Survives restarts because co-change memories persist.
30
+ */
31
+ readonly coChange: Map<string, Set<string>>;
32
+ /** Running sum of doc lengths — for avgdl. */
33
+ totalLength: number;
34
+ /** Number of indexed docs. */
35
+ docCount: number;
36
+ /** Clears every index and counter, then re-indexes `memories` in the given order. */
+ rebuildFromAll(memories: Memory[]): void;
37
+ /** Indexes one memory: tokenizes subject + content + tags and registers the id under its tokens, category, subject and (if it is a co-change memory) its edge. */
+ add(memory: Memory): void;
38
+ /** Un-indexes one memory, reversing what `add` recorded for its id. */
+ remove(memory: Memory): void;
39
+ /** Mean token count per indexed doc (BM25's avgdl); 0 when nothing is indexed. */
+ avgDocLength(): number;
40
+ /** Files structurally coupled to `file` (one hop). `undefined` when the file has no recorded edges. */
41
+ coChangeNeighbors(file: string): ReadonlySet<string> | undefined;
42
+ /**
43
+ * A co-change memory carries tags `["co-change", fileA, fileB]`.
44
+ * If this memory is one, record the undirected edge. Anything else
45
+ * is ignored. Edges are de-duplicated by Set semantics, so the same
46
+ * pair appearing in multiple memories is harmless.
47
+ */
48
+ private indexCoChangeEdge;
49
+ private removeCoChangeEdge;
50
+ }
51
+ export {};
@@ -0,0 +1,139 @@
1
+ /**
2
+ * In-memory inverted index plus auxiliary indexes used for hierarchical
3
+ * filtering. Rebuilt from the store on plugin startup; kept consistent
4
+ * by the repository on every CRUD.
5
+ */
6
+ import { termFreq, tokenize } from "./tokenize.js";
7
+ export class InvertedIndex {
8
+ /** token → set of memory ids that contain it */
9
+ postings = new Map();
10
+ /** memory id → per-doc bookkeeping */
11
+ docs = new Map();
12
+ /** category → set of memory ids */
13
+ byCategory = new Map();
14
+ /** subject → set of memory ids */
15
+ bySubject = new Map();
16
+ /**
17
+ * Undirected file co-change graph: file path → set of file paths it
18
+ * was modified together with. Built from the `co-change` memories
19
+ * the git ingester produces (tags = ["co-change", fileA, fileB]).
20
+ * Used by BM25 search to propagate score to structurally-related
21
+ * files — the Aider PageRank idea, one hop, over edges we already
22
+ * compute. Survives restarts because co-change memories persist.
23
+ */
24
+ coChange = new Map();
25
+ /** Running sum of doc lengths — for avgdl. */
26
+ totalLength = 0;
27
+ /** Number of indexed docs. */
28
+ docCount = 0;
29
+ rebuildFromAll(memories) {
30
+ this.postings.clear();
31
+ this.docs.clear();
32
+ this.byCategory.clear();
33
+ this.bySubject.clear();
34
+ this.coChange.clear();
35
+ this.totalLength = 0;
36
+ this.docCount = 0;
37
+ for (const m of memories)
38
+ this.add(m);
39
+ }
40
+ add(memory) {
41
+ // Build searchable text from subject + content + tags
42
+ const text = `${memory.subject} ${memory.content} ${memory.tags.join(" ")}`;
43
+ const tokens = tokenize(text);
44
+ const tf = termFreq(tokens);
45
+ this.docs.set(memory.id, { tf, length: tokens.length });
46
+ this.totalLength += tokens.length;
47
+ this.docCount += 1;
48
+ for (const token of tf.keys()) {
49
+ let set = this.postings.get(token);
50
+ if (!set) {
51
+ set = new Set();
52
+ this.postings.set(token, set);
53
+ }
54
+ set.add(memory.id);
55
+ }
56
+ let catSet = this.byCategory.get(memory.category);
57
+ if (!catSet) {
58
+ catSet = new Set();
59
+ this.byCategory.set(memory.category, catSet);
60
+ }
61
+ catSet.add(memory.id);
62
+ let subSet = this.bySubject.get(memory.subject);
63
+ if (!subSet) {
64
+ subSet = new Set();
65
+ this.bySubject.set(memory.subject, subSet);
66
+ }
67
+ subSet.add(memory.id);
68
+ this.indexCoChangeEdge(memory);
69
+ }
70
+ remove(memory) {
71
+ const info = this.docs.get(memory.id);
72
+ if (info) {
73
+ this.totalLength -= info.length;
74
+ this.docCount -= 1;
75
+ for (const token of info.tf.keys()) {
76
+ const set = this.postings.get(token);
77
+ if (set) {
78
+ set.delete(memory.id);
79
+ if (set.size === 0)
80
+ this.postings.delete(token);
81
+ }
82
+ }
83
+ this.docs.delete(memory.id);
84
+ }
85
+ this.byCategory.get(memory.category)?.delete(memory.id);
86
+ this.bySubject.get(memory.subject)?.delete(memory.id);
87
+ this.removeCoChangeEdge(memory);
88
+ }
89
+ avgDocLength() {
90
+ return this.docCount > 0 ? this.totalLength / this.docCount : 0;
91
+ }
92
+ /** Files structurally coupled to `file` (one hop). Empty if none. */
93
+ coChangeNeighbors(file) {
94
+ return this.coChange.get(file);
95
+ }
96
+ /**
97
+ * A co-change memory carries tags `["co-change", fileA, fileB]`.
98
+ * If this memory is one, record the undirected edge. Anything else
99
+ * is ignored. Edges are de-duplicated by Set semantics, so the same
100
+ * pair appearing in multiple memories is harmless.
101
+ */
102
+ indexCoChangeEdge(memory) {
103
+ const pair = coChangePair(memory);
104
+ if (!pair)
105
+ return;
106
+ const [a, b] = pair;
107
+ addEdge(this.coChange, a, b);
108
+ addEdge(this.coChange, b, a);
109
+ }
110
+ removeCoChangeEdge(memory) {
111
+ const pair = coChangePair(memory);
112
+ if (!pair)
113
+ return;
114
+ const [a, b] = pair;
115
+ this.coChange.get(a)?.delete(b);
116
+ this.coChange.get(b)?.delete(a);
117
+ if (this.coChange.get(a)?.size === 0)
118
+ this.coChange.delete(a);
119
+ if (this.coChange.get(b)?.size === 0)
120
+ this.coChange.delete(b);
121
+ }
122
+ }
123
/**
 * Extracts the file pair from a co-change memory.
 *
 * A memory qualifies when its tags contain the marker "co-change"; the
 * pair is the first two tags that are not the marker, in tag order.
 *
 * @returns `[fileA, fileB]`, or `null` when the marker is absent or fewer
 *          than two other tags are present.
 */
function coChangePair(memory) {
    const { tags } = memory;
    const isCoChange = tags.some((tag) => tag === "co-change");
    if (!isCoChange) {
        return null;
    }
    const others = tags.filter((tag) => tag !== "co-change");
    return others.length >= 2 ? [others[0], others[1]] : null;
}
132
+ function addEdge(graph, from, to) {
133
+ let set = graph.get(from);
134
+ if (!set) {
135
+ set = new Set();
136
+ graph.set(from, set);
137
+ }
138
+ set.add(to);
139
+ }
@@ -0,0 +1,44 @@
1
+ /**
2
+ * ppr.ts — Personalized PageRank over the co-change graph.
3
+ *
4
+ * The default co-change boost (bm25.ts step 3b) is a single hop: a
5
+ * textual hit about file X lifts memories about X's *direct* co-change
6
+ * neighbours, and stops there. Personalized PageRank generalises that
7
+ * to the whole graph — a random walk with restart, where the restart
8
+ * (teleport) distribution is concentrated on the query's textual hits.
9
+ * Relevance then spreads across multiple hops and is graded by how
10
+ * reachable each file is from that seed set: a direct neighbour scores
11
+ * higher than a two-hop file, which still scores above an unrelated
12
+ * one.
13
+ *
14
+ * This is opt-in (`personalizedPageRank`, default off). When off,
15
+ * retrieval uses the cheaper one-hop boost and this file is never
16
+ * reached — so the default path keeps its O(seeds × neighbours) cost
17
+ * and its full inspectability.
18
+ *
19
+ * Deterministic: a fixed teleport probability, a fixed node-iteration
20
+ * order, and a fixed convergence tolerance with an iteration cap mean
21
+ * the same graph and personalization always yield the same scores.
22
+ */
23
+ export interface PprOptions {
24
+ /** Teleport / restart probability. Default 0.15 (i.e. 0.85 damping). */
25
+ alpha?: number;
26
+ /** Hard cap on power-iteration steps. Default 60. */
27
+ maxIterations?: number;
28
+ /** L1 convergence threshold — stop once the score vector barely moves. Default 1e-7. */
29
+ tolerance?: number;
30
+ /** Safety valve: return empty (skip PPR) if the graph exceeds this node count. Default 100000. */
31
+ maxNodes?: number;
32
+ }
33
+ /**
34
+ * Personalized PageRank over an undirected, unweighted graph.
35
+ *
36
+ * @param graph adjacency: node → set of neighbour nodes
37
+ * @param personalization restart distribution: node → weight. Weights
38
+ * need not be normalised; non-positive weights
39
+ * are ignored.
+ * @param options tuning knobs; see `PprOptions` for the defaults.
40
+ * @returns node → stationary score, summing to ~1. Empty when the
41
+ * personalization carries no positive weight, or when the node set
42
+ * (graph nodes plus personalized nodes) exceeds `maxNodes`.
43
+ */
44
+ export declare function personalizedPageRank(graph: ReadonlyMap<string, ReadonlySet<string>>, personalization: ReadonlyMap<string, number>, options?: PprOptions): Map<string, number>;
@@ -0,0 +1,118 @@
1
+ /**
2
+ * ppr.ts — Personalized PageRank over the co-change graph.
3
+ *
4
+ * The default co-change boost (bm25.ts step 3b) is a single hop: a
5
+ * textual hit about file X lifts memories about X's *direct* co-change
6
+ * neighbours, and stops there. Personalized PageRank generalises that
7
+ * to the whole graph — a random walk with restart, where the restart
8
+ * (teleport) distribution is concentrated on the query's textual hits.
9
+ * Relevance then spreads across multiple hops and is graded by how
10
+ * reachable each file is from that seed set: a direct neighbour scores
11
+ * higher than a two-hop file, which still scores above an unrelated
12
+ * one.
13
+ *
14
+ * This is opt-in (`personalizedPageRank`, default off). When off,
15
+ * retrieval uses the cheaper one-hop boost and this file is never
16
+ * reached — so the default path keeps its O(seeds × neighbours) cost
17
+ * and its full inspectability.
18
+ *
19
+ * Deterministic: a fixed teleport probability, a fixed node-iteration
20
+ * order, and a fixed convergence tolerance with an iteration cap mean
21
+ * the same graph and personalization always yield the same scores.
22
+ */
23
+ /**
24
+ * Personalized PageRank over an undirected, unweighted graph.
25
+ *
26
+ * @param graph adjacency: node → set of neighbour nodes
27
+ * @param personalization restart distribution: node → weight. Weights
28
+ * need not be normalised; non-positive weights
29
+ * are ignored.
30
+ * @returns node → stationary score, summing to ~1. Empty when the
31
+ * personalization carries no mass, the graph is empty, or the
32
+ * graph exceeds `maxNodes`.
33
+ */
34
+ export function personalizedPageRank(graph, personalization, options = {}) {
35
+ const alpha = options.alpha ?? 0.15;
36
+ const maxIterations = options.maxIterations ?? 60;
37
+ const tolerance = options.tolerance ?? 1e-7;
38
+ const maxNodes = options.maxNodes ?? 100_000;
39
+ // ── node set: every graph node, plus every personalized node ─────
40
+ // A personalized file with no co-change history still belongs in the
41
+ // walk — it just becomes a dangling node holding its restart mass.
42
+ const idOf = new Map();
43
+ const nodes = [];
44
+ const intern = (name) => {
45
+ let i = idOf.get(name);
46
+ if (i === undefined) {
47
+ i = nodes.length;
48
+ idOf.set(name, i);
49
+ nodes.push(name);
50
+ }
51
+ return i;
52
+ };
53
+ for (const [node, neighbours] of graph) {
54
+ intern(node);
55
+ for (const nb of neighbours)
56
+ intern(nb);
57
+ }
58
+ for (const node of personalization.keys())
59
+ intern(node);
60
+ const n = nodes.length;
61
+ if (n === 0 || n > maxNodes)
62
+ return new Map();
63
+ // ── personalization vector p, normalised to sum 1 ────────────────
64
+ const p = new Float64Array(n);
65
+ let pTotal = 0;
66
+ for (const [node, weight] of personalization) {
67
+ if (weight > 0) {
68
+ p[idOf.get(node)] += weight;
69
+ pTotal += weight;
70
+ }
71
+ }
72
+ if (pTotal === 0)
73
+ return new Map(); // nothing to personalize toward
74
+ for (let i = 0; i < n; i++)
75
+ p[i] /= pTotal;
76
+ // ── adjacency as integer arrays (fixed, deterministic order) ─────
77
+ const adj = nodes.map(() => []);
78
+ for (const [node, neighbours] of graph) {
79
+ const j = idOf.get(node);
80
+ for (const nb of neighbours)
81
+ adj[j].push(idOf.get(nb));
82
+ }
83
+ // ── power iteration with dangling-mass redistribution ────────────
84
+ // r_new[i] = (α + (1-α)·danglingMass)·p[i] + (1-α)·Σ_{j→i} r[j]/deg(j)
85
+ // A dangling node (no out-edges) would otherwise leak probability;
86
+ // sending its mass back through p keeps Σr = 1 every iteration.
87
+ let r = Float64Array.from(p);
88
+ let next = new Float64Array(n);
89
+ for (let iter = 0; iter < maxIterations; iter++) {
90
+ let dangling = 0;
91
+ for (let i = 0; i < n; i++)
92
+ if (adj[i].length === 0)
93
+ dangling += r[i];
94
+ const base = alpha + (1 - alpha) * dangling;
95
+ for (let i = 0; i < n; i++)
96
+ next[i] = base * p[i];
97
+ for (let j = 0; j < n; j++) {
98
+ const out = adj[j];
99
+ if (out.length === 0)
100
+ continue;
101
+ const share = ((1 - alpha) * r[j]) / out.length;
102
+ for (const i of out)
103
+ next[i] += share;
104
+ }
105
+ let delta = 0;
106
+ for (let i = 0; i < n; i++)
107
+ delta += Math.abs(next[i] - r[i]);
108
+ const swap = r;
109
+ r = next;
110
+ next = swap;
111
+ if (delta < tolerance)
112
+ break;
113
+ }
114
+ const result = new Map();
115
+ for (let i = 0; i < n; i++)
116
+ result.set(nodes[i], r[i]);
117
+ return result;
118
+ }
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Deterministic tokenizer for memory search.
3
+ *
4
+ * Two scripts, two strategies, one pass:
5
+ *
6
+ * - Latin / digit runs are split identifier-aware, so an agent's
7
+ * camelCase / snake_case queries match well:
8
+ * "AuthService.login_user" → ["auth","service","authservice","login","user","login_user"]
9
+ *
10
+ * - CJK runs (Chinese, Japanese kana, Korean) are emitted as
11
+ * overlapping bigrams:
12
+ * "数据库连接" → ["数据","据库","库连","连接"]
13
+ * CJK text has no word delimiters, so a whitespace/punctuation
14
+ * splitter would drop it entirely. Bigrams give BM25 overlapping
15
+ * units to match on — a query "数据库" → ["数据","据库"] overlaps a
16
+ * stored "数据库连接池" — with no dictionary, model, or word
17
+ * segmenter. This is the same approach Lucene's CJK analyzer and
18
+ * SQLite FTS5 use; it is deterministic and dependency-free, which
19
+ * is why it's preferred here over a statistical segmenter (whose
20
+ * dictionary alone would blow the package size budget).
21
+ *
22
+ * Both indexing and querying call this function, so the two sides
23
+ * always agree regardless of script.
24
+ */
25
+ export declare function tokenize(text: string): string[];
26
+ /** Counts how many times each token occurs in `tokens`: token → occurrence count. */
+ export declare function termFreq(tokens: string[]): Map<string, number>;
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Deterministic tokenizer for memory search.
3
+ *
4
+ * Two scripts, two strategies, one pass:
5
+ *
6
+ * - Latin / digit runs are split identifier-aware, so an agent's
7
+ * camelCase / snake_case queries match well:
8
+ * "AuthService.login_user" → ["auth","service","authservice","login","user","login_user"]
9
+ *
10
+ * - CJK runs (Chinese, Japanese kana, Korean) are emitted as
11
+ * overlapping bigrams:
12
+ * "数据库连接" → ["数据","据库","库连","连接"]
13
+ * CJK text has no word delimiters, so a whitespace/punctuation
14
+ * splitter would drop it entirely. Bigrams give BM25 overlapping
15
+ * units to match on — a query "数据库" → ["数据","据库"] overlaps a
16
+ * stored "数据库连接池" — with no dictionary, model, or word
17
+ * segmenter. This is the same approach Lucene's CJK analyzer and
18
+ * SQLite FTS5 use; it is deterministic and dependency-free, which
19
+ * is why it's preferred here over a statistical segmenter (whose
20
+ * dictionary alone would blow the package size budget).
21
+ *
22
+ * Both indexing and querying call this function, so the two sides
23
+ * always agree regardless of script.
24
+ */
25
// English stopwords. Not applied to CJK bigrams — there's no reliable
// script-agnostic stopword notion for bigrams, and BM25's IDF already
// down-weights ubiquitous bigrams without a hand-maintained list.
const STOPWORDS = new Set([
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
    "from", "has", "have", "if", "in", "into", "is", "it", "its", "of",
    "on", "or", "so", "such", "that", "the", "their", "then", "there",
    "these", "they", "this", "to", "was", "were", "will", "with",
    "we", "us", "you", "your", "i", "me", "my", "do", "does", "did",
]);
// Length bounds (in UTF-16 units) for Latin/digit tokens. CJK tokens bypass them.
const MIN_LEN = 2;
const MAX_LEN = 32;
// CJK scripts handled as bigrams: Han (Chinese, Japanese kanji),
// Hiragana + Katakana (Japanese), Hangul (Korean).
const CJK_SCRIPTS = "\\p{Script=Han}\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Hangul}";
// One pass: match either a CJK run OR an ASCII word run. Everything
// else (spaces, punctuation, CJK punctuation) is a separator.
const RUN_RE = new RegExp(`[${CJK_SCRIPTS}]+|[A-Za-z0-9_]+`, "gu");
const CJK_RE = new RegExp(`[${CJK_SCRIPTS}]`, "u");
// camelCase detection / splitting, hoisted so they are not rebuilt per run.
const CAMEL_HUMP_RE = /[a-z][A-Z]/;
const CAMEL_SPLIT_RE = /(?=[A-Z])/u;
/**
 * Splits `text` into search tokens, in order of appearance.
 *
 * - A CJK run yields its overlapping code-point bigrams (or the single
 *   character when the run is one character long).
 * - A Latin/digit/underscore run yields its camelCase and snake_case
 *   parts, lowercased, followed by the whole run lowercased — unless
 *   that whole-run token was already produced as one of the parts of
 *   this same run. Parts and whole-run tokens shorter than MIN_LEN,
 *   longer than MAX_LEN, or listed in STOPWORDS are dropped.
 *
 *     "AuthService.login_user"
 *       → ["auth","service","authservice","login","user","login_user"]
 *
 * Every occurrence of a run contributes its tokens, so a compound
 * identifier that appears N times contributes its whole-run token N
 * times and term frequencies stay accurate.
 *
 * @param text  raw text; falsy input yields `[]`
 * @returns tokens, duplicates included (feed to `termFreq` for counts)
 */
export function tokenize(text) {
    if (!text)
        return [];
    const out = [];
    for (const match of text.matchAll(RUN_RE)) {
        const run = match[0];
        // ── CJK run → overlapping bigrams ───────────────────────────────
        if (CJK_RE.test(run)) {
            // Iterate code points (not UTF-16 units) so astral-plane
            // ideographs (CJK Extension B+) bigram correctly.
            const chars = [...run];
            if (chars.length === 1) {
                // A lone ideograph — rare, but keep it so a single-character
                // query still matches. CJK tokens bypass the Latin MIN_LEN.
                out.push(chars[0]);
            }
            else {
                for (let i = 0; i < chars.length - 1; i++) {
                    out.push(chars[i] + chars[i + 1]);
                }
            }
            continue;
        }
        // ── Latin / digit run → identifier-aware splitting ──────────────
        // Sub-tokens emitted for THIS run only. Checking against the whole
        // output instead would suppress the full token of every repeated
        // identifier (under-counting its term frequency) and cost O(n²).
        const emitted = new Set();
        // camelCase split works because the run is not yet lowercased.
        const camelParts = CAMEL_HUMP_RE.test(run) ? run.split(CAMEL_SPLIT_RE) : [run];
        for (const part of camelParts) {
            const lower = part.toLowerCase();
            const snakeParts = lower.includes("_") ? lower.split("_") : [lower];
            for (const sp of snakeParts) {
                if (sp.length < MIN_LEN || sp.length > MAX_LEN)
                    continue;
                if (STOPWORDS.has(sp))
                    continue;
                out.push(sp);
                emitted.add(sp);
            }
        }
        // Also keep the original (lowercased) full token, so exact-string
        // queries like "authservice" still match documents with "AuthService".
        const full = run.toLowerCase();
        if (full.length >= MIN_LEN &&
            full.length <= MAX_LEN &&
            !STOPWORDS.has(full) &&
            !emitted.has(full)) {
            out.push(full);
        }
    }
    return out;
}
93
+ export function termFreq(tokens) {
94
+ const tf = new Map();
95
+ for (const t of tokens)
96
+ tf.set(t, (tf.get(t) ?? 0) + 1);
97
+ return tf;
98
+ }
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Eviction policy: least-frequently-used first (the user's
3
+ * explicit requirement), with least-recently-used as tiebreaker.
4
+ * Pinned entries are never evicted.
5
+ *
6
+ * Called after each ingest batch and after explicit agent writes.
7
+ *
8
+ * `currentTotalBytes` is supplied by the caller (the repository
9
+ * already tracks it incrementally) so this function does not
10
+ * recompute the sum, and — importantly — it evicts against the
11
+ * *same* number the repository reports from `totalBytes()`,
12
+ * including the fixed store overhead. Otherwise the effective
13
+ * budget would silently drift by that constant.
14
+ */
15
+ import type { Memory } from "../types.js";
16
+ export declare function evictIfOverBudget(memories: readonly Memory[], maxBytes: number, currentTotalBytes: number): Memory[];
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Eviction policy: least-frequently-used first (the user's
3
+ * explicit requirement), with least-recently-used as tiebreaker.
4
+ * Pinned entries are never evicted.
5
+ *
6
+ * Called after each ingest batch and after explicit agent writes.
7
+ *
8
+ * `currentTotalBytes` is supplied by the caller (the repository
9
+ * already tracks it incrementally) so this function does not
10
+ * recompute the sum, and — importantly — it evicts against the
11
+ * *same* number the repository reports from `totalBytes()`,
12
+ * including the fixed store overhead. Otherwise the effective
13
+ * budget would silently drift by that constant.
14
+ */
15
+ export function evictIfOverBudget(memories, maxBytes, currentTotalBytes) {
16
+ let total = currentTotalBytes;
17
+ if (total <= maxBytes)
18
+ return [];
19
+ // Eligible = not pinned. Sort ascending by (useCount, usedAt) so
20
+ // the first elements are the cheapest to lose.
21
+ const eligible = memories
22
+ .filter((m) => !m.pinned)
23
+ .slice()
24
+ .sort((a, b) => {
25
+ if (a.useCount !== b.useCount)
26
+ return a.useCount - b.useCount;
27
+ return a.usedAt - b.usedAt;
28
+ });
29
+ const removed = [];
30
+ for (const m of eligible) {
31
+ if (total <= maxBytes)
32
+ break;
33
+ removed.push(m);
34
+ total -= m.sizeBytes;
35
+ }
36
+ return removed;
37
+ }