@renseiai/agentfactory-code-intelligence 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/src/__tests__/types.test.d.ts +2 -0
- package/dist/src/__tests__/types.test.d.ts.map +1 -0
- package/dist/src/__tests__/types.test.js +187 -0
- package/dist/src/index.d.ts +26 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +29 -0
- package/dist/src/indexing/__tests__/indexing.test.d.ts +2 -0
- package/dist/src/indexing/__tests__/indexing.test.d.ts.map +1 -0
- package/dist/src/indexing/__tests__/indexing.test.js +193 -0
- package/dist/src/indexing/change-detector.d.ts +16 -0
- package/dist/src/indexing/change-detector.d.ts.map +1 -0
- package/dist/src/indexing/change-detector.js +38 -0
- package/dist/src/indexing/git-hash-provider.d.ts +11 -0
- package/dist/src/indexing/git-hash-provider.d.ts.map +1 -0
- package/dist/src/indexing/git-hash-provider.js +25 -0
- package/dist/src/indexing/incremental-indexer.d.ts +38 -0
- package/dist/src/indexing/incremental-indexer.d.ts.map +1 -0
- package/dist/src/indexing/incremental-indexer.js +122 -0
- package/dist/src/indexing/merkle-tree.d.ts +33 -0
- package/dist/src/indexing/merkle-tree.d.ts.map +1 -0
- package/dist/src/indexing/merkle-tree.js +107 -0
- package/dist/src/memory/__tests__/dedup.test.d.ts +2 -0
- package/dist/src/memory/__tests__/dedup.test.d.ts.map +1 -0
- package/dist/src/memory/__tests__/dedup.test.js +173 -0
- package/dist/src/memory/dedup-pipeline.d.ts +24 -0
- package/dist/src/memory/dedup-pipeline.d.ts.map +1 -0
- package/dist/src/memory/dedup-pipeline.js +73 -0
- package/dist/src/memory/memory-store.d.ts +22 -0
- package/dist/src/memory/memory-store.d.ts.map +1 -0
- package/dist/src/memory/memory-store.js +32 -0
- package/dist/src/memory/simhash.d.ts +16 -0
- package/dist/src/memory/simhash.d.ts.map +1 -0
- package/dist/src/memory/simhash.js +67 -0
- package/dist/src/memory/xxhash.d.ts +3 -0
- package/dist/src/memory/xxhash.d.ts.map +1 -0
- package/dist/src/memory/xxhash.js +13 -0
- package/dist/src/parser/__tests__/multi-language.test.d.ts +2 -0
- package/dist/src/parser/__tests__/multi-language.test.d.ts.map +1 -0
- package/dist/src/parser/__tests__/multi-language.test.js +350 -0
- package/dist/src/parser/__tests__/symbol-extractor.test.d.ts +2 -0
- package/dist/src/parser/__tests__/symbol-extractor.test.d.ts.map +1 -0
- package/dist/src/parser/__tests__/symbol-extractor.test.js +188 -0
- package/dist/src/parser/go-extractor.d.ts +8 -0
- package/dist/src/parser/go-extractor.d.ts.map +1 -0
- package/dist/src/parser/go-extractor.js +127 -0
- package/dist/src/parser/python-extractor.d.ts +8 -0
- package/dist/src/parser/python-extractor.d.ts.map +1 -0
- package/dist/src/parser/python-extractor.js +92 -0
- package/dist/src/parser/rust-extractor.d.ts +8 -0
- package/dist/src/parser/rust-extractor.d.ts.map +1 -0
- package/dist/src/parser/rust-extractor.js +168 -0
- package/dist/src/parser/symbol-extractor.d.ts +14 -0
- package/dist/src/parser/symbol-extractor.d.ts.map +1 -0
- package/dist/src/parser/symbol-extractor.js +47 -0
- package/dist/src/parser/typescript-extractor.d.ts +13 -0
- package/dist/src/parser/typescript-extractor.d.ts.map +1 -0
- package/dist/src/parser/typescript-extractor.js +229 -0
- package/dist/src/plugin/__tests__/plugin.test.d.ts +2 -0
- package/dist/src/plugin/__tests__/plugin.test.d.ts.map +1 -0
- package/dist/src/plugin/__tests__/plugin.test.js +48 -0
- package/dist/src/plugin/code-intelligence-plugin.d.ts +15 -0
- package/dist/src/plugin/code-intelligence-plugin.d.ts.map +1 -0
- package/dist/src/plugin/code-intelligence-plugin.js +102 -0
- package/dist/src/repo-map/__tests__/repo-map.test.d.ts +2 -0
- package/dist/src/repo-map/__tests__/repo-map.test.d.ts.map +1 -0
- package/dist/src/repo-map/__tests__/repo-map.test.js +186 -0
- package/dist/src/repo-map/dependency-graph.d.ts +30 -0
- package/dist/src/repo-map/dependency-graph.d.ts.map +1 -0
- package/dist/src/repo-map/dependency-graph.js +105 -0
- package/dist/src/repo-map/pagerank.d.ts +20 -0
- package/dist/src/repo-map/pagerank.d.ts.map +1 -0
- package/dist/src/repo-map/pagerank.js +68 -0
- package/dist/src/repo-map/repo-map-generator.d.ts +20 -0
- package/dist/src/repo-map/repo-map-generator.d.ts.map +1 -0
- package/dist/src/repo-map/repo-map-generator.js +66 -0
- package/dist/src/search/__tests__/search.test.d.ts +2 -0
- package/dist/src/search/__tests__/search.test.d.ts.map +1 -0
- package/dist/src/search/__tests__/search.test.js +191 -0
- package/dist/src/search/bm25.d.ts +24 -0
- package/dist/src/search/bm25.d.ts.map +1 -0
- package/dist/src/search/bm25.js +44 -0
- package/dist/src/search/inverted-index.d.ts +31 -0
- package/dist/src/search/inverted-index.d.ts.map +1 -0
- package/dist/src/search/inverted-index.js +72 -0
- package/dist/src/search/search-engine.d.ts +22 -0
- package/dist/src/search/search-engine.d.ts.map +1 -0
- package/dist/src/search/search-engine.js +76 -0
- package/dist/src/search/tokenizer.d.ts +11 -0
- package/dist/src/search/tokenizer.d.ts.map +1 -0
- package/dist/src/search/tokenizer.js +48 -0
- package/dist/src/types.d.ts +242 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +96 -0
- package/package.json +74 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { readFile, writeFile, mkdir } from 'node:fs/promises';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { MerkleTree } from './merkle-tree.js';
|
|
4
|
+
import { ChangeDetector } from './change-detector.js';
|
|
5
|
+
import { GitHashProvider } from './git-hash-provider.js';
|
|
6
|
+
/**
|
|
7
|
+
* Orchestrates incremental re-indexing of only changed files.
|
|
8
|
+
* Persists index to `.agentfactory/code-index/`.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Orchestrates incremental re-indexing of only changed files.
 * Persists index to `.agentfactory/code-index/`.
 *
 * Change detection is Merkle-tree based: each call to `index()` builds a tree
 * over the provided files and diffs it against the tree from the previous run.
 */
export class IncrementalIndexer {
    hashProvider = new GitHashProvider();
    changeDetector = new ChangeDetector();
    extractor; // symbol extractor exposing extractFromSource(content, path) -> { symbols }
    indexDir; // directory, relative to basePath, where index.json is persisted
    previousTree; // Merkle tree from the previous index() run (undefined on first run)
    fileIndex = new Map(); // filePath -> { filePath, gitHash, symbols, lastIndexed }
    /**
     * @param extractor Symbol extractor used for added/modified files.
     * @param options Optional settings; `indexDir` overrides the default persist location.
     */
    constructor(extractor, options = {}) {
        this.extractor = extractor;
        this.indexDir = options.indexDir ?? '.agentfactory/code-index';
    }
    /**
     * Index files, returning only the changed ones.
     * @param files Map of filePath -> content
     * @returns { changes, indexed, metadata } where `indexed` holds the
     *          per-file entries created for added/modified paths.
     */
    async index(files) {
        const newTree = MerkleTree.fromFiles(files);
        let changes;
        if (this.previousTree) {
            changes = this.changeDetector.detect(this.previousTree, newTree);
        }
        else {
            // First run: no previous tree, so every file counts as added.
            changes = {
                added: [...files.keys()].sort(),
                modified: [],
                deleted: [],
            };
        }
        // Remove deleted files from index
        for (const path of changes.deleted) {
            this.fileIndex.delete(path);
        }
        // Index added and modified files
        const indexed = [];
        const toIndex = [...changes.added, ...changes.modified];
        for (const path of toIndex) {
            const content = files.get(path);
            // BUG FIX: previously `if (!content) continue;`, which also skipped
            // files whose content is the empty string (falsy), silently leaving
            // empty files out of the index. Only skip genuinely missing paths.
            if (content === undefined)
                continue;
            const ast = this.extractor.extractFromSource(content, path);
            const gitHash = this.hashProvider.hashContent(content);
            const fileIdx = {
                filePath: path,
                gitHash,
                symbols: ast.symbols,
                lastIndexed: Date.now(),
            };
            this.fileIndex.set(path, fileIdx);
            indexed.push(fileIdx);
        }
        this.previousTree = newTree;
        // Collect all symbols for metadata
        const allSymbols = [];
        const languages = new Set();
        for (const fi of this.fileIndex.values()) {
            allSymbols.push(...fi.symbols);
            for (const s of fi.symbols) {
                if (s.language)
                    languages.add(s.language);
            }
        }
        const metadata = {
            version: 1,
            rootHash: newTree.getRootHash(),
            totalFiles: this.fileIndex.size,
            totalSymbols: allSymbols.length,
            lastUpdated: Date.now(),
            languages: [...languages].sort(),
        };
        return { changes, indexed, metadata };
    }
    /**
     * Save index to disk as `<basePath>/<indexDir>/index.json`.
     * Creates the directory if needed.
     */
    async save(basePath) {
        const dir = join(basePath, this.indexDir);
        await mkdir(dir, { recursive: true });
        const data = {
            files: Object.fromEntries(this.fileIndex),
            rootHash: this.previousTree?.getRootHash() ?? '',
        };
        await writeFile(join(dir, 'index.json'), JSON.stringify(data, null, 2));
    }
    /**
     * Load index from disk.
     * @returns true on success, false if the file is missing or unparsable
     *          (any read/parse error is treated as "no saved index").
     */
    async load(basePath) {
        const indexPath = join(basePath, this.indexDir, 'index.json');
        try {
            const raw = await readFile(indexPath, 'utf-8');
            const data = JSON.parse(raw);
            this.fileIndex = new Map(Object.entries(data.files));
            // Rebuild Merkle tree from stored hashes so the next index() call
            // can diff against the persisted state.
            const fileHashes = new Map();
            for (const [path, fi] of this.fileIndex) {
                fileHashes.set(path, fi.gitHash);
            }
            this.previousTree = MerkleTree.fromHashes(fileHashes);
            return true;
        }
        catch {
            return false;
        }
    }
    /** Get all indexed symbols (flattened across every indexed file). */
    getAllSymbols() {
        const symbols = [];
        for (const fi of this.fileIndex.values()) {
            symbols.push(...fi.symbols);
        }
        return symbols;
    }
    /** Get a defensive copy of the current file index. */
    getFileIndex() {
        return new Map(this.fileIndex);
    }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/** A single node in the Merkle tree: a file leaf or a directory. */
export interface MerkleNode {
    /** Path relative to the tree root ('' for the root node itself). */
    path: string;
    /** Content hash for files; combined child hash for directories. */
    hash: string;
    isDirectory: boolean;
    /** Child nodes keyed by path segment; empty for file leaves. */
    children: Map<string, MerkleNode>;
}
/**
 * Merkle tree for incremental code indexing.
 * Builds a tree from file paths + content hashes, enabling efficient diff.
 */
export declare class MerkleTree {
    private root;
    private hashProvider;
    constructor();
    /** Build tree from file path -> content map. */
    static fromFiles(files: Map<string, string>): MerkleTree;
    /** Build tree from file path -> pre-computed hash map. */
    static fromHashes(fileHashes: Map<string, string>): MerkleTree;
    /** Hash `content` and insert the file at `path`. */
    addFile(path: string, content: string): void;
    /** Insert a file at `path` using an already-computed content hash. */
    addFileWithHash(path: string, hash: string): void;
    /** Recompute all directory hashes bottom-up. */
    computeHashes(): void;
    /** Get the root hash of the tree. */
    getRootHash(): string;
    /** Get all file nodes in the tree. */
    getFiles(): Map<string, string>;
    /** Get a specific node by path. */
    getNode(path: string): MerkleNode | undefined;
    private computeNodeHash;
    private collectFiles;
    private createDirNode;
}
//# sourceMappingURL=merkle-tree.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"merkle-tree.d.ts","sourceRoot":"","sources":["../../../src/indexing/merkle-tree.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,MAAM,CAAA;IACZ,IAAI,EAAE,MAAM,CAAA;IACZ,WAAW,EAAE,OAAO,CAAA;IACpB,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;CAClC;AAED;;;GAGG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,IAAI,CAAY;IACxB,OAAO,CAAC,YAAY,CAAiB;;IAOrC,gDAAgD;IAChD,MAAM,CAAC,SAAS,CAAC,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,UAAU;IASxD,0DAA0D;IAC1D,MAAM,CAAC,UAAU,CAAC,UAAU,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,UAAU;IAS9D,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,IAAI;IAK5C,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,IAAI;IAwBjD,gDAAgD;IAChD,aAAa,IAAI,IAAI;IAIrB,qCAAqC;IACrC,WAAW,IAAI,MAAM;IAIrB,sCAAsC;IACtC,QAAQ,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC;IAM/B,mCAAmC;IACnC,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;IAY7C,OAAO,CAAC,eAAe;IAYvB,OAAO,CAAC,YAAY;IAUpB,OAAO,CAAC,aAAa;CAGtB"}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { GitHashProvider } from './git-hash-provider.js';
|
|
2
|
+
/**
|
|
3
|
+
* Merkle tree for incremental code indexing.
|
|
4
|
+
* Builds a tree from file paths + content hashes, enabling efficient diff.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Merkle tree for incremental code indexing.
 * Builds a tree from file paths + content hashes, enabling efficient diff.
 */
export class MerkleTree {
    root;
    hashProvider;
    constructor() {
        this.hashProvider = new GitHashProvider();
        this.root = this.createDirNode('');
    }
    /** Build tree from file path -> content map. */
    static fromFiles(files) {
        const tree = new MerkleTree();
        for (const [filePath, source] of files)
            tree.addFile(filePath, source);
        tree.computeHashes();
        return tree;
    }
    /** Build tree from file path -> pre-computed hash map. */
    static fromHashes(fileHashes) {
        const tree = new MerkleTree();
        for (const [filePath, digest] of fileHashes)
            tree.addFileWithHash(filePath, digest);
        tree.computeHashes();
        return tree;
    }
    addFile(path, content) {
        this.addFileWithHash(path, this.hashProvider.hashContent(content));
    }
    addFileWithHash(path, hash) {
        const segments = path.split('/').filter(Boolean);
        // Walk/create the intermediate directory nodes.
        let node = this.root;
        segments.slice(0, -1).forEach((segment, idx) => {
            if (!node.children.has(segment)) {
                node.children.set(segment, this.createDirNode(segments.slice(0, idx + 1).join('/')));
            }
            node = node.children.get(segment);
        });
        // Attach the file leaf under its final path segment.
        node.children.set(segments[segments.length - 1], {
            path,
            hash,
            isDirectory: false,
            children: new Map(),
        });
    }
    /** Recompute all directory hashes bottom-up. */
    computeHashes() {
        this.computeNodeHash(this.root);
    }
    /** Get the root hash of the tree. */
    getRootHash() {
        return this.root.hash;
    }
    /** Get all file nodes in the tree. */
    getFiles() {
        const out = new Map();
        this.collectFiles(this.root, out);
        return out;
    }
    /** Get a specific node by path. */
    getNode(path) {
        if (path === '' || path === '/')
            return this.root;
        let node = this.root;
        for (const segment of path.split('/').filter(Boolean)) {
            const next = node.children.get(segment);
            if (next === undefined)
                return undefined;
            node = next;
        }
        return node;
    }
    // Post-order hash: a file's hash is its content hash; a directory's hash
    // covers its children's names and hashes in stable (localeCompare) order.
    computeNodeHash(node) {
        if (!node.isDirectory)
            return node.hash;
        const ordered = [...node.children.entries()].sort(([x], [y]) => x.localeCompare(y));
        const childHashes = ordered.map(([name, child]) => `${name}:${this.computeNodeHash(child)}`);
        node.hash = this.hashProvider.hashDirectory(childHashes);
        return node.hash;
    }
    // Depth-first accumulation of file leaves into `files`.
    collectFiles(node, files) {
        if (node.isDirectory) {
            for (const child of node.children.values())
                this.collectFiles(child, files);
        }
        else {
            files.set(node.path, node.hash);
        }
    }
    createDirNode(path) {
        return { path, hash: '', isDirectory: true, children: new Map() };
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.test.d.ts","sourceRoot":"","sources":["../../../../src/memory/__tests__/dedup.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach } from 'vitest';
|
|
2
|
+
import { xxhash64 } from '../xxhash.js';
|
|
3
|
+
import { SimHash } from '../simhash.js';
|
|
4
|
+
import { InMemoryStore } from '../memory-store.js';
|
|
5
|
+
import { DedupPipeline } from '../dedup-pipeline.js';
|
|
6
|
+
// Tests for the xxhash64 helper: the tier-1 exact-dedup content key.
describe('xxhash64', () => {
    it('produces consistent hashes for identical content', async () => {
        const hash1 = await xxhash64('hello world');
        const hash2 = await xxhash64('hello world');
        expect(hash1).toBe(hash2);
    });
    it('produces different hashes for different content', async () => {
        const hash1 = await xxhash64('hello world');
        const hash2 = await xxhash64('hello world!');
        expect(hash1).not.toBe(hash2);
    });
    it('returns a hex string', async () => {
        const hash = await xxhash64('test');
        // Case-insensitive hex: implementation may emit upper or lower case.
        expect(hash).toMatch(/^[0-9a-f]+$/i);
    });
    it('handles empty string', async () => {
        const hash = await xxhash64('');
        expect(typeof hash).toBe('string');
        expect(hash.length).toBeGreaterThan(0);
    });
});
|
|
27
|
+
// Tests for SimHash: 64-bit locality-sensitive fingerprints whose Hamming
// distance reflects textual similarity (tier-2 near-dup detection).
describe('SimHash', () => {
    const simhash = new SimHash();
    it('produces identical fingerprint for identical text', () => {
        const fp1 = simhash.compute('the quick brown fox jumps over the lazy dog');
        const fp2 = simhash.compute('the quick brown fox jumps over the lazy dog');
        expect(fp1).toBe(fp2);
    });
    it('produces similar fingerprint for similar text', () => {
        // One-word change ("jumps" -> "leaps") should only flip a few bits.
        const fp1 = simhash.compute('the quick brown fox jumps over the lazy dog');
        const fp2 = simhash.compute('the quick brown fox leaps over the lazy dog');
        const distance = simhash.hammingDistance(fp1, fp2);
        expect(distance).toBeLessThan(10);
    });
    it('produces different fingerprint for different text', () => {
        const fp1 = simhash.compute('the quick brown fox jumps over the lazy dog');
        const fp2 = simhash.compute('completely unrelated text about quantum physics and mathematics');
        const distance = simhash.hammingDistance(fp1, fp2);
        expect(distance).toBeGreaterThan(5);
    });
    it('computes hamming distance correctly', () => {
        // 10 = 0b1010 vs 9 = 0b1001: two differing bits.
        expect(simhash.hammingDistance(10n, 9n)).toBe(2);
        expect(simhash.hammingDistance(0n, 0n)).toBe(0);
        // 15 = 0b1111 vs 0: four differing bits.
        expect(simhash.hammingDistance(15n, 0n)).toBe(4);
    });
    it('detects near-duplicates within threshold', () => {
        const fp1 = simhash.compute('hello world this is a test document for simhash');
        const fp2 = simhash.compute('hello world this is a test document for simhash!');
        expect(simhash.isNearDuplicate(fp1, fp2, 5)).toBe(true);
    });
    it('handles empty text', () => {
        const fp = simhash.compute('');
        expect(fp).toBe(0n);
    });
});
|
|
61
|
+
// Tests for the in-memory MemoryStore implementation (CRUD + xxhash lookup).
describe('InMemoryStore', () => {
    let store;
    beforeEach(() => {
        store = new InMemoryStore();
    });
    it('stores and retrieves entries', async () => {
        const entry = {
            id: 'test-1',
            content: 'hello',
            xxhash: 'abc123',
            simhash: 42n,
            createdAt: Date.now(),
        };
        await store.put(entry);
        const retrieved = await store.get('test-1');
        expect(retrieved).toEqual(entry);
    });
    it('finds by xxhash', async () => {
        const entry = {
            id: 'test-1',
            content: 'hello',
            xxhash: 'abc123',
            simhash: 42n,
            createdAt: Date.now(),
        };
        await store.put(entry);
        const found = await store.findByXxhash('abc123');
        expect(found?.id).toBe('test-1');
    });
    it('returns undefined for missing entries', async () => {
        expect(await store.get('nonexistent')).toBeUndefined();
        expect(await store.findByXxhash('nonexistent')).toBeUndefined();
    });
    it('deletes entries', async () => {
        await store.put({
            id: 'test-1', content: 'hello', xxhash: 'abc', simhash: 0n, createdAt: Date.now(),
        });
        expect(await store.delete('test-1')).toBe(true);
        expect(await store.get('test-1')).toBeUndefined();
        // Deleting an already-deleted id reports failure.
        expect(await store.delete('test-1')).toBe(false);
    });
    it('clears all entries', async () => {
        await store.put({ id: '1', content: 'a', xxhash: 'a', simhash: 0n, createdAt: Date.now() });
        await store.put({ id: '2', content: 'b', xxhash: 'b', simhash: 0n, createdAt: Date.now() });
        await store.clear();
        const all = await store.getAll();
        expect(all).toHaveLength(0);
    });
    it('lists all entries', async () => {
        await store.put({ id: '1', content: 'a', xxhash: 'a', simhash: 0n, createdAt: Date.now() });
        await store.put({ id: '2', content: 'b', xxhash: 'b', simhash: 0n, createdAt: Date.now() });
        const all = await store.getAll();
        expect(all).toHaveLength(2);
    });
});
|
|
116
|
+
// Tests for the two-tier dedup pipeline: normalization, exact (xxhash)
// matching, and fuzzy (SimHash/Hamming) near-duplicate detection.
describe('DedupPipeline', () => {
    let store;
    let pipeline;
    beforeEach(() => {
        store = new InMemoryStore();
        pipeline = new DedupPipeline(store);
    });
    it('normalizes content', () => {
        const normalized = pipeline.normalize(' hello\r\n world\t\t ');
        expect(normalized).toBe('hello\n world');
    });
    it('detects no duplicate in empty store', async () => {
        const result = await pipeline.check('test content');
        expect(result.isDuplicate).toBe(false);
        expect(result.matchType).toBe('none');
    });
    it('detects exact duplicate', async () => {
        await pipeline.storeContent('entry-1', 'hello world test content');
        const result = await pipeline.check('hello world test content');
        expect(result.isDuplicate).toBe(true);
        expect(result.matchType).toBe('exact');
        expect(result.existingId).toBe('entry-1');
    });
    it('detects near-duplicate', async () => {
        // Use a wider threshold for near-duplicate detection in this test
        const widePipeline = new DedupPipeline(store, { simhashThreshold: 10 });
        await widePipeline.storeContent('entry-1', 'the quick brown fox jumps over the lazy dog and runs through the forest');
        const result = await widePipeline.check('the quick brown fox leaps over the lazy dog and runs through the forest');
        expect(result.isDuplicate).toBe(true);
        // Depending on hash granularity this may register exact or near.
        expect(['exact', 'near']).toContain(result.matchType);
    });
    it('does not match completely different content', async () => {
        await pipeline.storeContent('entry-1', 'the quick brown fox jumps over the lazy dog in the morning');
        const result = await pipeline.check('quantum physics explains particle behavior in accelerators');
        expect(result.isDuplicate).toBe(false);
        expect(result.matchType).toBe('none');
    });
    it('stores content with metadata', async () => {
        const entry = await pipeline.storeContent('test-1', 'sample content here', { source: 'test' });
        expect(entry.id).toBe('test-1');
        expect(entry.xxhash).toBeTruthy();
        expect(typeof entry.simhash).toBe('bigint');
        expect(entry.metadata).toEqual({ source: 'test' });
    });
    it('handles whitespace normalization for dedup', async () => {
        await pipeline.storeContent('entry-1', 'hello world');
        // Same content after normalization
        const result = await pipeline.check('hello world');
        expect(result.isDuplicate).toBe(true);
        expect(result.matchType).toBe('exact');
    });
    it('treats line ending variations as same content', async () => {
        await pipeline.storeContent('entry-1', 'hello\nworld');
        const result = await pipeline.check('hello\r\nworld');
        expect(result.isDuplicate).toBe(true);
        expect(result.matchType).toBe('exact');
    });
});
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { DedupResult, MemoryEntry } from '../types.js';
import type { MemoryStore } from './memory-store.js';
/** Configuration for {@link DedupPipeline}. */
export interface DedupPipelineOptions {
    /** Hamming distance threshold for near-duplicate detection (default: 3). */
    simhashThreshold?: number;
}
/**
 * Two-tier deduplication pipeline.
 * Tier 1: xxHash64 exact content matching.
 * Tier 2: SimHash fuzzy matching with configurable Hamming threshold.
 */
export declare class DedupPipeline {
    private memoryStore;
    private simhash;
    private threshold;
    /**
     * @param store Backing memory store used for lookups and persistence.
     * @param options Optional tuning; see {@link DedupPipelineOptions}.
     */
    constructor(store: MemoryStore, options?: DedupPipelineOptions);
    /** Normalize content for consistent hashing. */
    normalize(content: string): string;
    /** Check content against existing memory for duplicates. */
    check(content: string): Promise<DedupResult>;
    /** Store new content, returning the created entry. */
    storeContent(id: string, content: string, metadata?: Record<string, unknown>): Promise<MemoryEntry>;
}
//# sourceMappingURL=dedup-pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup-pipeline.d.ts","sourceRoot":"","sources":["../../../src/memory/dedup-pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AAC3D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAIpD,MAAM,WAAW,oBAAoB;IACnC,4EAA4E;IAC5E,gBAAgB,CAAC,EAAE,MAAM,CAAA;CAC1B;AAED;;;;GAIG;AACH,qBAAa,aAAa;IACxB,OAAO,CAAC,WAAW,CAAa;IAChC,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,SAAS,CAAQ;gBAEb,KAAK,EAAE,WAAW,EAAE,OAAO,GAAE,oBAAyB;IAMlE,gDAAgD;IAChD,SAAS,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM;IAQlC,4DAA4D;IACtD,KAAK,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAmClD,sDAAsD;IAChD,YAAY,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,OAAO,CAAC,WAAW,CAAC;CAgB1G"}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { xxhash64 } from './xxhash.js';
|
|
2
|
+
import { SimHash } from './simhash.js';
|
|
3
|
+
/**
|
|
4
|
+
* Two-tier deduplication pipeline.
|
|
5
|
+
* Tier 1: xxHash64 exact content matching.
|
|
6
|
+
* Tier 2: SimHash fuzzy matching with configurable Hamming threshold.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Two-tier deduplication pipeline.
 * Tier 1: xxHash64 exact content matching.
 * Tier 2: SimHash fuzzy matching with configurable Hamming threshold.
 */
export class DedupPipeline {
    memoryStore;
    simhash;
    threshold;
    constructor(store, options = {}) {
        this.memoryStore = store;
        this.simhash = new SimHash();
        this.threshold = options.simhashThreshold ?? 3;
    }
    /** Normalize content for consistent hashing. */
    normalize(content) {
        const unixEndings = content.replace(/\r\n/g, '\n'); // normalize line endings
        const tabsExpanded = unixEndings.replace(/\t/g, ' '); // normalize tabs
        const trimmedLines = tabsExpanded.replace(/ +$/gm, ''); // strip trailing whitespace
        return trimmedLines.trim();
    }
    /** Check content against existing memory for duplicates. */
    async check(content) {
        const canonical = this.normalize(content);
        const digest = await xxhash64(canonical);
        // Tier 1: exact-content lookup by xxHash64 digest.
        const exact = await this.memoryStore.findByXxhash(digest);
        if (exact) {
            return {
                isDuplicate: true,
                matchType: 'exact',
                existingId: exact.id,
            };
        }
        // Tier 2: linear scan of stored SimHash fingerprints for a near match.
        const fp = this.simhash.compute(canonical);
        const candidates = await this.memoryStore.getAll();
        for (const candidate of candidates) {
            const dist = this.simhash.hammingDistance(fp, candidate.simhash);
            if (dist <= this.threshold) {
                return {
                    isDuplicate: true,
                    matchType: 'near',
                    existingId: candidate.id,
                    hammingDistance: dist,
                };
            }
        }
        return {
            isDuplicate: false,
            matchType: 'none',
        };
    }
    /** Store new content, returning the created entry. */
    async storeContent(id, content, metadata) {
        const canonical = this.normalize(content);
        const digest = await xxhash64(canonical);
        const fp = this.simhash.compute(canonical);
        const record = {
            id,
            content: canonical,
            xxhash: digest,
            simhash: fp,
            createdAt: Date.now(),
            ...(metadata ? { metadata } : {}),
        };
        await this.memoryStore.put(record);
        return record;
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { MemoryEntry } from '../types.js';
/** Interface for persistent memory storage. */
export interface MemoryStore {
    /** Fetch an entry by id, or undefined if absent. */
    get(id: string): Promise<MemoryEntry | undefined>;
    /** Insert or replace an entry keyed by its id. */
    put(entry: MemoryEntry): Promise<void>;
    /** Look up an entry by its exact xxHash64 content digest. */
    findByXxhash(hash: string): Promise<MemoryEntry | undefined>;
    /** List every stored entry. */
    getAll(): Promise<MemoryEntry[]>;
    /** Remove an entry; resolves false if the id was not present. */
    delete(id: string): Promise<boolean>;
    /** Remove all entries. */
    clear(): Promise<void>;
}
/** In-memory implementation of MemoryStore for testing and lightweight use. */
export declare class InMemoryStore implements MemoryStore {
    private entries;
    private xxhashIndex;
    get(id: string): Promise<MemoryEntry | undefined>;
    put(entry: MemoryEntry): Promise<void>;
    findByXxhash(hash: string): Promise<MemoryEntry | undefined>;
    getAll(): Promise<MemoryEntry[]>;
    delete(id: string): Promise<boolean>;
    clear(): Promise<void>;
}
//# sourceMappingURL=memory-store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"memory-store.d.ts","sourceRoot":"","sources":["../../../src/memory/memory-store.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAe,MAAM,aAAa,CAAA;AAE3D,+CAA+C;AAC/C,MAAM,WAAW,WAAW;IAC1B,GAAG,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC,CAAA;IACjD,GAAG,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IACtC,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC,CAAA;IAC5D,MAAM,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,CAAA;IAChC,MAAM,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAAA;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAA;CACvB;AAED,+EAA+E;AAC/E,qBAAa,aAAc,YAAW,WAAW;IAC/C,OAAO,CAAC,OAAO,CAAsC;IACrD,OAAO,CAAC,WAAW,CAAiC;IAE9C,GAAG,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC;IAIjD,GAAG,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAKtC,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC;IAM5D,MAAM,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;IAIhC,MAAM,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAOpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAI7B"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
 * In-memory implementation of MemoryStore for testing and lightweight use.
 *
 * Maintains a secondary xxhash -> id index so exact-duplicate lookups are
 * O(1). The index is kept consistent with the primary map on update/delete.
 */
export class InMemoryStore {
    entries = new Map(); // id -> MemoryEntry
    xxhashIndex = new Map(); // xxhash -> id
    /** Fetch an entry by id, or undefined if absent. */
    async get(id) {
        return this.entries.get(id);
    }
    /** Insert or replace an entry keyed by its id. */
    async put(entry) {
        // BUG FIX: re-putting an existing id with different content previously
        // left a stale xxhash -> id mapping behind, so findByXxhash(oldHash)
        // returned an entry whose content no longer matched that hash.
        const previous = this.entries.get(entry.id);
        if (previous && previous.xxhash !== entry.xxhash
            && this.xxhashIndex.get(previous.xxhash) === entry.id) {
            this.xxhashIndex.delete(previous.xxhash);
        }
        this.entries.set(entry.id, entry);
        this.xxhashIndex.set(entry.xxhash, entry.id);
    }
    /** Look up an entry by its exact xxHash64 content digest. */
    async findByXxhash(hash) {
        const id = this.xxhashIndex.get(hash);
        if (!id)
            return undefined;
        return this.entries.get(id);
    }
    /** List every stored entry. */
    async getAll() {
        return Array.from(this.entries.values());
    }
    /** Remove an entry; returns false if the id was not present. */
    async delete(id) {
        const entry = this.entries.get(id);
        if (!entry)
            return false;
        // BUG FIX: only drop the hash mapping if it actually points at this id;
        // another entry sharing the same xxhash may own the mapping.
        if (this.xxhashIndex.get(entry.xxhash) === id) {
            this.xxhashIndex.delete(entry.xxhash);
        }
        return this.entries.delete(id);
    }
    /** Remove all entries and reset the hash index. */
    async clear() {
        this.entries.clear();
        this.xxhashIndex.clear();
    }
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * SimHash implementation for near-duplicate detection.
 * Produces a 64-bit fingerprint where similar content produces similar hashes.
 */
export declare class SimHash {
    private bits;
    /** @param bits Fingerprint width in bits — presumably defaults to 64; confirm in implementation. */
    constructor(bits?: number);
    /** Compute a SimHash fingerprint for the given text. */
    compute(text: string): bigint;
    /** Compute Hamming distance between two fingerprints. */
    hammingDistance(a: bigint, b: bigint): number;
    /** Check if two fingerprints are near-duplicates within threshold. */
    isNearDuplicate(a: bigint, b: bigint, threshold?: number): boolean;
    private tokenize;
}
//# sourceMappingURL=simhash.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"simhash.d.ts","sourceRoot":"","sources":["../../../src/memory/simhash.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAYH,qBAAa,OAAO;IAClB,OAAO,CAAC,IAAI,CAAQ;gBAER,IAAI,SAAK;IAIrB,wDAAwD;IACxD,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IA4B7B,yDAAyD;IACzD,eAAe,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM;IAU7C,sEAAsE;IACtE,eAAe,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,SAAI,GAAG,OAAO;IAI7D,OAAO,CAAC,QAAQ;CAOjB"}
|