underrow 2026.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -0
- package/LICENSE +21 -0
- package/README.md +11 -0
- package/index.js +61 -0
- package/package.json +22 -0
- package/public/index.html +271 -0
- package/src/chunker.js +26 -0
- package/src/density.js +8 -0
- package/src/embedder.js +84 -0
- package/src/server.js +57 -0
- package/src/store.js +131 -0
- package/src/watcher.js +108 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Matt Currier
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# KB
|
|
2
|
+
KnowledgeBase driver
|
|
3
|
+
|
|
4
|
+
* When running, it watches for file changes in its directory
|
|
5
|
+
* When a file changes, it chunks and vector-embeds it in FAISS and stores metadata
|
|
6
|
+
* Information Density = gzip-size / orig_size
|
|
7
|
+
* Provides a simple server that
|
|
8
|
+
* gives a vector search API
|
|
9
|
+
* gives fuzzy text search API
|
|
10
|
+
* hosts a web dashboard
|
|
11
|
+
in
|
package/index.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { resolve } from 'path'
|
|
4
|
+
import { startWatcher } from './src/watcher.js'
|
|
5
|
+
import { createServer } from './src/server.js'
|
|
6
|
+
import { Store } from './src/store.js'
|
|
7
|
+
import { initEmbedder } from './src/embedder.js'
|
|
8
|
+
|
|
9
|
+
// Arguments after the node binary and the script path. `flag` removes every
// option it consumes, so whatever is left afterwards is positional.
const args = process.argv.slice(2)

/**
 * Pull the value that follows option `name` out of `args`.
 *
 * The option and the token after it are both removed from `args`.
 *
 * @param {string} name - Option to look for, e.g. `--port`.
 * @param {*} fallback - Returned when the option is absent or has no
 *   (or an empty) value after it.
 * @returns {*} The token following `name`, or `fallback`.
 */
function flag(name, fallback) {
  const position = args.indexOf(name)
  if (position < 0) {
    return fallback
  }

  const [, value] = args.splice(position, 2)
  return value ? value : fallback
}
|
|
16
|
+
|
|
17
|
+
// Print usage and exit before any option is parsed when help is requested.
if (args.includes('--help') || args.includes('-h')) {
  console.log(`underrow - watch a directory, embed content, search via UI & API

Usage: underrow [dir] [options]

Arguments:
dir Directory to watch (default: current directory, env: KB_WATCH_DIR)

Options:
--port, -p Server port (default: 3737, env: KB_PORT)
--data, -d Data storage directory (default: ./data, env: KB_DATA_DIR)
-h, --help Show this help
`)
  process.exit(0)
}

// Options are consumed first. The inner call runs before the outer one, so the
// short form is removed from `args` even when the long form wins. Only
// positional arguments remain once both option pairs have been read.
const rawPort = flag('--port', flag('-p', process.env.KB_PORT || '3737'))
const PORT = parseInt(rawPort, 10)

// Reject an unusable port immediately instead of failing at listen() time,
// after the watcher has already started indexing.
if (!Number.isInteger(PORT) || PORT < 0 || PORT > 65535) {
  console.error(`Invalid port "${rawPort}": expected an integer between 0 and 65535`)
  process.exit(1)
}

const DATA_DIR = resolve(flag('--data', flag('-d', process.env.KB_DATA_DIR || './data')))
const WATCH_DIR = resolve(args[0] || process.env.KB_WATCH_DIR || process.cwd())
|
|
36
|
+
|
|
37
|
+
/**
 * Boot sequence: print the effective configuration, initialise the embedder,
 * open the store, start the file watcher and finally start the HTTP server.
 */
async function main() {
  const banner = [
    `KB starting...`,
    ` Watch dir : ${WATCH_DIR}`,
    ` Data dir : ${DATA_DIR}`,
    ` Port : ${PORT}`,
  ]
  for (const line of banner) {
    console.log(line)
  }

  console.log('Loading embedding model...')
  const embedder = await initEmbedder()
  console.log('Embedding model ready.')

  const store = new Store(DATA_DIR, embedder.dimensions)
  // The embedder is also attached to the store instance.
  store._embedder = embedder

  startWatcher(WATCH_DIR, store, embedder)

  const onListening = () => {
    console.log(`KB server listening on http://localhost:${PORT}`)
  }
  createServer(store, embedder).listen(PORT, onListening)
}

// Any start-up failure is fatal: report it and exit with a non-zero status.
function onFatal(err) {
  console.error('Fatal error:', err)
  process.exit(1)
}

main().catch(onFatal)
|
package/package.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "underrow",
|
|
3
|
+
"version": "2026.4.1",
|
|
4
|
+
"description": "KnowledgeBase driver - file watcher with vector and fuzzy search",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"underrow": "index.js"
|
|
9
|
+
},
|
|
10
|
+
"scripts": {
|
|
11
|
+
"start": "node index.js",
|
|
12
|
+
"dev": "node --watch index.js"
|
|
13
|
+
},
|
|
14
|
+
"keywords": ["knowledgebase", "vector-search", "faiss", "fuzzy-search"],
|
|
15
|
+
"license": "MIT",
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"chokidar": "^3.6.0",
|
|
18
|
+
"express": "^4.21.0",
|
|
19
|
+
"faiss-node": "^0.5.1",
|
|
20
|
+
"fuse.js": "^7.0.0"
|
|
21
|
+
}
|
|
22
|
+
}
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>KB - Knowledge Base</title>
|
|
7
|
+
<style>
|
|
8
|
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
9
|
+
|
|
10
|
+
body {
|
|
11
|
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
12
|
+
background: #0f1117;
|
|
13
|
+
color: #e1e4e8;
|
|
14
|
+
min-height: 100vh;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
header {
|
|
18
|
+
background: #161b22;
|
|
19
|
+
border-bottom: 1px solid #30363d;
|
|
20
|
+
padding: 1rem 2rem;
|
|
21
|
+
display: flex;
|
|
22
|
+
align-items: center;
|
|
23
|
+
gap: 1rem;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
header h1 {
|
|
27
|
+
font-size: 1.4rem;
|
|
28
|
+
font-weight: 600;
|
|
29
|
+
color: #58a6ff;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
.stats {
|
|
33
|
+
margin-left: auto;
|
|
34
|
+
font-size: 0.85rem;
|
|
35
|
+
color: #8b949e;
|
|
36
|
+
display: flex;
|
|
37
|
+
gap: 1.5rem;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
.stats span { color: #58a6ff; font-weight: 600; }
|
|
41
|
+
|
|
42
|
+
.container {
|
|
43
|
+
max-width: 960px;
|
|
44
|
+
margin: 2rem auto;
|
|
45
|
+
padding: 0 1rem;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
.search-box {
|
|
49
|
+
display: flex;
|
|
50
|
+
gap: 0.5rem;
|
|
51
|
+
margin-bottom: 1.5rem;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
.search-box input {
|
|
55
|
+
flex: 1;
|
|
56
|
+
padding: 0.75rem 1rem;
|
|
57
|
+
background: #161b22;
|
|
58
|
+
border: 1px solid #30363d;
|
|
59
|
+
border-radius: 6px;
|
|
60
|
+
color: #e1e4e8;
|
|
61
|
+
font-size: 1rem;
|
|
62
|
+
outline: none;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
.search-box input:focus { border-color: #58a6ff; }
|
|
66
|
+
|
|
67
|
+
.search-box select {
|
|
68
|
+
padding: 0.75rem;
|
|
69
|
+
background: #161b22;
|
|
70
|
+
border: 1px solid #30363d;
|
|
71
|
+
border-radius: 6px;
|
|
72
|
+
color: #e1e4e8;
|
|
73
|
+
font-size: 0.9rem;
|
|
74
|
+
cursor: pointer;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
.search-box button {
|
|
78
|
+
padding: 0.75rem 1.5rem;
|
|
79
|
+
background: #238636;
|
|
80
|
+
border: none;
|
|
81
|
+
border-radius: 6px;
|
|
82
|
+
color: #fff;
|
|
83
|
+
font-size: 1rem;
|
|
84
|
+
cursor: pointer;
|
|
85
|
+
font-weight: 500;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
.search-box button:hover { background: #2ea043; }
|
|
89
|
+
|
|
90
|
+
.results { display: flex; flex-direction: column; gap: 0.75rem; }
|
|
91
|
+
|
|
92
|
+
.result-card {
|
|
93
|
+
background: #161b22;
|
|
94
|
+
border: 1px solid #30363d;
|
|
95
|
+
border-radius: 8px;
|
|
96
|
+
padding: 1rem 1.25rem;
|
|
97
|
+
transition: border-color 0.15s;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
.result-card:hover { border-color: #58a6ff; }
|
|
101
|
+
|
|
102
|
+
.result-header {
|
|
103
|
+
display: flex;
|
|
104
|
+
justify-content: space-between;
|
|
105
|
+
align-items: center;
|
|
106
|
+
margin-bottom: 0.5rem;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
.result-path {
|
|
110
|
+
font-size: 0.85rem;
|
|
111
|
+
color: #58a6ff;
|
|
112
|
+
font-family: monospace;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
.result-meta {
|
|
116
|
+
font-size: 0.75rem;
|
|
117
|
+
color: #8b949e;
|
|
118
|
+
display: flex;
|
|
119
|
+
gap: 1rem;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
.result-text {
|
|
123
|
+
font-size: 0.9rem;
|
|
124
|
+
color: #c9d1d9;
|
|
125
|
+
line-height: 1.5;
|
|
126
|
+
white-space: pre-wrap;
|
|
127
|
+
max-height: 150px;
|
|
128
|
+
overflow-y: auto;
|
|
129
|
+
font-family: monospace;
|
|
130
|
+
background: #0d1117;
|
|
131
|
+
padding: 0.75rem;
|
|
132
|
+
border-radius: 4px;
|
|
133
|
+
margin-top: 0.5rem;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
.empty-state {
|
|
137
|
+
text-align: center;
|
|
138
|
+
padding: 3rem;
|
|
139
|
+
color: #8b949e;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
.density-bar {
|
|
143
|
+
display: inline-block;
|
|
144
|
+
width: 50px;
|
|
145
|
+
height: 8px;
|
|
146
|
+
background: #21262d;
|
|
147
|
+
border-radius: 4px;
|
|
148
|
+
overflow: hidden;
|
|
149
|
+
vertical-align: middle;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
.density-bar .fill {
|
|
153
|
+
height: 100%;
|
|
154
|
+
border-radius: 4px;
|
|
155
|
+
background: #3fb950;
|
|
156
|
+
}
|
|
157
|
+
</style>
|
|
158
|
+
</head>
|
|
159
|
+
<body>
|
|
160
|
+
<header>
|
|
161
|
+
<h1>KB</h1>
|
|
162
|
+
<div class="stats">
|
|
163
|
+
<div>Files: <span id="stat-files">-</span></div>
|
|
164
|
+
<div>Chunks: <span id="stat-chunks">-</span></div>
|
|
165
|
+
<div>Avg Density: <span id="stat-density">-</span></div>
|
|
166
|
+
</div>
|
|
167
|
+
</header>
|
|
168
|
+
|
|
169
|
+
<div class="container">
|
|
170
|
+
<div class="search-box">
|
|
171
|
+
<input type="text" id="query" placeholder="Search your knowledge base..." autofocus />
|
|
172
|
+
<select id="mode">
|
|
173
|
+
<option value="vector">Vector</option>
|
|
174
|
+
<option value="fuzzy">Fuzzy</option>
|
|
175
|
+
</select>
|
|
176
|
+
<button onclick="doSearch()">Search</button>
|
|
177
|
+
</div>
|
|
178
|
+
|
|
179
|
+
<div class="results" id="results">
|
|
180
|
+
<div class="empty-state">
|
|
181
|
+
<p>Enter a query to search your indexed files.</p>
|
|
182
|
+
</div>
|
|
183
|
+
</div>
|
|
184
|
+
</div>
|
|
185
|
+
|
|
186
|
+
<script>
|
|
187
|
+
// Cached handles to the search controls and the results container.
const queryInput = document.getElementById('query')
const modeSelect = document.getElementById('mode')
const resultsDiv = document.getElementById('results')

// Pressing Enter in the query field runs the search, like the Search button.
queryInput.addEventListener('keydown', (event) => {
  if (event.key !== 'Enter') return
  doSearch()
})
|
|
194
|
+
|
|
195
|
+
/**
 * Fetch index statistics from /api/stats and write them into the header
 * counters.
 *
 * Best-effort: a failed request leaves the previously shown values in place.
 * The failure is logged to the console instead of being silently discarded.
 */
async function loadStats() {
  try {
    const res = await fetch('/api/stats')
    if (!res.ok) {
      throw new Error(`HTTP ${res.status}`)
    }

    const { totalFiles, totalChunks, avgDensity } = await res.json()
    document.getElementById('stat-files').textContent = totalFiles
    document.getElementById('stat-chunks').textContent = totalChunks
    // Show the placeholder rather than throwing when the field is missing.
    document.getElementById('stat-density').textContent =
      typeof avgDensity === 'number' ? avgDensity.toFixed(3) : '-'
  } catch (err) {
    console.warn('Failed to load stats:', err)
  }
}
|
|
204
|
+
|
|
205
|
+
/**
 * Run the query from the search box against the selected backend
 * ("vector" uses POST /api/search/vector, anything else uses
 * GET /api/search/fuzzy) and render the outcome.
 *
 * A blank query is ignored. Network errors and non-2xx responses are shown
 * in the results area.
 */
async function doSearch() {
  const query = queryInput.value.trim()
  if (!query) return

  try {
    const res = modeSelect.value === 'vector'
      ? await fetch('/api/search/vector', {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ query, k: 20 }),
        })
      : await fetch(`/api/search/fuzzy?q=${encodeURIComponent(query)}`)

    const data = await res.json()
    // Error responses carry `{ error }`; surface that message instead of
    // rendering an empty result list.
    if (!res.ok) {
      throw new Error(data?.error ?? `HTTP ${res.status}`)
    }

    renderResults(data.results)
  } catch (err) {
    // Build the message node with textContent so the error text can never be
    // interpreted as markup.
    const box = document.createElement('div')
    box.className = 'empty-state'
    box.textContent = `Error: ${err.message}`
    resultsDiv.replaceChildren(box)
  }
}
|
|
232
|
+
|
|
233
|
+
/**
 * Replace the contents of the results area with one card per search hit.
 *
 * @param {Array<object>|undefined|null} results - Hits carrying `filePath`,
 *   `chunkIndex`, `text` and optionally numeric `score` / `density`.
 *   An empty or missing list shows the "No results found." placeholder.
 */
function renderResults(results) {
  if (!results || results.length === 0) {
    resultsDiv.innerHTML = '<div class="empty-state">No results found.</div>'
    return
  }

  resultsDiv.innerHTML = results.map((r) => {
    // Coerce every interpolated field, so a single malformed hit renders as
    // blanks instead of throwing and aborting the whole list.
    const filePath = escapeHtml(String(r.filePath ?? ''))
    const chunkIndex = escapeHtml(String(r.chunkIndex ?? '-'))
    const text = escapeHtml(String(r.text ?? ''))
    const score = typeof r.score === 'number' ? r.score.toFixed(3) : '-'
    const hasDensity = typeof r.density === 'number' && Number.isFinite(r.density)
    const density = hasDensity ? r.density.toFixed(3) : '-'
    // Keep the bar within 0-100% whatever ratio the server reports.
    const barWidth = hasDensity ? Math.min(Math.max(r.density, 0), 1) * 100 : 0

    return `
      <div class="result-card">
        <div class="result-header">
          <span class="result-path">${filePath}</span>
          <div class="result-meta">
            <span>chunk ${chunkIndex}</span>
            <span>score: ${score}</span>
            <span>
              density: ${density}
              <span class="density-bar">
                <span class="fill" style="width: ${barWidth}%"></span>
              </span>
            </span>
          </div>
        </div>
        <div class="result-text">${text}</div>
      </div>
    `
  }).join('')
}
|
|
258
|
+
|
|
259
|
+
/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {*} str - Value to escape. null/undefined become the empty string;
 *   other non-strings are converted with String().
 * @returns {string} The input with `&`, `<`, `>`, `"` and `'` replaced by
 *   HTML entities.
 */
function escapeHtml(str) {
  return String(str ?? '')
    // The ampersand must be handled first, otherwise the entities produced by
    // the later replacements would be escaped a second time.
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}
|
|
266
|
+
|
|
267
|
+
// Fill the header counters once at page load, then refresh them periodically.
const STATS_REFRESH_MS = 5000

loadStats()
setInterval(loadStats, STATS_REFRESH_MS)
|
|
269
|
+
</script>
|
|
270
|
+
</body>
|
|
271
|
+
</html>
|
package/src/chunker.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
// Target maximum chunk length, in characters.
const DEFAULT_CHUNK_SIZE = 512
// Approximate number of characters carried over from one chunk to the next.
const DEFAULT_OVERLAP = 64

/**
 * Split text into line-aligned chunks of roughly `chunkSize` characters.
 *
 * Lines are appended to the current chunk until the next line would push it
 * past `chunkSize`. The chunk is then emitted and a new one is started,
 * seeded with the last few words of the previous chunk for context.
 * A single line longer than `chunkSize` is never split and becomes an
 * oversized chunk of its own.
 *
 * @param {string} text - Text to split.
 * @param {object} [options]
 * @param {number} [options.chunkSize=512] - Soft limit on chunk length.
 * @param {number} [options.overlap=64] - Approximate characters of overlap;
 *   converted to a word count assuming about five characters per word.
 *   Zero or a negative value disables overlap.
 * @returns {string[]} Trimmed, non-empty chunks in document order.
 */
export function chunkText(text, { chunkSize = DEFAULT_CHUNK_SIZE, overlap = DEFAULT_OVERLAP } = {}) {
  const chunks = []
  // Guard the zero case explicitly: slice(-0) is slice(0) and would carry the
  // ENTIRE previous chunk forward, so chunks would grow without bound.
  const overlapWords = overlap > 0 ? Math.ceil(overlap / 5) : 0
  let current = ''

  for (const line of text.split('\n')) {
    const overflows = current.length + line.length + 1 > chunkSize

    if (overflows && current.length > 0) {
      // A whitespace-only buffer has nothing worth indexing; do not emit it.
      const finished = current.trim()
      if (finished) chunks.push(finished)

      // Keep overlap from the end of the previous chunk.
      current = overlapWords > 0
        ? current.split(/\s+/).slice(-overlapWords).join(' ') + '\n' + line
        : line
    } else {
      current += (current ? '\n' : '') + line
    }
  }

  const tail = current.trim()
  if (tail) chunks.push(tail)

  return chunks
}
|
package/src/density.js
ADDED
package/src/embedder.js
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight local text embedder using character n-gram hashing.
|
|
3
|
+
* No external model download required - creates fixed-size vectors
|
|
4
|
+
* using hash projections of character and word n-grams.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// Length of every embedding vector produced by this module.
const DIMENSIONS = 384
// Character window lengths that are hashed into the vector.
const CHAR_NGRAM_SIZES = [3, 4, 5]
// Word window lengths (single words and adjacent pairs) hashed into the vector.
const WORD_NGRAM_SIZES = [1, 2]

/**
 * 32-bit signed string hash: for each UTF-16 code unit, h = h * 31 + unit,
 * wrapped to a signed 32-bit integer.
 */
function hashCode(str) {
  let acc = 0
  for (let pos = 0; pos < str.length; pos += 1) {
    acc = (Math.imul(acc, 31) + str.charCodeAt(pos)) | 0
  }
  return acc
}

/** Map a string to a bucket in [0, dim); the second modulo folds negative hashes. */
function hashToIndex(str, dim) {
  const remainder = hashCode(str) % dim
  return (remainder + dim) % dim
}

/** Derive +1 or -1 for a string from the low bit of a second, salted hash. */
function hashToSign(str) {
  const lowBit = hashCode(`${str}_sign`) & 1
  return lowBit ? -1 : 1
}

/** All lower-cased character windows of each configured size, smallest size first. */
function extractCharNgrams(text) {
  const lowered = text.toLowerCase()
  return CHAR_NGRAM_SIZES.flatMap((size) => {
    const windows = Math.max(lowered.length - size + 1, 0)
    return Array.from({ length: windows }, (_, start) => lowered.slice(start, start + size))
  })
}

/** All lower-cased word windows (space-joined) of each configured size. */
function extractWordNgrams(text) {
  const tokens = text.toLowerCase().split(/\s+/).filter(Boolean)
  return WORD_NGRAM_SIZES.flatMap((size) => {
    const windows = Math.max(tokens.length - size + 1, 0)
    return Array.from({ length: windows }, (_, start) => tokens.slice(start, start + size).join(' '))
  })
}

/** Scale `vec` to unit length in place and return it; an all-zero vector is left as is. */
function normalize(vec) {
  const length = Math.sqrt(vec.reduce((sum, value) => sum + value * value, 0))
  if (length !== 0) {
    for (let i = 0; i < vec.length; i += 1) {
      vec[i] /= length
    }
  }
  return vec
}

/**
 * Build the embedding for `text` by feature hashing: every n-gram adds a
 * signed weight to one hashed bucket, and the result is L2-normalised.
 * Returns a plain array of DIMENSIONS numbers.
 */
function embedText(text) {
  const buckets = new Float64Array(DIMENSIONS)

  const accumulate = (feature, weight) => {
    buckets[hashToIndex(feature, DIMENSIONS)] += hashToSign(feature) * weight
  }

  // Character n-grams contribute a weight of 1 each.
  for (const gram of extractCharNgrams(text)) {
    accumulate(gram, 1.0)
  }

  // Word n-grams are prefixed so they hash independently of character
  // n-grams, and carry twice the weight.
  for (const gram of extractWordNgrams(text)) {
    accumulate('w_' + gram, 2.0)
  }

  return Array.from(normalize(buckets))
}

/**
 * Create the embedder used by the watcher and the server.
 *
 * @returns {Promise<{dimensions: number, embed: function(string): Promise<number[]>}>}
 */
export async function initEmbedder() {
  const embed = async (text) => embedText(text)
  return { dimensions: DIMENSIONS, embed }
}
|
package/src/server.js
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import express from 'express'
|
|
2
|
+
import { resolve } from 'path'
|
|
3
|
+
import { fileURLToPath } from 'url'
|
|
4
|
+
import { dirname } from 'path'
|
|
5
|
+
|
|
6
|
+
const __dirname = dirname(fileURLToPath(import.meta.url))
|
|
7
|
+
|
|
8
|
+
/**
 * Build the Express application: static dashboard plus the JSON search API.
 *
 * Routes:
 *   POST /api/search/vector  body { query, k = 10 }           -> { results }
 *   GET  /api/search/fuzzy   ?q=&limit=20&threshold=0.4      -> { results }
 *   GET  /api/stats                                           -> store.getStats()
 *   GET  /api/documents                                       -> store.getAllMetadata()
 *
 * @param {object} store - Provides vectorSearch, fuzzySearch, getStats and
 *   getAllMetadata.
 * @param {object} embedder - Provides an async `embed(text)`.
 * @returns {object} The configured Express app (not yet listening).
 */
export function createServer(store, embedder) {
  const app = express()

  app.use(express.json())
  app.use(express.static(resolve(__dirname, '..', 'public')))

  // Vector search
  app.post('/api/search/vector', async (req, res) => {
    const { query, k = 10 } = req.body ?? {}

    // Reject non-string and blank queries as client errors instead of
    // letting them fail inside the embedder and surface as a 500.
    if (typeof query !== 'string' || !query.trim()) {
      return res.status(400).json({ error: 'query is required' })
    }

    const topK = Number(k)
    if (!Number.isInteger(topK) || topK < 1) {
      return res.status(400).json({ error: 'k must be a positive integer' })
    }

    try {
      const vec = await embedder.embed(query)
      const results = store.vectorSearch(vec, topK)
      res.json({ results })
    } catch (err) {
      res.status(500).json({ error: err.message })
    }
  })

  // Fuzzy text search
  app.get('/api/search/fuzzy', (req, res) => {
    const { q, limit = 20, threshold = 0.4 } = req.query

    // A repeated ?q= parameter arrives as an array; only a plain string is valid.
    if (typeof q !== 'string' || !q) {
      return res.status(400).json({ error: 'q query parameter is required' })
    }

    // Fall back to the defaults when the values do not parse.
    const parsedLimit = parseInt(limit, 10)
    const parsedThreshold = parseFloat(threshold)

    try {
      const results = store.fuzzySearch(q, {
        limit: Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : 20,
        threshold: Number.isFinite(parsedThreshold) ? parsedThreshold : 0.4,
      })
      res.json({ results })
    } catch (err) {
      res.status(500).json({ error: err.message })
    }
  })

  // Stats
  app.get('/api/stats', (req, res) => {
    res.json(store.getStats())
  })

  // All metadata
  app.get('/api/documents', (req, res) => {
    res.json(store.getAllMetadata())
  })

  return app
}
|
package/src/store.js
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import faiss from 'faiss-node'
|
|
2
|
+
const { IndexFlatIP } = faiss
|
|
3
|
+
import { writeFileSync, readFileSync, existsSync, mkdirSync } from 'fs'
|
|
4
|
+
import { join } from 'path'
|
|
5
|
+
import Fuse from 'fuse.js'
|
|
6
|
+
|
|
7
|
+
/**
 * Persistent chunk store: a FAISS inner-product index plus a parallel
 * metadata array, both saved under `dataDir`.
 *
 * Invariant: `metadata[i]` describes the i-th vector in the index. Search
 * results are mapped back to metadata purely by position, so the two
 * structures must always have the same length and order.
 */
export class Store {
  /**
   * @param {string} dataDir - Directory for `faiss.index` and `metadata.json`
   *   (created if missing).
   * @param {number} dimensions - Length of every vector added to the index.
   */
  constructor(dataDir, dimensions) {
    this.dataDir = dataDir
    this.dimensions = dimensions
    this.indexPath = join(dataDir, 'faiss.index')
    this.metaPath = join(dataDir, 'metadata.json')

    mkdirSync(dataDir, { recursive: true })

    this.metadata = [] // array of { id, filePath, chunkIndex, text, density, mtime }
    this.index = new IndexFlatIP(dimensions) // inner product (cosine sim on normalized vecs)

    this._load()
  }

  /**
   * Restore metadata and index from disk. Unreadable files, or files that
   * disagree with each other, result in an empty store.
   */
  _load() {
    if (existsSync(this.metaPath)) {
      try {
        const parsed = JSON.parse(readFileSync(this.metaPath, 'utf8'))
        this.metadata = Array.isArray(parsed) ? parsed : []
      } catch (err) {
        console.warn(`Ignoring unreadable metadata at ${this.metaPath}: ${err.message}`)
        this.metadata = []
      }
    }

    if (existsSync(this.indexPath) && this.metadata.length > 0) {
      try {
        this.index = IndexFlatIP.read(this.indexPath)
      } catch (err) {
        console.warn(`Ignoring unreadable index at ${this.indexPath}: ${err.message}`)
        this.index = new IndexFlatIP(this.dimensions)
      }
    }

    // Metadata rows are matched to vectors by position. If the two files
    // disagree (missing or corrupt index, different embedding size) every
    // lookup would be wrong, so start over with an empty store.
    const consistent =
      this.index.ntotal() === this.metadata.length &&
      this.index.getDimension() === this.dimensions
    if (!consistent) {
      console.warn('Stored index and metadata are out of sync; starting with an empty store.')
      this.metadata = []
      this.index = new IndexFlatIP(this.dimensions)
    }
  }

  /** Write metadata to disk, and the index as well when it is non-empty. */
  save() {
    writeFileSync(this.metaPath, JSON.stringify(this.metadata, null, 2))
    if (this.metadata.length > 0) {
      this.index.write(this.indexPath)
    }
  }

  /**
   * Remove every chunk that belongs to `filePath` and persist the result.
   * Does nothing when the file has no chunks.
   *
   * @param {string} filePath - Path exactly as it was passed to addChunks.
   */
  removeFile(filePath) {
    const removedPositions = []
    const remaining = []

    this.metadata.forEach((entry, position) => {
      if (entry.filePath === filePath) {
        removedPositions.push(position)
      } else {
        remaining.push(entry)
      }
    })

    if (removedPositions.length === 0) return

    // faiss-node documents Index#removeIds; `reconstruct` is not part of its
    // documented API. NOTE(review): this relies on a flat index compacting
    // the surviving vectors in their original order, which is what keeps
    // them aligned with `remaining` - confirm against the faiss-node docs.
    this.index.removeIds(removedPositions)

    // Re-number so that `id` keeps matching the row position in the index.
    remaining.forEach((entry, position) => {
      entry.id = position
    })

    this.metadata = remaining
    this.save()
  }

  /**
   * Append the chunks of one file together with their vectors and persist.
   *
   * @param {string} filePath - Identifier stored with every chunk.
   * @param {string[]} chunks - Chunk texts.
   * @param {number[][]} vectors - One embedding per chunk.
   * @param {number[]} densities - One density value per chunk.
   * @param {Date} mtime - File modification time, stored as an ISO string.
   * @throws {Error} When the three arrays differ in length; nothing is added.
   */
  addChunks(filePath, chunks, vectors, densities, mtime) {
    if (vectors.length !== chunks.length || densities.length !== chunks.length) {
      throw new Error('addChunks: chunks, vectors and densities must have the same length')
    }

    const modified = new Date(mtime).toISOString()

    for (let i = 0; i < chunks.length; i++) {
      this.metadata.push({
        id: this.metadata.length,
        filePath,
        chunkIndex: i,
        text: chunks[i],
        density: densities[i],
        mtime: modified,
      })
      this.index.add(vectors[i])
    }
    this.save()
  }

  /**
   * Return the `k` chunks most similar to `queryVector`.
   *
   * @param {number[]} queryVector - Embedding of the query.
   * @param {number} [k=10] - Maximum number of hits; capped at the number of
   *   stored chunks.
   * @returns {object[]} Metadata rows with a `score` field added.
   */
  vectorSearch(queryVector, k = 10) {
    const available = Math.min(this.metadata.length, this.index.ntotal())
    const clampedK = Math.min(Math.floor(Number(k)), available)
    // Also covers NaN, zero and negative k.
    if (!(clampedK >= 1)) return []

    const result = this.index.search(queryVector, clampedK)

    const hits = []
    result.labels.forEach((label, rank) => {
      const entry = this.metadata[label]
      // Skip labels without a metadata row (e.g. -1) instead of emitting
      // a result that has only a score.
      if (entry) {
        hits.push({ ...entry, score: result.distances[rank] })
      }
    })
    return hits
  }

  /**
   * Fuzzy-match `query` against chunk text and file paths.
   *
   * @param {string} query
   * @param {object} [options]
   * @param {number} [options.threshold=0.4] - Fuse.js match threshold.
   * @param {number} [options.limit=20] - Maximum number of hits.
   * @returns {object[]} Metadata rows with a `score` field added.
   */
  fuzzySearch(query, options = {}) {
    const fuse = new Fuse(this.metadata, {
      keys: ['text', 'filePath'],
      includeScore: true,
      threshold: options.threshold ?? 0.4,
    })

    const results = fuse.search(query, { limit: options.limit ?? 20 })
    return results.map(r => ({
      ...r.item,
      score: 1 - r.score, // invert so higher = better
    }))
  }

  /** Summary counters: chunk count, distinct files, and mean chunk density. */
  getStats() {
    const files = new Set(this.metadata.map(m => m.filePath))
    return {
      totalChunks: this.metadata.length,
      totalFiles: files.size,
      files: [...files],
      avgDensity: this.metadata.length > 0
        ? this.metadata.reduce((sum, m) => sum + m.density, 0) / this.metadata.length
        : 0,
    }
  }

  /** The live metadata array (not a copy). */
  getAllMetadata() {
    return this.metadata
  }
}
|
package/src/watcher.js
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import chokidar from 'chokidar'
|
|
2
|
+
import { readFileSync, statSync } from 'fs'
|
|
3
|
+
import { relative, resolve } from 'path'
|
|
4
|
+
import { chunkText } from './chunker.js'
|
|
5
|
+
import { informationDensity } from './density.js'
|
|
6
|
+
|
|
7
|
+
// File-name suffixes (lower-case, including the dot) that are indexed.
// Dot-files such as `.env` appear here because their whole name is the suffix.
const TEXT_EXTENSIONS = new Set([
  '.txt', '.md', '.js', '.ts', '.jsx', '.tsx', '.py', '.rb', '.go',
  '.rs', '.java', '.c', '.cpp', '.h', '.hpp', '.css', '.html', '.xml',
  '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf', '.sh',
  '.bash', '.zsh', '.fish', '.sql', '.csv', '.log', '.env', '.gitignore',
  '.dockerfile', '.makefile', '.cmake', '.gradle', '.properties',
])

// Glob patterns the watcher never descends into or reports.
const IGNORE_PATTERNS = [
  '**/node_modules/**',
  '**/data/**',
  '**/.git/**',
  '**/dist/**',
  '**/build/**',
  '**/*.lock',
  '**/package-lock.json',
]

/**
 * Decide from the file name alone whether a path should be indexed as text.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {boolean} true when the file name has no dot at all
 *   (e.g. `Makefile`, `LICENSE`) or its suffix is in TEXT_EXTENSIONS.
 */
function isTextFile(filePath) {
  // Look only at the last path segment, so a dot in a directory name
  // ("my.dir/README") is not mistaken for an extension.
  const nameStart = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\')) + 1
  const fileName = filePath.slice(nameStart)

  const dot = fileName.lastIndexOf('.')
  // No dot means no extension. This must be checked explicitly: slicing from
  // index -1 would yield the last character rather than an empty string.
  if (dot === -1) return true

  return TEXT_EXTENSIONS.has(fileName.slice(dot).toLowerCase())
}
|
|
29
|
+
|
|
30
|
+
/**
 * (Re-)index one file: read it, split it into chunks, compute density and
 * embedding for every chunk, then replace the file's entries in the store.
 *
 * Files that are not recognised as text, or that cannot be read, are skipped.
 * A file that has become empty has its previous entries removed.
 *
 * @param {string} filePath - Path of the file to index.
 * @param {object} store - Provides removeFile and addChunks.
 * @param {object} embedder - Provides an async `embed(text)`.
 */
async function processFile(filePath, store, embedder) {
  if (!isTextFile(filePath)) return

  let text
  try {
    text = readFileSync(filePath, 'utf8')
  } catch {
    // Unreadable file: skip it.
    return
  }

  // Entries are keyed by the path relative to the working directory.
  const relPath = relative(process.cwd(), filePath)

  if (!text.trim()) {
    // The file no longer has content: drop whatever was indexed for it
    // earlier so searches stop returning stale chunks.
    store.removeFile(relPath)
    return
  }

  const stat = statSync(filePath)

  console.log(`Processing: ${relPath}`)

  const chunks = chunkText(text)
  const densities = chunks.map(c => informationDensity(c))
  const vectors = []

  for (const chunk of chunks) {
    const vec = await embedder.embed(chunk)
    vectors.push(vec)
  }

  // Swap old for new only after every embedding succeeded, so a failure
  // above leaves the previously indexed version in place.
  store.removeFile(relPath)
  store.addChunks(relPath, chunks, vectors, densities, stat.mtime)

  const avgDensity = densities.length > 0
    ? densities.reduce((a, b) => a + b, 0) / densities.length
    : 0
  console.log(` Indexed ${chunks.length} chunks (avg density: ${avgDensity.toFixed(3)})`)
}
|
|
62
|
+
|
|
63
|
+
/**
 * Watch `dir` and keep the store in sync with it.
 *
 * Added and changed files are queued and indexed one at a time; deleted
 * files are removed from the store. Files already present when the watcher
 * starts are reported as additions and therefore indexed too.
 *
 * @param {string} dir - Directory to watch.
 * @param {object} store - Passed to processFile; also provides removeFile.
 * @param {object} embedder - Passed to processFile.
 * @returns {object} The chokidar watcher instance.
 */
export function startWatcher(dir, store, embedder) {
  const watcher = chokidar.watch(dir, {
    ignored: IGNORE_PATTERNS,
    persistent: true,
    ignoreInitial: false,
  })

  const queue = []
  let processing = false

  // Drain the queue sequentially. The flag ensures only one drain loop runs,
  // however many events arrive while a file is being processed.
  async function processQueue() {
    if (processing) return
    processing = true

    while (queue.length > 0) {
      const { filePath } = queue.shift()
      try {
        await processFile(filePath, store, embedder)
      } catch (err) {
        console.error(`Error processing ${filePath}:`, err.message)
      }
    }

    processing = false
  }

  function enqueue(filePath) {
    // Deduplicate on the resolved path, because that is what the queue stores;
    // comparing the raw event path would miss entries for relative paths.
    const absolutePath = resolve(filePath)
    if (!queue.some(q => q.filePath === absolutePath)) {
      queue.push({ filePath: absolutePath })
    }
    // processQueue handles per-file errors itself; nothing to await here.
    processQueue()
  }

  watcher
    .on('add', enqueue)
    .on('change', enqueue)
    .on('unlink', filePath => {
      const relPath = relative(process.cwd(), resolve(filePath))
      console.log(`Removed: ${relPath}`)
      try {
        store.removeFile(relPath)
      } catch (err) {
        // A store failure must not escape the event handler and take the
        // process down.
        console.error(`Error removing ${relPath}:`, err.message)
      }
    })
    // Without a listener an emitted 'error' event is thrown as an exception.
    .on('error', err => {
      console.error('Watcher error:', err.message)
    })

  console.log(`Watching ${dir} for changes...`)
  return watcher
}
|