@rekal/mem 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/db-BMh1OP4b.mjs +294 -0
- package/dist/doc-DnYN4jAU.mjs +116 -0
- package/dist/embed-rUMZxqed.mjs +100 -0
- package/dist/fs-DMp26Byo.mjs +32 -0
- package/dist/glob.d.mts +27 -0
- package/dist/glob.mjs +132 -0
- package/dist/index.d.mts +1465 -0
- package/dist/index.mjs +351 -0
- package/dist/llama-CT3dc9Cn.mjs +75 -0
- package/dist/models-DFQSgBNr.mjs +77 -0
- package/dist/openai-j2_2GM4J.mjs +76 -0
- package/dist/progress-B1JdNapX.mjs +263 -0
- package/dist/query-VFSpErTB.mjs +125 -0
- package/dist/runtime.node-DlQPaGrV.mjs +35 -0
- package/dist/search-BllHWtZF.mjs +166 -0
- package/dist/store-DE7S35SS.mjs +137 -0
- package/dist/transformers-CJ3QA2PK.mjs +55 -0
- package/dist/uri-CehXVDGB.mjs +28 -0
- package/dist/util-DNyrmcA3.mjs +11 -0
- package/dist/vfs-CNQbkhsf.mjs +222 -0
- package/foo.ts +3 -0
- package/foo2.ts +20 -0
- package/package.json +61 -0
- package/src/context.ts +77 -0
- package/src/db.ts +464 -0
- package/src/doc.ts +163 -0
- package/src/embed/base.ts +122 -0
- package/src/embed/index.ts +67 -0
- package/src/embed/llama.ts +111 -0
- package/src/embed/models.ts +104 -0
- package/src/embed/openai.ts +95 -0
- package/src/embed/transformers.ts +81 -0
- package/src/frecency.ts +58 -0
- package/src/fs.ts +36 -0
- package/src/glob.ts +163 -0
- package/src/index.ts +15 -0
- package/src/log.ts +60 -0
- package/src/md.ts +204 -0
- package/src/progress.ts +121 -0
- package/src/query.ts +131 -0
- package/src/runtime.bun.ts +33 -0
- package/src/runtime.node.ts +47 -0
- package/src/search.ts +230 -0
- package/src/snippet.ts +248 -0
- package/src/sqlite.ts +1 -0
- package/src/store.ts +180 -0
- package/src/uri.ts +28 -0
- package/src/util.ts +21 -0
- package/src/vfs.ts +257 -0
- package/test/doc.test.ts +61 -0
- package/test/fixtures/ignore-test/keep.md +0 -0
- package/test/fixtures/ignore-test/skip.log +0 -0
- package/test/fixtures/ignore-test/sub/keep.md +0 -0
- package/test/fixtures/store/agent/index.md +9 -0
- package/test/fixtures/store/agent/lessons.md +21 -0
- package/test/fixtures/store/agent/soul.md +28 -0
- package/test/fixtures/store/agent/tools.md +25 -0
- package/test/fixtures/store/concepts/frecency.md +30 -0
- package/test/fixtures/store/concepts/index.md +9 -0
- package/test/fixtures/store/concepts/memory-coherence.md +33 -0
- package/test/fixtures/store/concepts/rag.md +27 -0
- package/test/fixtures/store/index.md +9 -0
- package/test/fixtures/store/projects/index.md +9 -0
- package/test/fixtures/store/projects/rekall-inc/architecture.md +41 -0
- package/test/fixtures/store/projects/rekall-inc/decisions/index.md +9 -0
- package/test/fixtures/store/projects/rekall-inc/decisions/no-military.md +20 -0
- package/test/fixtures/store/projects/rekall-inc/index.md +28 -0
- package/test/fixtures/store/user/family.md +13 -0
- package/test/fixtures/store/user/index.md +9 -0
- package/test/fixtures/store/user/preferences.md +29 -0
- package/test/fixtures/store/user/profile.md +29 -0
- package/test/fs.test.ts +15 -0
- package/test/glob.test.ts +190 -0
- package/test/md.test.ts +177 -0
- package/test/query.test.ts +105 -0
- package/test/uri.test.ts +46 -0
- package/test/util.test.ts +62 -0
- package/test/vfs.test.ts +164 -0
- package/tsconfig.json +3 -0
- package/tsdown.config.ts +8 -0
package/src/db.ts
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
import type { EmbedderChunk } from "./embed/index.ts"
|
|
2
|
+
import type { Database } from "./sqlite.ts"
|
|
3
|
+
import type { StoreChunk } from "./store.ts"
|
|
4
|
+
|
|
5
|
+
import { openDatabase } from "./sqlite.ts"
|
|
6
|
+
|
|
7
|
+
export type { Database }
|
|
8
|
+
|
|
9
|
+
/** A row of the `docs` table (see `Db.init` for the schema). */
export type DocRow = {
  id: number
  path: string
  // Content hash of the full markdown text; compared against vec_hash to
  // decide whether embeddings are stale.
  hash: string
  body: string
  // Hash of the content that was last embedded; NULL/missing until embedded.
  vec_hash?: string
  description: string
  title: string
  tags: string
  entities: string
  updated_at: string
  // Last time a sync pass saw this doc; used by deleteStaleDocs.
  synced_at?: string
  deadline?: number
}

/** One KNN hit from the `vec` virtual table (see `Db.searchVec`). */
export type VecResult = {
  doc_id: number
  path: string
  seq: number
  // Cosine distance in [0, 2]; score = 1 - distance/2 maps it to [0, 1].
  distance: number
  score: number
  rank?: number
}

/** One full-text hit from `docs_fts` (see `Db.searchFts`). */
export type FTSResult = {
  rowid: number
  // bm25() value — lower (more negative) is better, hence ORDER BY score.
  score: number
  rank?: number
}

/** Common options for FTS and vector search. */
export type DbSearchOptions = {
  limit?: number
  scope?: string[] // path prefixes to limit search to (e.g. ["folder1/", "folder2/sub"])
}

const SEARCH_LIMIT = 20
const STOPWORD_THRESHOLD = 0.3 // terms in >30% of docs are stop-word candidates
const STOPWORD_MIN_DOCS = 10 // terms must be in more than 10 docs to be considered stop words
const STOPWORD_LIMIT = 1000 // max number of stop words to return
|
|
48
|
+
|
|
49
|
+
export function hasEmbedding<T extends EmbedderChunk>(c: T): c is T & { embedding: number[] } {
|
|
50
|
+
return Array.isArray(c.embedding)
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
export function assertEmbeddings<T extends EmbedderChunk>(
|
|
54
|
+
chunks: T[]
|
|
55
|
+
): asserts chunks is (T & { embedding: number[] })[] {
|
|
56
|
+
for (const c of chunks) {
|
|
57
|
+
if (!hasEmbedding(c)) throw new Error(`Chunk is missing embedding: ${JSON.stringify(c)}`)
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/**
 * SQLite-backed document index. Owns four stores:
 *  - `docs`      — one row per markdown doc (metadata + body)
 *  - `docs_fts`  — external-content FTS5 index kept in sync via triggers
 *  - `vec`       — optional vec0 virtual table of chunk embeddings (lazy)
 *  - `meta` / `cache` — small key/value tables
 */
export class Db {
  #db: Database
  // Cached state of the `vec` virtual table: whether it exists, its column
  // dims (parsed from sqlite_master), and whether initVec ran this session.
  #vec?: { exists: boolean; dims?: number; init?: boolean }

  private constructor(db: Database) {
    this.#db = db
    this.init()
  }

  /** Open (or create) the database at `dbPath` and run idempotent schema setup. */
  static async load(dbPath: string) {
    return new Db(await openDatabase(dbPath))
  }

  /** Idempotent schema setup: pragmas, tables, FTS index + sync triggers. */
  private init() {
    this.#db.run("PRAGMA journal_mode = WAL")
    this.#db.run("PRAGMA foreign_keys = ON")
    this.#db.run("PRAGMA busy_timeout = 5000")

    this.#db.run(`
      CREATE TABLE IF NOT EXISTS docs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path TEXT NOT NULL UNIQUE,
        hash TEXT NOT NULL,
        vec_hash TEXT,
        body TEXT NOT NULL DEFAULT '',
        description TEXT NOT NULL DEFAULT '',
        title TEXT NOT NULL DEFAULT '',
        tags TEXT NOT NULL DEFAULT '',
        entities TEXT NOT NULL DEFAULT '',
        updated_at TEXT NOT NULL,
        synced_at TEXT,
        deadline REAL
      )
    `)

    // NOTE(review): `path` is already UNIQUE (which creates an implicit
    // index), so idx_docs_path is likely redundant — confirm before removing.
    this.#db.run(`CREATE INDEX IF NOT EXISTS idx_docs_path ON docs(path)`)
    this.#db.run(`CREATE INDEX IF NOT EXISTS idx_docs_hash ON docs(hash)`)

    // Content-synced FTS5: reads content from docs table, no duplication.
    // Fields ordered by BM25 weight: entities(10), tags(8), description(5), title(3), body(1)
    this.#db.run(`
      CREATE VIRTUAL TABLE IF NOT EXISTS docs_fts USING fts5(
        entities, tags, description, title, body,
        content='docs',
        content_rowid='id',
        tokenize='porter unicode61'
      )
    `)

    // Triggers to keep FTS in sync with docs table
    this.#db.run(`
      CREATE TRIGGER IF NOT EXISTS docs_fts_insert AFTER INSERT ON docs BEGIN
        INSERT INTO docs_fts(rowid, entities, tags, description, title, body)
        VALUES (new.id, new.entities, new.tags, new.description, new.title, new.body);
      END
    `)

    this.#db.run(`
      CREATE TRIGGER IF NOT EXISTS docs_fts_delete AFTER DELETE ON docs BEGIN
        INSERT INTO docs_fts(docs_fts, rowid, entities, tags, description, title, body)
        VALUES ('delete', old.id, old.entities, old.tags, old.description, old.title, old.body);
      END
    `)

    // Only re-index when a searchable column actually changed (the WHEN
    // clause skips metadata-only updates like touchDoc/markEmbedded).
    this.#db.run(`
      CREATE TRIGGER IF NOT EXISTS docs_fts_update AFTER UPDATE ON docs
      WHEN old.body != new.body
        OR old.title != new.title
        OR old.description != new.description
        OR old.tags != new.tags
        OR old.entities != new.entities
      BEGIN
        INSERT INTO docs_fts(docs_fts, rowid, entities, tags, description, title, body)
        VALUES ('delete', old.id, old.entities, old.tags, old.description, old.title, old.body);
        INSERT INTO docs_fts(rowid, entities, tags, description, title, body)
        VALUES (new.id, new.entities, new.tags, new.description, new.title, new.body);
      END
    `)

    // FTS5 vocabulary table for IDF-based term weighting
    this.#db.run(`CREATE VIRTUAL TABLE IF NOT EXISTS docs_vocab USING fts5vocab('docs_fts', 'row')`)

    this.#db.run(`
      CREATE TABLE IF NOT EXISTS meta (
        key TEXT PRIMARY KEY,
        value TEXT
      )
    `)

    this.#db.run(`
      CREATE TABLE IF NOT EXISTS cache (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL,
        accessed_at TEXT NOT NULL
      )
    `)
  }

  /** Drop everything and recreate an empty schema. */
  reset() {
    // Drop triggers first, then FTS (which references docs), then docs
    this.#db.run(`DROP TRIGGER IF EXISTS docs_fts_insert`)
    this.#db.run(`DROP TRIGGER IF EXISTS docs_fts_delete`)
    this.#db.run(`DROP TRIGGER IF EXISTS docs_fts_update`)
    this.#db.run(`DROP TABLE IF EXISTS docs_fts`)
    this.#db.run(`DROP TABLE IF EXISTS vec`)
    this.#db.run(`DROP TABLE IF EXISTS cache`)
    this.#db.run(`DROP TABLE IF EXISTS docs`)
    // NOTE(review): docs_vocab is not dropped here — confirm it is safe to
    // leave the fts5vocab table behind across a reset.
    this.#db.run(`DROP TABLE IF EXISTS meta`)
    this.#db.run(`VACUUM`)
    this.#vec = { exists: false }
    this.init()
  }

  /**
   * Lazily create the `vec` table with the given embedding dimensionality.
   * @throws if a `vec` table already exists with different dims.
   */
  private initVec(dims: number) {
    if (this.vec.init) return
    const existingDims = this.vec.dims
    if (existingDims && existingDims !== dims)
      throw new Error(
        `Vector dimension mismatch: existing **vec** has \`${existingDims}\` dims, but got \`${dims}\`.\n` +
          `Run \`rekal reset\` and \`rekal sync\` to recreate with the correct dimensions.`
      )
    // `+path` is an auxiliary (unindexed) column; cosine distance metric.
    this.#db.run(
      `CREATE VIRTUAL TABLE IF NOT EXISTS vec USING vec0(
        doc_id INTEGER NOT NULL,
        seq INTEGER NOT NULL,
        +path TEXT NOT NULL,
        embedding float[${dims}] distance_metric=cosine
      )`
    )
    this.#vec = { dims, exists: true, init: true }
  }

  // --- Docs ---

  /** Fetch one doc by numeric id or by path. */
  getDoc(from: string | number) {
    const field = typeof from === "number" ? "id" : "path"
    return this.#db.query(`SELECT * FROM docs WHERE ${field} = ?`).get(from) as DocRow | undefined
  }

  /**
   * Fetch docs keyed by id. With no argument, returns all docs; otherwise
   * looks up by ids or paths (the first element decides which — the list is
   * assumed homogeneous).
   */
  getDocs(from?: (string | number)[]) {
    let ret: DocRow[]

    if (!from) ret = this.#db.query(`SELECT * FROM docs`).all() as DocRow[]
    else {
      const field = typeof from[0] === "number" ? "id" : "path"
      const placeholders = from.map(() => "?").join(",")
      ret = this.#db
        .query(`SELECT * FROM docs WHERE ${field} IN (${placeholders})`)
        .all(...from) as DocRow[]
    }
    return new Map(ret.map((row) => [row.id, row]))
  }

  /**
   * Upsert a doc by path and return its id. On conflict the update does NOT
   * touch vec_hash, so existing embeddings stay comparable against the new
   * content hash.
   */
  addDoc(row: Omit<DocRow, "id">) {
    const result = this.#db
      .query(
        `INSERT INTO docs (path, hash, body, description, title, tags, entities, updated_at, synced_at)
        VALUES($path, $hash, $body, $description, $title, $tags, $entities, $updated_at, $synced_at)
        ON CONFLICT(path) DO UPDATE SET
          hash = excluded.hash,
          body = excluded.body,
          description = excluded.description,
          title = excluded.title,
          tags = excluded.tags,
          entities = excluded.entities,
          updated_at = excluded.updated_at,
          synced_at = excluded.synced_at
        RETURNING id`
      )
      .get(row) as { id: number }
    return result.id
  }

  /** Delete a doc's row and/or its embeddings, per the `tables` flags. */
  deleteDoc(id: number, tables: { docs?: boolean; vec?: boolean } = {}) {
    // FTS is auto-synced via triggers when docs are deleted/updated
    if (tables.vec) this.deleteEmbeddings(id)
    if (tables.docs) this.#db.query(`DELETE FROM docs WHERE id = ?`).run(id)
  }

  /** Cached probe of the `vec` table: existence + dims from sqlite_master. */
  get vec() {
    if (this.#vec) return this.#vec
    const row = this.#db
      .query(`SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'vec'`)
      .get() as { sql: string } | undefined
    const match = row?.sql.match(/embedding float\[(\d+)\]/)
    this.#vec = { dims: match ? parseInt(match[1]) : undefined, exists: !!row?.sql }
    return this.#vec
  }

  /** Summary counters for status reporting (doc/vec/cache counts, db size…). */
  getStatus() {
    const count = (sql: string) => (this.#db.query(sql).get() as { n: number }).n
    return {
      cache: count(`SELECT count(*) as n FROM cache`),
      dbSize: (
        this.#db
          .query(`SELECT page_count * page_size as n FROM pragma_page_count, pragma_page_size`)
          .get() as { n: number }
      ).n,
      docs: count(`SELECT count(*) as n FROM docs`),
      docsWithDescription: count(`SELECT count(*) as n FROM docs WHERE description != ''`),
      lastSync: (
        this.#db.query(`SELECT max(synced_at) as t FROM docs`).get() as { t: string | null }
      ).t,
      unembedded: count(
        `SELECT count(*) as n FROM docs WHERE vec_hash IS NULL OR vec_hash != hash`
      ),
      vecDims: this.vec.dims,
      vecs: this.vec.exists ? count(`SELECT count(*) as n FROM vec`) : 0,
      vocabTerms: count(`SELECT count(DISTINCT term) as n FROM docs_vocab`),
    }
  }

  /** Wrap `fn` in a database transaction (delegates to the driver). */
  transaction<A extends any[], T>(fn: (...args: A) => T) {
    return this.#db.transaction(fn)
  }

  /** Docs whose content hash differs from the hash last embedded. */
  getUnembeddedDocs(): DocRow[] {
    return this.#db
      .query(`SELECT * FROM docs
        WHERE vec_hash IS NULL OR vec_hash != hash
        ORDER BY path`)
      .all() as DocRow[]
  }

  /** Mark a doc as seen by the current sync pass (sets synced_at to now). */
  touchDoc(id: number) {
    this.#db.query(`UPDATE docs SET synced_at = ? WHERE id = ?`).run(new Date().toISOString(), id)
  }

  /** Record which content hash the doc's embeddings were computed from. */
  markEmbedded(id: number, docHash: string) {
    this.#db.query(`UPDATE docs SET vec_hash = ? WHERE id = ?`).run(docHash, id)
  }

  /** Delete docs not seen since the given sync timestamp, optionally scoped to a path prefix. */
  deleteStaleDocs(syncedBefore: string, prefix?: string): number {
    let query = `SELECT id FROM docs WHERE synced_at IS NULL OR synced_at < ?`
    const params = [syncedBefore]
    if (prefix) {
      query += ` AND path LIKE ? || '%'`
      params.push(prefix)
    }
    const stale = this.#db.query(query).all(...params) as { id: number }[]
    for (const { id } of stale) {
      this.deleteDoc(id, { docs: true, vec: true })
    }
    return stale.length
  }

  // --- FTS ---
  // FTS is auto-synced via triggers on the docs table.

  /** Scoped FTS search: only match docs whose path starts with one of the given prefixes */
  searchFts(query: string, opts?: DbSearchOptions): FTSResult[] {
    if (opts?.scope?.length === 0) return [] // empty scope means no results
    const scope = opts?.scope ?? []
    const scopeQuery =
      scope.length === 0 ? "" : `AND (${scope.map(() => `d.path LIKE ? || '%'`).join(" OR ")})`
    // bm25 weights mirror the docs_fts column order declared in init().
    return this.#db
      .query(
        `SELECT f.rowid, bm25(docs_fts, 10, 8, 5, 3, 1) as score
        FROM docs_fts f
        ${scope.length > 0 ? "JOIN docs d ON d.id = f.rowid" : ""}
        WHERE docs_fts MATCH ?
        ${scopeQuery}
        ORDER BY score
        LIMIT ?`
      )
      .all(query, ...scope, opts?.limit ?? SEARCH_LIMIT) as FTSResult[]
  }

  /**
   * Gets weights for high-frequency terms.
   * Returns term -> clamped IDF for terms whose document frequency exceeds
   * both the relative threshold and the absolute minimum.
   * Note: Truly common words will result in an IDF of 0 or less.
   */
  getStopWords(): Map<string, number> {
    // 1. Get total doc count (N) first
    const totalDocs =
      (this.#db.query("SELECT count(*) as n FROM docs").get() as { n: number } | undefined)?.n ?? 0

    if (totalDocs === 0) return new Map()

    // 2. Fetch the high-frequency terms
    // First ? is totalDocs * STOPWORD_THRESHOLD, second is STOPWORD_MIN_DOCS.
    // NOTE(review): `>` makes the minimum exclusive (doc > 10, i.e. >= 11) —
    // confirm "at least" semantics aren't intended (would need `>=`).
    const rows = this.#db
      .query(
        `SELECT v.term, v.doc
        FROM docs_vocab v
        WHERE v.doc > ? AND v.doc > ?
        ORDER BY v.doc DESC
        LIMIT ?`
      )
      .all(totalDocs * STOPWORD_THRESHOLD, STOPWORD_MIN_DOCS, STOPWORD_LIMIT) as {
      term: string
      doc: number
    }[]

    return new Map(
      rows.map((r) => {
        // Calculate IDF (BM25 form: log((N - df + 0.5) / (df + 0.5)))
        const idf = Math.log((totalDocs - r.doc + 0.5) / (r.doc + 0.5))

        // For stop words, we usually want to clamp at 0.
        // If a word is in >50% of docs, the formula goes negative.
        return [r.term, Math.max(0, idf)]
      })
    )
  }

  /**
   * BM25-style IDF weight for each term, in input order. Terms absent from
   * the vocabulary get df = 0 (maximum weight). Unlike getStopWords, these
   * values are NOT clamped and may be negative for very common terms.
   */
  getWeights(terms: string[]): number[] {
    if (terms.length === 0) return []
    const total = (this.#db.query(`SELECT count(*) as n FROM docs`).get() as { n: number }).n
    const placeholders = terms.map(() => "?").join(",")
    const rows = this.#db
      .query(`SELECT term, doc FROM docs_vocab WHERE term IN (${placeholders})`)
      .all(...terms) as { term: string; doc: number }[]
    const df = new Map(rows.map((r) => [r.term, r.doc]))
    return terms.map((t) => Math.log((total - (df.get(t) ?? 0) + 0.5) / ((df.get(t) ?? 0) + 0.5)))
  }

  // --- Vector ---

  /** Insert embeddings into the vec table */
  insertEmbeddings(chunks: StoreChunk[]) {
    assertEmbeddings(chunks)
    if (chunks.length === 0) return
    // Table dims come from the first chunk; assertEmbeddings guarantees all
    // chunks have an embedding (dims consistency is not checked here).
    this.initVec(chunks[0].embedding.length)
    const stmt = this.#db.query(`INSERT INTO vec(doc_id, seq, path, embedding) VALUES (?, ?, ?, ?)`)
    for (const chunk of chunks) {
      stmt.run(chunk.doc_id, chunk.seq, chunk.doc.path, JSON.stringify(chunk.embedding))
    }
  }

  /** Delete all vec entries for a doc */
  deleteEmbeddings(docId: number) {
    if (this.vec.exists) this.#db.query(`DELETE FROM vec WHERE doc_id = ?`).run(docId)
  }

  /** Global KNN search, returns top results across all docs */
  searchVec(embedding: number[], opts?: DbSearchOptions): VecResult[] {
    if (!this.vec.exists) return []
    const limit = opts?.limit ?? SEARCH_LIMIT
    // NOTE(review): opts.scope is accepted but not applied here (unlike
    // searchFts) — confirm whether vector search should honor scope.
    return this.#db
      .query(
        `SELECT doc_id, seq, path, distance, (1 - distance/2) as score
        FROM vec
        WHERE embedding MATCH ?
        AND k = ?
        ORDER BY distance`
      )
      .all(JSON.stringify(embedding), limit) as VecResult[]
  }

  // --- Frecency ---

  /** Store the frecency decay deadline for a doc. */
  setDeadline(docId: number, deadline: number) {
    this.#db.query(`UPDATE docs SET deadline = ? WHERE id = ?`).run(deadline, docId)
  }

  // --- Meta ---

  /** Read a value from the key/value `meta` table (undefined if missing). */
  getMeta(key: string) {
    return (
      this.#db.query(`SELECT value FROM meta WHERE key = ?`).get(key) as
        | { value: string }
        | undefined
    )?.value
  }

  /** Upsert a value into the key/value `meta` table. */
  setMeta(key: string, value: string) {
    this.#db
      .query(`INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = ?`)
      .run(key, value, value)
  }

  // --- Cache ---

  /** Read a JSON-parsed cache entry; bumps accessed_at on hit (for pruning). */
  cacheGet<T>(key: string): T | undefined {
    const row = this.#db.query(`SELECT value FROM cache WHERE key = ?`).get(key) as
      | { value: string }
      | undefined
    if (!row) return
    this.#db
      .query(`UPDATE cache SET accessed_at = ? WHERE key = ?`)
      .run(new Date().toISOString(), key)
    return JSON.parse(row.value) as T
  }

  /** Upsert a JSON-serialized cache entry; returns the value for chaining. */
  cacheSet<T>(key: string, value: T): T {
    this.#db
      .query(
        `INSERT INTO cache (key, value, accessed_at) VALUES (?, ?, ?)
        ON CONFLICT(key) DO UPDATE SET value = excluded.value, accessed_at = excluded.accessed_at`
      )
      .run(key, JSON.stringify(value), new Date().toISOString())
    return value
  }

  /** Evict least-recently-accessed cache rows beyond `maxEntries`. */
  cachePrune(maxEntries = 10_000) {
    this.#db
      .query(
        `DELETE FROM cache WHERE key NOT IN (
          SELECT key FROM cache ORDER BY accessed_at DESC LIMIT ?
        )`
      )
      .run(maxEntries)
  }
}
|
package/src/doc.ts
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import type { Frontmatter, MarkdownDoc } from "./md.ts"
|
|
2
|
+
import type { VfsEntry } from "./vfs.ts"
|
|
3
|
+
|
|
4
|
+
import { readFile } from "node:fs/promises"
|
|
5
|
+
import { basename, join, resolve } from "pathe"
|
|
6
|
+
import { astat } from "./fs.ts"
|
|
7
|
+
import { parseMarkdown } from "./md.ts"
|
|
8
|
+
import { normUri } from "./uri.ts"
|
|
9
|
+
import { hash } from "./util.ts"
|
|
10
|
+
|
|
11
|
+
// Directory docs are represented by an index.md file inside the directory.
const INDEX = "index.md"
const MAX_DESC_LENGTH = 30 * 4 // roughly 30 tokens (~4 chars per token)

/** Frontmatter fields recognized on a doc, extending the base Frontmatter. */
export type DocFrontmatter = {
  description?: string
  tags?: string[]
  entities?: string[]
} & Frontmatter

// A markdown heading extracted from the parsed sections.
type DocHeading = {
  level: number
  text: string
}
|
|
24
|
+
|
|
25
|
+
/**
 * A markdown document loaded from disk — either a single `.md` file or a
 * directory represented by its `index.md`. Construct via the static
 * `Doc.load`; instances expose parsed frontmatter, headings, and derived
 * title/description.
 */
export class Doc {
  #isDir = false
  #name = ""
  // Content hash of the normalized text (empty until load()).
  hash = ""
  // File mtime, or epoch for a directory without an index.md.
  updated = new Date(0)
  headings: DocHeading[] = []
  parsed: MarkdownDoc = { body: "", bodyOffset: 0, frontmatter: {}, sections: [], text: "" }

  protected constructor(
    public uri: string,
    public path: string
  ) {
    this.path = resolve(path)
    this.#name = basename(this.path, ".md")
  }

  /** Full original markdown text, including frontmatter. */
  get text() {
    return this.parsed.text
  }

  /** Markdown body without frontmatter. */
  get body() {
    return this.parsed.body
  }

  /** Parsed frontmatter as an object. */
  get fm() {
    return this.parsed.frontmatter as DocFrontmatter
  }

  /** Name of the doc, derived from the file or folder name, without extension */
  get name(): string {
    return this.#name
  }

  // Actual (non-empty) description from frontmatter, or undefined
  get $description() {
    const ret = this.fm.description?.trim()
    return ret?.length ? ret : undefined
  }

  /**
   * Desc from frontmatter or, when absent, packed from headings: shallower
   * heading levels are taken first until MAX_DESC_LENGTH is reached (the
   * first accepted heading is always included regardless of length).
   */
  get description() {
    const desc = this.$description
    if (desc || this.headings.length === 0) return desc

    const headings: (DocHeading & { used?: boolean })[] = this.headings.map((h) => ({ ...h }))
    const minLevel = Math.min(...headings.map((h) => h.level))
    const maxLevel = Math.max(...headings.map((h) => h.level))

    // Pack by level until we reach maximum description length
    let chars = 0
    for (let level = minLevel; level <= maxLevel; level++) {
      for (const h of headings) {
        if (h.level !== level) continue
        if (chars !== 0 && chars + h.text.length > MAX_DESC_LENGTH) continue
        h.used = true
        chars += h.text.length
      }
    }

    // Note: output preserves document order, not the level-packing order.
    return headings
      .filter((h) => h.used)
      .map((h) => h.text)
      .join(", ")
      .trim()
  }

  /** Title from frontmatter or first heading */
  get $title(): string | undefined {
    const title = this.fm.title
    if (typeof title === "string" && title.trim().length > 0) return title.trim()
    return this.headings[0]?.text
  }

  /** `$title` if it already contains the name (case-insensitive), otherwise `name - $title`; falls back to `name` when there is no title */
  get title() {
    const title = this.$title
    if (!(title ?? "").length) return this.name
    return title?.toLowerCase().includes(this.name.toLowerCase())
      ? title
      : `${this.name} - ${title}`
  }

  /** Tags from frontmatter (empty list when absent). */
  get tags(): string[] {
    return this.fm.tags ?? []
  }

  /** Entities from frontmatter (empty list when absent). */
  get entities(): string[] {
    return this.fm.entities ?? []
  }

  /** True when this doc is a directory (content read from its index.md). */
  get isDir(): boolean {
    return this.#isDir
  }

  /**
   * Read and parse the doc from disk. Returns `undefined` when the path does
   * not exist; a directory without an index.md still loads (with empty text).
   * @throws when the path itself ends in `index.md` — callers must pass the
   *   directory instead.
   */
  protected async load(): Promise<Doc | undefined> {
    const name = basename(this.path)

    // Quick validation. It's up to the caller to ensure the path
    // is either a markdown file or a directory.
    if (name === INDEX) throw new Error(`Doc path cannot end with \`${INDEX}\`:\n\`${this.path}\``)

    let s = await astat(this.path)
    let mdPath = this.path

    if (s?.isDirectory()) {
      this.#isDir = true
      mdPath = join(this.path, INDEX)
      s = await astat(mdPath)
    }

    if (!s && !this.#isDir) return

    this.uri = normUri(this.uri, this.isDir)

    // read file and normalize line endings to LF
    const text = (s ? await readFile(mdPath, "utf8") : "").replace(/\r\n/g, "\n")

    this.updated = s?.mtime ?? new Date(0)
    this.hash = hash(text)
    this.parsed = parseMarkdown(text)
    // Keep only real, non-empty headings (level 0 sections are preamble).
    this.headings = this.parsed.sections
      .filter((section) => section.level > 0 && section.heading.trim().length > 0)
      .map((section) => ({
        level: section.level,
        text: section.heading,
      }))
    return this
  }

  /** Load a doc from a VfsEntry, or from a uri (+ optional path). */
  static async load(entry: string | VfsEntry): Promise<Doc | undefined>
  static async load(uri: string, path?: string): Promise<Doc | undefined>
  static async load(uri: string | VfsEntry, path?: string): Promise<Doc | undefined> {
    const e = typeof uri === "string" ? { path: path, uri } : uri
    return e.path ? await new Doc(e.uri, e.path).load() : undefined
  }
}
|