@rekal/mem 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/db-BMh1OP4b.mjs +294 -0
- package/dist/doc-DnYN4jAU.mjs +116 -0
- package/dist/embed-rUMZxqed.mjs +100 -0
- package/dist/fs-DMp26Byo.mjs +32 -0
- package/dist/glob.d.mts +27 -0
- package/dist/glob.mjs +132 -0
- package/dist/index.d.mts +1465 -0
- package/dist/index.mjs +351 -0
- package/dist/llama-CT3dc9Cn.mjs +75 -0
- package/dist/models-DFQSgBNr.mjs +77 -0
- package/dist/openai-j2_2GM4J.mjs +76 -0
- package/dist/progress-B1JdNapX.mjs +263 -0
- package/dist/query-VFSpErTB.mjs +125 -0
- package/dist/runtime.node-DlQPaGrV.mjs +35 -0
- package/dist/search-BllHWtZF.mjs +166 -0
- package/dist/store-DE7S35SS.mjs +137 -0
- package/dist/transformers-CJ3QA2PK.mjs +55 -0
- package/dist/uri-CehXVDGB.mjs +28 -0
- package/dist/util-DNyrmcA3.mjs +11 -0
- package/dist/vfs-CNQbkhsf.mjs +222 -0
- package/foo.ts +3 -0
- package/foo2.ts +20 -0
- package/package.json +61 -0
- package/src/context.ts +77 -0
- package/src/db.ts +464 -0
- package/src/doc.ts +163 -0
- package/src/embed/base.ts +122 -0
- package/src/embed/index.ts +67 -0
- package/src/embed/llama.ts +111 -0
- package/src/embed/models.ts +104 -0
- package/src/embed/openai.ts +95 -0
- package/src/embed/transformers.ts +81 -0
- package/src/frecency.ts +58 -0
- package/src/fs.ts +36 -0
- package/src/glob.ts +163 -0
- package/src/index.ts +15 -0
- package/src/log.ts +60 -0
- package/src/md.ts +204 -0
- package/src/progress.ts +121 -0
- package/src/query.ts +131 -0
- package/src/runtime.bun.ts +33 -0
- package/src/runtime.node.ts +47 -0
- package/src/search.ts +230 -0
- package/src/snippet.ts +248 -0
- package/src/sqlite.ts +1 -0
- package/src/store.ts +180 -0
- package/src/uri.ts +28 -0
- package/src/util.ts +21 -0
- package/src/vfs.ts +257 -0
- package/test/doc.test.ts +61 -0
- package/test/fixtures/ignore-test/keep.md +0 -0
- package/test/fixtures/ignore-test/skip.log +0 -0
- package/test/fixtures/ignore-test/sub/keep.md +0 -0
- package/test/fixtures/store/agent/index.md +9 -0
- package/test/fixtures/store/agent/lessons.md +21 -0
- package/test/fixtures/store/agent/soul.md +28 -0
- package/test/fixtures/store/agent/tools.md +25 -0
- package/test/fixtures/store/concepts/frecency.md +30 -0
- package/test/fixtures/store/concepts/index.md +9 -0
- package/test/fixtures/store/concepts/memory-coherence.md +33 -0
- package/test/fixtures/store/concepts/rag.md +27 -0
- package/test/fixtures/store/index.md +9 -0
- package/test/fixtures/store/projects/index.md +9 -0
- package/test/fixtures/store/projects/rekall-inc/architecture.md +41 -0
- package/test/fixtures/store/projects/rekall-inc/decisions/index.md +9 -0
- package/test/fixtures/store/projects/rekall-inc/decisions/no-military.md +20 -0
- package/test/fixtures/store/projects/rekall-inc/index.md +28 -0
- package/test/fixtures/store/user/family.md +13 -0
- package/test/fixtures/store/user/index.md +9 -0
- package/test/fixtures/store/user/preferences.md +29 -0
- package/test/fixtures/store/user/profile.md +29 -0
- package/test/fs.test.ts +15 -0
- package/test/glob.test.ts +190 -0
- package/test/md.test.ts +177 -0
- package/test/query.test.ts +105 -0
- package/test/uri.test.ts +46 -0
- package/test/util.test.ts +62 -0
- package/test/vfs.test.ts +164 -0
- package/tsconfig.json +3 -0
- package/tsdown.config.ts +8 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { Database } from "bun:sqlite"
|
|
2
|
+
|
|
3
|
+
// Set on the first openDatabase() call so the custom-SQLite probe runs at most once per process.
let didInit = false

/**
 * Open a SQLite database through Bun's built-in driver and load the
 * sqlite-vec extension into the new connection.
 *
 * On the first call on macOS, the Homebrew SQLite library is registered via
 * `Database.setCustomSQLite` before any connection is created. The first
 * candidate path that does not throw wins; if none work the call proceeds
 * with whatever SQLite Bun uses by default.
 * See: https://bun.com/docs/runtime/sqlite#setcustomsqlite
 *
 * @param path Database location, passed straight to `new Database(...)`.
 * @returns The opened database (strict mode) with sqlite-vec loaded.
 */
export async function openDatabase(path: string) {
  const [{ load: sqliteVec }, { Database }] = await Promise.all([
    import("sqlite-vec"),
    import("bun:sqlite"),
  ])

  const firstCall = !didInit
  didInit = true

  if (firstCall && process.platform === "darwin") {
    const candidates = [
      "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib", // Apple Silicon
      "/usr/local/opt/sqlite/lib/libsqlite3.dylib", // Intel
    ]
    for (const candidate of candidates) {
      try {
        Database.setCustomSQLite(candidate)
        break
      } catch {
        // Best effort: this candidate is unusable, try the next one.
      }
    }
  }

  const db = new Database(path, { strict: true })
  sqliteVec(db)
  return db
}
|
|
28
|
+
|
|
29
|
+
export type { Database }
|
|
30
|
+
|
|
31
|
+
/**
 * Parse a YAML document with Bun's built-in YAML parser.
 *
 * @param content YAML source text.
 * @returns Whatever `Bun.YAML.parse` produces; callers must narrow it.
 */
export function parseYaml(content: string): unknown {
  const parsed: unknown = Bun.YAML.parse(content)
  return parsed
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import type { Database as BetterDB } from "better-sqlite3"
|
|
2
|
+
import type { Database } from "bun:sqlite"
|
|
3
|
+
|
|
4
|
+
// Lazily created database constructor; filled in by openDatabase() on first use.
let DB: undefined | typeof Database

/**
 * Build a better-sqlite3 subclass that offers the `run` / `query` methods
 * of Bun's `Database`, so the rest of the package can use one API on both
 * runtimes.
 *
 * @returns The subclass, typed as Bun's `Database` constructor.
 */
async function dbInit() {
  const betterSqlite = await import("better-sqlite3")
  const BetterDatabase = betterSqlite.default

  // Extend better-sqlite3 to mimic Bun's Database API
  return class extends BetterDatabase {
    // Prepared statements keyed by their SQL text; entries are never evicted.
    private prepareCache = new Map<string, ReturnType<BetterDB["prepare"]>>()

    // Only the filename is forwarded: any extra constructor arguments
    // (such as the options object openDatabase passes) are dropped here.
    // oxlint-disable-next-line no-useless-constructor
    constructor(filename?: string) {
      super(filename)
    }

    /** Bun-style `run`, implemented by delegating to better-sqlite3's `exec`. */
    run(...args: Parameters<BetterDB["exec"]>) {
      return this.exec(...args)
    }

    /** Bun-style `query`: return a prepared statement, reusing one per SQL string. */
    query(source: string) {
      const cached = this.prepareCache.get(source)
      if (cached) return cached

      const statement = this.prepare(source)
      this.prepareCache.set(source, statement)
      return statement
    }
  } as unknown as typeof Database
}
|
|
32
|
+
|
|
33
|
+
/**
 * Open a SQLite database through the better-sqlite3 compatibility class and
 * load the sqlite-vec extension into the new connection.
 *
 * The compatibility class is built once and cached in `DB`.
 *
 * @param path Database location, passed to the database constructor.
 * @returns The opened database with sqlite-vec loaded.
 */
export async function openDatabase(path: string) {
  const Ctor = (DB ??= await dbInit())
  const sqliteVecModule = await import("sqlite-vec")
  const loadSqliteVec = sqliteVecModule.load

  const db = new Ctor(path, { strict: true })
  loadSqliteVec(db)
  return db
}
|
|
40
|
+
|
|
41
|
+
export type { Database }
|
|
42
|
+
|
|
43
|
+
// js-yaml is imported once at module evaluation time (top-level await).
const { load: loadYaml } = await import("js-yaml")

/**
 * Parse a YAML document with js-yaml's `load`.
 *
 * @param content YAML source text.
 * @returns Whatever js-yaml produces; callers must narrow it.
 */
export function parseYaml(content: string): unknown {
  const parsed: unknown = loadYaml(content)
  return parsed
}
|
package/src/search.ts
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import type { Context } from "./context.ts"
|
|
2
|
+
import type { Db, DocRow, FTSResult, VecResult } from "./db.ts"
|
|
3
|
+
import type { VfsEntry } from "./vfs.ts"
|
|
4
|
+
|
|
5
|
+
import { toFts } from "./query.ts"
|
|
6
|
+
import { parentUri } from "./uri.ts"
|
|
7
|
+
import { hash } from "./util.ts"
|
|
8
|
+
|
|
9
|
+
/** Retrieval strategy: full-text only, vector only, or both fused together. */
export type SearchMode = "hybrid" | "vec" | "fts"

/** Score of one result under one search mode. */
export type SearchScore = {
  /** Value the results are sorted by (descending). */
  score: number
  /** Secondary value used to break ties on `score`; filled in for hybrid results. */
  display_score?: number
  /** 1-based position after sorting; 0 until `Search.rank` has run. */
  rank: number
}

/** One matched document together with its per-mode scores and raw matches. */
export type SearchResult = {
  uri: string
  path: string
  doc: DocRow
  /** Scores keyed by the mode that produced them. */
  scores: Partial<Record<SearchMode, SearchScore>>
  /** Raw database hits that produced this result. */
  match: { fts?: FTSResult; vec?: VecResult }
} & VfsEntry

// Maps each mode to the result shape it guarantees.
type SearchResultMap = {
  hybrid: HybridSR
  vec: VecSR
  fts: FtsSR
}

/** Result that is guaranteed to carry a hybrid score. */
export type HybridSR = SearchResult & { scores: { hybrid: SearchScore } }
/** Result that is guaranteed to carry a vector score and the vector hit. */
export type VecSR = SearchResult & { scores: { vec: SearchScore }; match: { vec: VecResult } }
/** Result that is guaranteed to carry a full-text score and the full-text hit. */
export type FtsSR = SearchResult & { scores: { fts: SearchScore }; match: { fts: FTSResult } }

/** Options accepted by `Search.search`. */
export type SearchOptions = {
  /** Maximum number of results (the search methods default to 20). */
  limit?: number
  /** URI used to scope the search. */
  uri?: string
  /** Search mode (defaults to "hybrid"). */
  mode?: SearchMode
}

/** Options accepted by `Search.searchFts`. */
export type FtsSearchOptions = Omit<SearchOptions, "mode"> & {
  /** Operator handed to the query builder (defaults to "OR"). */
  op?: "AND" | "OR"
}
|
|
44
|
+
|
|
45
|
+
// Description chunks (seq=0) get boosted in vector scoring
const DESC_BOOST = 0.2
// how much to boost a chunk based on its parent's score, vs its own score, in [0, 1]
const PARENT_BOOST = 0.3
// Rank offset in the reciprocal-rank term 1 / (RRF_K + rank)
const RRF_K = 60
// Minimum number of candidates requested from each sub-search before fusing
const RRF_LIMIT = 50
// Multiplier on the vector query limit, leaving headroom for scope filtering
const VEC_OVERSAMPLE = 4
|
|
51
|
+
|
|
52
|
+
/**
 * Document search over the store: full-text, vector, or both fused with
 * Reciprocal Rank Fusion.
 */
export class Search {
  private constructor(
    public db: Db,
    public ctx: Context
  ) {}

  /** Create a Search bound to the context's database. */
  static async load(ctx: Context) {
    const db = await ctx.db()
    return new Search(db, ctx)
  }

  /**
   * Run a search in the requested mode (default "hybrid").
   *
   * @param query Free-text query.
   * @param opts Limit (default 20), optional scope URI and mode.
   * @returns Ranked results, best first.
   */
  async search(query: string, opts: SearchOptions = {}): Promise<SearchResult[]> {
    switch (opts.mode ?? "hybrid") {
      case "fts":
        return this.searchFts(query, opts)
      case "vec":
        return this.searchVec(query, opts)
    }

    const limit = opts.limit ?? 20

    // Hybrid: run both, fuse with RRF — need enough candidates for good fusion
    const subLimit = Math.max(RRF_LIMIT, limit * 2)
    const [fts, vec] = await Promise.all([
      this.searchFts(query, { ...opts, limit: subLimit }),
      this.searchVec(query, { ...opts, limit: subLimit, slice: false }),
    ])

    return this.fuse(fts, vec, limit)
  }

  /**
   * Vector search: embed the query, keep the best chunk per document, then
   * boost each document by the scores of its ancestors.
   *
   * @param query Free-text query; its embedding is cached in the database.
   * @param opts Limit (default 20) and scope URI. Pass `slice: false` to
   *   get every candidate back instead of only the top `limit`.
   * @returns Ranked vector results, best first.
   */
  async searchVec(
    query: string,
    opts: Omit<SearchOptions, "mode"> & { slice?: boolean } = {}
  ): Promise<VecSR[]> {
    const cacheKey = hash(`embed:${query}`)
    const embedder = await this.ctx.embedder()
    const vfs = await this.ctx.vfs()

    // Reuse a cached embedding when there is one; otherwise compute and store it.
    const embedding =
      this.db.cacheGet<number[]>(cacheKey) ??
      this.db.cacheSet(cacheKey, await embedder.embed(query))

    const scope = vfs.getScope(opts.uri)
    const limit = opts.limit ?? 20

    // Oversample for post-filtering when scoped
    const hits = this.db.searchVec(embedding, {
      limit: Math.max(limit, RRF_LIMIT) * VEC_OVERSAMPLE,
    })

    // Group by doc, take best chunk per doc
    const best = new Map<number, VecResult & { uri: string; hiscore: number }>()
    for (const hit of hits) {
      const uri = scope.map(hit.path)
      if (!uri) continue // outside the requested scope

      // Description chunk: move the score DESC_BOOST of the way towards 1.
      if (hit.seq === 0) hit.score += DESC_BOOST * (1 - hit.score)

      const currentBest = best.get(hit.doc_id)?.score ?? -Infinity
      if (hit.score > currentBest) best.set(hit.doc_id, Object.assign(hit, { hiscore: 0, uri }))
    }

    // Snapshot of per-URI scores taken before any parent boost is applied.
    const scores = new Map<string, number>()
    for (const hit of best.values()) scores.set(hit.uri, hit.score)

    // Ancestor score: half the parent's own score plus half of the parent's
    // ancestor score, memoized per parent URI.
    const parentScores = new Map<string, number>()
    const getParentScore = (uri: string): number => {
      const parent = parentUri(uri)
      if (!parent) return 0

      const memo = parentScores.get(parent)
      if (memo !== undefined) return memo

      const computed = (scores.get(parent) ?? 0) * 0.5 + getParentScore(parent) * 0.5
      parentScores.set(parent, computed)
      return computed
    }

    for (const hit of best.values()) {
      const parentScore = getParentScore(hit.uri)
      hit.score += PARENT_BOOST * parentScore * (1 - hit.score)
    }

    const ordered = [...best.values()].toSorted((a, b) => b.score - a.score)
    const kept = opts.slice === false ? ordered : ordered.slice(0, limit)

    const docs = this.db.getDocs(kept.map((hit) => hit.doc_id))
    const out: VecSR[] = []
    for (const vec of kept) {
      const doc = docs.get(vec.doc_id)
      if (!doc) continue
      out.push({
        doc,
        match: { vec },
        path: vec.path,
        scores: { vec: { rank: 0, score: vec.score } },
        uri: vec.uri,
      })
    }

    return this.rank("vec", out)
  }

  /**
   * Full-text search restricted to the paths of the requested scope.
   *
   * @param query Free-text query, converted with `toFts`.
   * @param opts Limit (default 20), scope URI and operator (default "OR").
   * @returns Ranked full-text results, best first.
   */
  async searchFts(query: string, opts: FtsSearchOptions = {}): Promise<FtsSR[]> {
    const vfs = await this.ctx.vfs()
    const scope = vfs.getScope(opts.uri)
    const hits = this.db.searchFts(toFts(query, opts.op ?? "OR"), {
      limit: opts.limit ?? 20,
      scope: scope.paths.map((p) => p.path),
    })
    const docs = this.db.getDocs(hits.map((hit) => hit.rowid))

    const out: FtsSR[] = []
    for (const fts of hits) {
      // Squash the magnitude of the raw score into [0, 1) with x / (1 + x).
      const magnitude = Math.abs(fts.score)
      fts.score = magnitude / (1 + magnitude)

      const doc = docs.get(fts.rowid)
      const uri = scope.map(doc?.path ?? "")
      if (!doc || !uri) continue
      out.push({
        doc,
        match: { fts },
        path: doc.path,
        scores: { fts: { rank: 0, score: fts.score } },
        uri,
      })
    }
    return this.rank("fts", out)
  }

  /**
   * Sort results by their score for `mode` (descending, ties broken by
   * `display_score`) and write the 1-based rank into each result.
   *
   * @returns A new sorted array; the input array order is left untouched.
   */
  rank<M extends SearchMode>(mode: M, results: SearchResultMap[M][]): SearchResultMap[M][] {
    const scoreOf = (r: SearchResult) => (r.scores as Record<string, SearchScore>)[mode]

    const sorted = results.toSorted(
      (a, b) =>
        scoreOf(b).score - scoreOf(a).score ||
        (scoreOf(b).display_score ?? 0) - (scoreOf(a).display_score ?? 0)
    )
    sorted.forEach((r, i) => {
      scoreOf(r).rank = i + 1
    })
    return sorted
  }

  /** Reciprocal Rank Fusion: merge FTS and vector results */
  private fuse(ftsResults: FtsSR[], vecResults: VecSR[], limit: number): HybridSR[] {
    // Lowest score seen in either list; stands in for the side a document is
    // missing from when computing display_score.
    const lastVec = vecResults[vecResults.length - 1]
    const lastFts = ftsResults[ftsResults.length - 1]
    const minScore = Math.min(lastVec?.scores.vec.score ?? 1, lastFts?.scores.fts.score ?? 1)

    // Join both lists on document id.
    const merged = new Map<number, { uri: string; vec?: VecSR; fts?: FtsSR }>()
    for (const fts of ftsResults) merged.set(fts.doc.id, { fts, uri: fts.uri })
    for (const vec of vecResults)
      merged.set(vec.doc.id, { ...merged.get(vec.doc.id), uri: vec.uri, vec })

    // Contribution of one ranked list: 1 / (RRF_K + rank), or 0 when absent.
    const reciprocal = (s?: SearchScore) => (s?.rank !== undefined ? 1 / (RRF_K + s.rank) : 0)

    const fused: HybridSR[] = []
    for (const { uri, fts, vec } of merged.values()) {
      const ftsScore = fts?.scores.fts
      const vecScore = vec?.scores.vec
      const score = reciprocal(ftsScore) + reciprocal(vecScore)
      const display_score =
        0.6 * (vecScore?.score ?? minScore) + 0.4 * (ftsScore?.score ?? minScore)
      const doc = (fts?.doc ?? vec?.doc)!
      fused.push({
        doc,
        match: { ...fts?.match, ...vec?.match },
        path: doc.path,
        scores: {
          ...fts?.scores,
          ...vec?.scores,
          hybrid: { display_score, rank: 0, score },
        },
        uri,
      })
    }

    const top = this.rank("hybrid", fused).slice(0, limit)

    // Normalize scores to [0, 1]
    const bestScore = top[0]?.scores.hybrid.score ?? 1
    for (const r of top) r.scores.hybrid.score /= bestScore

    return top
  }
}
|
package/src/snippet.ts
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
// oxfmt-ignore
/** A word taken from text: the spelling as found plus its lower-cased form. */
export type Token = {
  text: string
  lower: string
}

/** A token together with the weight/score attached to it. */
export type TokenWithScore = Token & { score: number }

/** Options for building a `Snippet`. */
export type SnippetOptions = {
  /** Search query whose terms drive the scoring. */
  query: string
  /** Number of lines in the extracted snippet (default 5, minimum 1). */
  lines?: number
  /**
   * Extra stop words with a weight each. Weights are clamped to
   * [STOPWORD_MIN, STOPWORD_MAX]; a weight of 0 removes the term from the query.
   */
  stopWords?: Map<string, number>
}

/** One candidate window of consecutive lines. */
export type SnippetWindow = {
  /** Index of the window's first line. */
  start: number
  /** Sum of the heat of the lines in the window. */
  heat: number
  /** Fraction of distinct query terms found in the window. */
  coverage: number
  /** `heat * coverage`; the window with the highest score wins. */
  score: number
}

/** Everything `Snippet.extract` computed for a text. */
export type SnippetResult = {
  lines: string[]
  tokens: Token[][]
  scores: number[]
  windows: SnippetWindow[]
  best: SnippetWindow
  heat: number[]
  snippet: string[]
}

export const WORD_REGEX = /[\p{L}\p{N}]+/gu // regex to match words (unicode letters and numbers)
const LETTER_REGEX = /\p{L}/u // hoisted: tells text lines apart from separator-only lines

const SCORE_EXACT = 3
const SCORE_LOWER = 2
const SCORE_QUERY_PREFIX = 1.5 // query "child" matches doc "children" — query is prefix of doc
const SCORE_DOC_PREFIX = 1 // query "children" matches doc "child" — doc is prefix of query
const SCORE_OVERLAP = 1
const MIN_PREFIX_LENGTH = 3 // minimum prefix length to consider for scoring
const STOPWORD_MIN = 0 // min stopword score for a term
const STOPWORD_MAX = 1 // max stopword score for a term
const TERM_WEIGHT = 2 // weight of a query term that is not a stop word
const DEFAULT_LINES = 5 // snippet length when `lines` is not supplied
const HEAT_SPREAD = 5 // minimum number of lines on each side of a match that receive heat (widened to ceil(lines / 2) for larger snippets)
const WEIGHT_EMPTY = 0.1 // score multiplier for empty lines
const WEIGHT_NONWORD = 0.3 // score multiplier for lines without any word characters
const WEIGHT_REPETITION = 0.7 // score multiplier for repeated terms in the same line (to de-emphasize boilerplate)
const decayLinear = (d: number, r: number) => Math.max(0, 1 - d / r) // linear decay function
// const decayHyperbolic = (d: number, _r: number) => 1 / (1 + d) // hyperbolic decay function
// const decayExponential = (d: number, r: number) => Math.exp(-d / r) // exponential decay function

// Common English stop words — used for post-processing snippets.
// Kept minimal: only the highest-frequency words that add no search value.
// oxfmt-ignore
const STOP_WORDS = new Set([
  "a", "an", "and", "are", "as", "at", "be", "but", "by", "do", "for", "from",
  "had", "has", "have", "he", "her", "his", "how", "i", "if", "in", "is", "it",
  "its", "my", "no", "not", "of", "on", "or", "our", "she", "so", "than",
  "that", "the", "their", "them", "then", "there", "these", "they", "this",
  "to", "up", "us", "was", "we", "what", "when", "which", "who", "will",
  "with", "you", "your",
])

/** True when `word` (compared case-insensitively) is a built-in stop word. */
export function isStopWord(word: string): boolean {
  return STOP_WORDS.has(word.toLowerCase())
}

/**
 * Picks the most relevant window of lines from a text for a given query.
 *
 * Each line is scored by how well its words match the query terms, the
 * scores are spread to neighbouring lines as "heat", and the window with
 * the highest heat × term coverage is returned.
 */
export class Snippet {
  /** Query terms that survived stop-word filtering, with their weights. */
  query: TokenWithScore[] = []
  /** First two characters of every query term; used as a cheap line pre-filter. */
  prefixes = new Set<string>()
  /** Matches text containing any of `prefixes`; never matches when the query is empty. */
  prefixRegex: RegExp
  /** Options with defaults applied. */
  opts: Required<SnippetOptions>

  /**
   * @param opts Query plus optional snippet length and custom stop words.
   *   Caller-supplied values take precedence over the defaults.
   */
  constructor(opts: SnippetOptions) {
    // Defaults apply only to missing options; a supplied value must never be
    // overwritten. The length is floored and kept >= 1 so window arithmetic
    // stays on whole lines.
    const requested = opts.lines ?? DEFAULT_LINES
    const lines = Number.isNaN(requested) ? DEFAULT_LINES : Math.max(1, Math.floor(requested))
    this.opts = { ...opts, lines, stopWords: opts.stopWords ?? new Map<string, number>() }

    // Built-in stop words weigh 0; custom entries override them. Keys are
    // lower-cased because lookups use the lower-cased token.
    const stopwords = new Map<string, number>(
      [...STOP_WORDS].map((w): [string, number] => [w, 0])
    )
    for (const [word, weight] of this.opts.stopWords) {
      stopwords.set(word.toLowerCase(), Math.max(STOPWORD_MIN, Math.min(STOPWORD_MAX, weight)))
    }

    for (const tok of this.tokenize(opts.query, false)) {
      const score = stopwords.get(tok.lower) ?? TERM_WEIGHT
      if (score === 0) continue
      this.prefixes.add(tok.lower.slice(0, 2))
      this.query.push({ ...tok, score })
    }

    // Prefixes contain only letters/digits (WORD_REGEX), so no escaping is needed.
    this.prefixRegex =
      this.prefixes.size > 0 ? new RegExp(`(${[...this.prefixes].join("|")})`, "i") : /(?!)/ // never matches
  }

  /** Strip diacritics and apostrophes so "École" / "rock's" compare like "Ecole" / "rock". */
  normalize(text: string): string {
    // Handle diacritics
    text = text.normalize("NFD").replace(/\p{M}/gu, "")
    // Drop possessive 's (and smart quote ’s) completely to avoid orphaned "s" tokens
    text = text.replace(/['’]s\b/gi, "")
    // Globally replace any remaining single quotes/apostrophes with a space
    text = text.replace(/['’]/g, " ")
    return text
  }

  /**
   * Split text into word tokens.
   *
   * @param text Text to tokenize.
   * @param queryOnly When true (default), return `[]` right away for text
   *   that cannot contain a query term (no query prefix present).
   */
  tokenize(text: string, queryOnly = true): Token[] {
    // Pre-filter on the normalized text: the prefixes were taken from
    // normalized query tokens, so raw accented text would be wrongly dropped.
    const normalized = this.normalize(text)
    if (queryOnly && !this.prefixRegex.test(normalized)) return []
    // only keep unicode letters and numbers
    const words = normalized.match(WORD_REGEX) ?? []
    return words.map((word) => ({ lower: word.toLowerCase(), text: word }))
  }

  /**
   * Score how well a document token matches a query token.
   *
   * @returns SCORE_EXACT, SCORE_LOWER, SCORE_QUERY_PREFIX or SCORE_DOC_PREFIX
   *   for the respective match kinds; otherwise a value proportional to the
   *   shared prefix (when at least MIN_PREFIX_LENGTH long), else 0.
   */
  score(token: Token, queryToken: Token): number {
    if (token.text === queryToken.text) return SCORE_EXACT

    const doc = token.lower
    const query = queryToken.lower
    if (doc === query) return SCORE_LOWER
    if (doc.startsWith(query)) return SCORE_QUERY_PREFIX
    if (query.startsWith(doc) && doc.length >= MIN_PREFIX_LENGTH) return SCORE_DOC_PREFIX

    // Length of the common prefix.
    let prefix = 0
    while (prefix < doc.length && doc[prefix] === query[prefix]) prefix++

    return prefix >= MIN_PREFIX_LENGTH
      ? SCORE_OVERLAP * (prefix / Math.max(doc.length, query.length))
      : 0
  }

  /**
   * Find the query term that best matches a word.
   *
   * @param input A token, or a raw word (normalized and lower-cased here).
   * @returns The best query term with `score` = term weight × match score,
   *   or `undefined` when nothing matches.
   */
  match(input: Token | string): TokenWithScore | undefined {
    const token: Token =
      typeof input === "string"
        ? { lower: this.normalize(input).toLowerCase(), text: input }
        : input

    let bestScore = 0
    let bestTok: TokenWithScore | undefined
    for (const queryTok of this.query) {
      const s = this.score(token, queryTok)
      // Strict ">" keeps the first query term on ties.
      if (s > bestScore) {
        bestScore = s
        bestTok = queryTok
      }
    }
    return bestTok ? { ...bestTok, score: bestTok.score * bestScore } : undefined
  }

  // get initial scores/coverage for each line
  scores(tokens: Token[][]) {
    const coverage: Set<string>[] = tokens.map(() => new Set<string>())
    const scores = tokens.map((line, l) => {
      const seen = coverage[l]
      let lineScore = 0
      for (const token of line) {
        const queryTok = this.match(token)
        if (!queryTok) continue
        // A term seen again on the same line counts less.
        const repeated = seen.has(queryTok.lower)
        seen.add(queryTok.lower)
        lineScore += repeated ? queryTok.score * WEIGHT_REPETITION : queryTok.score
      }
      return lineScore
    })
    return { coverage, scores }
  }

  // Build heatmap using a bounded spread (O(N * radius))
  heat(lines: string[], scores: number[]): number[] {
    const spread = Math.max(HEAT_SPREAD, Math.ceil(this.opts.lines / 2))
    const heat = new Float64Array(scores.length)
    for (let i = 0; i < scores.length; i++) {
      if (scores[i] === 0) continue
      const spreadStart = Math.max(0, i - spread)
      const spreadEnd = Math.min(scores.length - 1, i + spread)
      for (let j = spreadStart; j <= spreadEnd; j++) {
        const line = lines[j]
        let weight = 1

        // NOTE: de-emphasize lines without any word characters (e.g. code blocks, separators)
        if (!line.trim()) weight *= WEIGHT_EMPTY
        else if (!LETTER_REGEX.test(line)) weight *= WEIGHT_NONWORD

        weight *= decayLinear(Math.abs(i - j), spread)

        heat[j] += scores[i] * weight
      }
    }
    return [...heat]
  }

  /**
   * Score every line of `text` and pick the best window of `opts.lines` lines.
   *
   * @returns The chosen snippet plus all intermediate data (tokens, per-line
   *   scores and heat, the windows that were evaluated).
   */
  extract(text: string): SnippetResult {
    const lines = text.split("\n")
    const size = Math.min(this.opts.lines, lines.length)
    const tokens = lines.map((line) => this.tokenize(line))

    const { scores, coverage } = this.scores(tokens)
    const heat = this.heat(lines, scores)

    // Coverage is measured against distinct terms, so a repeated query word
    // ("leak leak") can still reach 100%.
    const termCount = new Set(this.query.map((q) => q.lower)).size

    // Find the window with highest heat × term coverage
    const windows: SnippetWindow[] = []

    for (let i = 0; i <= scores.length - size; i++) {
      if (heat[i] === 0 && windows.length > 0) continue // skip windows that don't start with any heat to save computation
      let heatSum = 0
      // Count how many distinct query terms appear in this window
      const covered = new Set<string>()
      for (let j = i; j < i + size; j++) {
        heatSum += heat[j]
        for (const term of coverage[j]) covered.add(term)
      }
      const cov = termCount === 0 ? 1 : covered.size / termCount
      windows.push({ coverage: cov, heat: heatSum, score: heatSum * cov, start: i })
    }

    let best = windows[0] ?? { coverage: 0, heat: 0, score: 0, start: 0 }
    for (const candidate of windows) {
      if (candidate.score > best.score) best = candidate
    }

    const snippet = lines.slice(best.start, best.start + size)
    return { best, heat, lines, scores, snippet, tokens, windows }
  }

  /** Print per-line scores, heat and window data for a result (development aid). */
  debug(result: SnippetResult) {
    // oxlint-disable-next-line no-console
    console.info("Options:", this.opts)
    // oxlint-disable-next-line no-console
    console.info("Query :", this.query)
    // oxlint-disable-next-line unicorn/consistent-function-scoping
    const score = (n?: number, f = 1) => (n !== undefined ? n.toFixed(f).padEnd(4) : " ".repeat(4))

    // extract() skips windows without heat, so `windows` is not indexed by
    // line number; look each window up by the line it starts on.
    const windowAt = new Map<number, SnippetWindow>()
    for (const w of result.windows) windowAt.set(w.start, w)

    const bestEnd = result.best.start + result.snippet.length
    result.lines.forEach((line, i) => {
      const isBest = i >= result.best.start && i < bestEnd
      const win = windowAt.get(i)

      const lineScore = score(result.scores[i])
      const lineHeat = score(result.heat[i])
      const windowHeat = score(win?.heat ?? 0)
      const windowScore = score(win?.score ?? 0)
      const coverage = ((win?.coverage ?? 0) * 100).toFixed(0).padStart(3)

      // oxlint-disable-next-line no-console
      console.log(
        `s:${lineScore} h:${lineHeat} wh:${windowHeat} ws:${windowScore} ${coverage}% ${isBest ? ">" : " "} ${line}`
      )
    })
  }

  /**
   * Wrap every word of `text` that matches a query term using `hl`.
   *
   * @param hl Receives the word and its offset in `text`; returns the replacement.
   */
  highlight(text: string, hl: (word: string, offset: number) => string): string {
    return text.replace(WORD_REGEX, (word, offset) => (this.match(word) ? hl(word, offset) : word))
  }
}
|
package/src/sqlite.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { openDatabase, type Database } from "#runtime"
|