@sorane/search 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/search.mjs +239 -0
- package/package.json +40 -0
- package/src/build-index.ts +115 -0
- package/src/chunker.ts +174 -0
- package/src/derive-web-index.ts +59 -0
- package/src/disclosure-map.ts +42 -0
- package/src/embeddings.ts +79 -0
- package/src/emit-search-assets.ts +85 -0
- package/src/heading-slug.ts +24 -0
- package/src/incremental.ts +40 -0
- package/src/index.ts +60 -0
- package/src/search.ts +131 -0
- package/src/store.ts +302 -0
- package/src/vendor-web.ts +64 -0
- package/src/walk.ts +15 -0
- package/src/web-export.ts +160 -0
package/src/store.ts
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import * as sqliteVec from "sqlite-vec";
|
|
3
|
+
import { mkdirSync, rmSync, existsSync } from "node:fs";
|
|
4
|
+
import { dirname } from "node:path";
|
|
5
|
+
import type { Chunk } from "./chunker.ts";
|
|
6
|
+
|
|
7
|
+
/** Version number of the on-disk index layout; `IndexStore.setMeta` records it under the `schema_version` key. */
export const SCHEMA_VERSION = 2;
|
|
8
|
+
|
|
9
|
+
/**
 * Description of the embedding model an index was built with.
 * `IndexStore.setMeta` persists each field as a string row in `index_meta`.
 */
export interface IndexMeta {
  /** Model identifier (stored under `model_id`). */
  readonly modelId: string;
  /** Embedding dimensionality (stored under `dim`). */
  readonly dim: number;
  /** Quantisation label (stored under `quant`). */
  readonly quant: string;
  /** Hash of the model (stored under `model_sha256`). */
  readonly modelSha256: string;
}
|
|
15
|
+
|
|
16
|
+
/**
 * DDL executed once when a database has no `chunks` table yet.
 *
 * - `chunks`: one row per text chunk plus its metadata columns.
 * - `chunks_fts`: external-content FTS5 table over `chunks.text` (trigram tokenizer).
 * - `chunks_ai` / `chunks_ad` / `chunks_au`: triggers that mirror inserts, deletes
 *   and updates on `chunks` into `chunks_fts`.
 * - `index_meta`: key/value table for build metadata.
 */
const SCHEMA = `
CREATE TABLE chunks (
id INTEGER PRIMARY KEY,
source TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
text TEXT NOT NULL,
heading_path TEXT,
heading_slug TEXT,
doc_type TEXT,
title TEXT,
timestamp TEXT,
tags TEXT
);
CREATE INDEX idx_chunks_source ON chunks(source);
CREATE INDEX idx_chunks_type ON chunks(doc_type);

CREATE VIRTUAL TABLE chunks_fts USING fts5(
text,
content='chunks',
content_rowid='id',
tokenize='trigram'
);

CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);
END;
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.text);
END;
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.text);
INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);
END;

CREATE TABLE index_meta (
key TEXT PRIMARY KEY,
value TEXT
);
`;
|
|
55
|
+
|
|
56
|
+
/**
 * A row of the `chunks` table as returned by the store's queries, with
 * snake_case columns aliased to camelCase.
 *
 * NOTE(review): every metadata field is typed `string`, but the schema declares
 * `heading_path`, `heading_slug`, `doc_type`, `title`, `timestamp` and `tags`
 * without NOT NULL — confirm writers always supply a value.
 */
export interface ChunkRow {
  /** Primary key of the chunk row. */
  readonly id: number;
  /** Source document the chunk belongs to. */
  readonly source: string;
  /** Value of the `chunk_index` column. */
  readonly chunkIndex: number;
  /** Full chunk text (the column indexed by FTS). */
  readonly text: string;
  /** Value of the `heading_path` column. */
  readonly headingPath: string;
  /** Value of the `heading_slug` column. */
  readonly headingSlug: string;
  /** Value of the `doc_type` column. */
  readonly docType: string;
  /** Value of the `title` column. */
  readonly title: string;
  /** Value of the `timestamp` column. */
  readonly timestamp: string;
  /** Value of the `tags` column; the tag filter treats it as a comma-separated list. */
  readonly tags: string;
}
|
|
68
|
+
|
|
69
|
+
/** Optional metadata restrictions applied to vector and full-text queries. */
export interface MetaFilter {
  /** Keep only chunks whose `doc_type` equals this value. */
  readonly docType?: string;
  /** Keep only chunks whose comma-separated `tags` column contains this tag. */
  readonly tag?: string;
}
|
|
73
|
+
|
|
74
|
+
/** A chunk returned by the vector search, together with its distance to the query vector. */
export interface VecHit extends ChunkRow {
  /** The `distance` column reported by the `vec_chunks` query. */
  readonly distance: number;
}
|
|
77
|
+
|
|
78
|
+
/** A chunk returned by the full-text search, together with its ranking score. */
export interface FtsHit extends ChunkRow {
  /** Result of `bm25(chunks_fts)` for this row; results are ordered ascending by it. */
  readonly bm25: number;
}
|
|
81
|
+
|
|
82
|
+
/** Row counts of the three index tables, as reported by `IndexStore.counts()`. */
export interface Counts {
  /** Rows in `chunks`. */
  readonly chunks: number;
  /** Rows in `chunks_fts`. */
  readonly fts: number;
  /** Rows in `vec_chunks` (0 when that table does not exist). */
  readonly vec: number;
}
|
|
87
|
+
|
|
88
|
+
/**
 * SELECT list shared by every query that returns chunk rows. It expects the
 * `chunks` table to be aliased as `c` and renames snake_case columns to the
 * camelCase property names of `ChunkRow`.
 */
const CHUNK_COLS = `c.id, c.source, c.chunk_index AS chunkIndex, c.text,
c.heading_path AS headingPath, c.heading_slug AS headingSlug,
c.doc_type AS docType, c.title, c.timestamp, c.tags`;
|
|
91
|
+
|
|
92
|
+
/**
 * Translates a metadata filter into a SQL condition over the `chunks` table
 * (aliased `c`) plus the values to bind to its `?` placeholders.
 *
 * @param filter Optional `docType` / `tag` restrictions; empty strings are ignored.
 * @returns `clause` — conditions joined with `AND`, or `""` when nothing is
 *   filtered (no leading `WHERE`/`AND`); `binds` — parameters in placeholder order.
 */
function buildWhere(filter: MetaFilter): { clause: string; binds: string[] } {
  const conds: string[] = [];
  const binds: string[] = [];
  if (filter.docType) {
    conds.push("c.doc_type = ?");
    binds.push(filter.docType);
  }
  if (filter.tag) {
    // Tags are stored comma-separated; wrapping both sides in commas makes the
    // match whole-tag only. `%` and `_` inside the tag are LIKE wildcards, so
    // they are escaped to be matched literally instead of matching other tags.
    const literalTag = filter.tag.replace(/[\\%_]/g, "\\$&");
    conds.push("(',' || c.tags || ',') LIKE ? ESCAPE '\\'");
    binds.push(`%,${literalTag},%`);
  }
  return { clause: conds.length ? conds.join(" AND ") : "", binds };
}
|
|
105
|
+
|
|
106
|
+
/**
 * SQLite-backed search index: chunk rows, an FTS5 table kept in sync by
 * triggers, an optional `vec_chunks` vector table (sqlite-vec `vec0`), per-source
 * content hashes and key/value build metadata.
 */
export class IndexStore {
  private readonly db: Database.Database;

  /**
   * Opens (creating if needed) the database at `dbPath` and ensures all tables exist.
   *
   * @param dbPath Path of the SQLite file; missing parent directories are created.
   * @param opts.fresh When true, any existing database at `dbPath` is deleted first.
   * @param opts.dim Embedding dimension used if `vec_chunks` has to be created
   *   (default 256). An already existing `vec_chunks` table is left untouched.
   */
  constructor(dbPath: string, opts: { fresh?: boolean; dim?: number } = {}) {
    mkdirSync(dirname(dbPath), { recursive: true });
    if (opts.fresh) {
      // The database runs in WAL mode (see below), so `-wal` / `-shm` sidecar
      // files may sit next to it. Remove them together with the main file so a
      // leftover log is never paired with the new database.
      for (const suffix of ["", "-wal", "-shm"]) {
        rmSync(dbPath + suffix, { force: true });
      }
    }
    this.db = new Database(dbPath);
    try {
      sqliteVec.load(this.db);
      this.db.pragma("journal_mode = WAL");
      const dim = opts.dim ?? 256;
      if (!this.tableExists("chunks")) {
        this.db.exec(SCHEMA);
        this.db.exec(`CREATE VIRTUAL TABLE vec_chunks USING vec0(embedding FLOAT[${dim}])`);
      } else {
        this.ensureVecTable(dim);
      }
      this.db.exec(
        "CREATE TABLE IF NOT EXISTS source_meta (source TEXT PRIMARY KEY, sha256 TEXT NOT NULL)",
      );
    } catch (err) {
      // Do not leak the open handle when initialisation fails.
      this.db.close();
      throw err;
    }
  }

  /** Creates the `vec_chunks` table with the given dimension if it is missing. */
  private ensureVecTable(dim: number): void {
    if (!this.tableExists("vec_chunks")) {
      this.db.exec(`CREATE VIRTUAL TABLE vec_chunks USING vec0(embedding FLOAT[${dim}])`);
    }
  }

  /** Returns true when `sqlite_master` lists a table or view with this name. */
  private tableExists(name: string): boolean {
    return (
      this.db
        .prepare("SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name = ?")
        .get(name) != null
    );
  }

  /** Returns true when `vec_chunks` exists and holds at least one row. */
  hasVectors(): boolean {
    if (!this.tableExists("vec_chunks")) return false;
    const n = (this.db.prepare("SELECT COUNT(*) c FROM vec_chunks").get() as { c: number }).c;
    return n > 0;
  }

  /**
   * Removes all chunks, source hashes and vectors. `index_meta` is not touched.
   * Runs in one transaction so a failure cannot leave the tables out of step.
   */
  clear(): void {
    const hasVecTable = this.tableExists("vec_chunks");
    const tx = this.db.transaction(() => {
      this.db.exec("DELETE FROM chunks; DELETE FROM source_meta;");
      if (hasVecTable) {
        this.db.exec("DELETE FROM vec_chunks;");
      }
    });
    tx();
  }

  /**
   * Deletes every chunk of `source`, the vectors stored under those chunk ids,
   * and the source's hash entry, in a single transaction.
   */
  deleteBySource(source: string): void {
    const selIds = this.db.prepare("SELECT id FROM chunks WHERE source = ?");
    const delVec = this.tableExists("vec_chunks")
      ? this.db.prepare("DELETE FROM vec_chunks WHERE rowid = ?")
      : null;
    const delChunk = this.db.prepare("DELETE FROM chunks WHERE source = ?");
    const delMeta = this.db.prepare("DELETE FROM source_meta WHERE source = ?");
    const tx = this.db.transaction(() => {
      // Read the ids inside the transaction so they match the rows deleted below.
      const ids = selIds.all(source) as { id: number }[];
      if (delVec) {
        for (const { id } of ids) delVec.run(BigInt(id));
      }
      delChunk.run(source);
      delMeta.run(source);
    });
    tx();
  }

  /** Returns the recorded content hash of every known source, keyed by source. */
  sourceHashes(): Map<string, string> {
    const rows = this.db.prepare("SELECT source, sha256 FROM source_meta").all() as {
      source: string;
      sha256: string;
    }[];
    return new Map(rows.map((r) => [r.source, r.sha256]));
  }

  /** Inserts or replaces the content hash recorded for `source`. */
  setSourceHash(source: string, sha256: string): void {
    this.db
      .prepare("INSERT OR REPLACE INTO source_meta(source, sha256) VALUES (?, ?)")
      .run(source, sha256);
  }

  /**
   * Inserts chunks (and, when given, their embeddings) in one transaction.
   *
   * @param chunks Chunks to insert; each object is bound by property name to
   *   the named parameters of the INSERT statement.
   * @param vectors Optional embeddings, index-aligned with `chunks`. A chunk
   *   whose entry is missing is stored without a vector.
   */
  addChunks(chunks: Chunk[], vectors?: number[][]): void {
    const insChunk = this.db.prepare(
      `INSERT INTO chunks
        (source, chunk_index, text, heading_path, heading_slug, doc_type, title, timestamp, tags)
       VALUES (@source, @chunkIndex, @text, @headingPath, @headingSlug, @docType, @title, @timestamp, @tags)`,
    );
    const insVec = this.tableExists("vec_chunks")
      ? this.db.prepare("INSERT INTO vec_chunks(rowid, embedding) VALUES (?, ?)")
      : null;
    const tx = this.db.transaction(() => {
      for (let i = 0; i < chunks.length; i++) {
        const info = insChunk.run(chunks[i]! as unknown as Record<string, unknown>);
        const vec = vectors?.[i];
        if (insVec && vec) {
          // The vector row reuses the chunk's rowid so the two tables can be joined.
          const rowid = BigInt(info.lastInsertRowid as number | bigint);
          const buf = Buffer.from(new Float32Array(vec).buffer);
          insVec.run(rowid, buf);
        }
      }
    });
    tx();
  }

  /**
   * Records build metadata in `index_meta`: always `schema_version` and
   * `built_at`; with `meta`, mode `hybrid` plus the model fields; without it,
   * mode `fts-only`.
   */
  setMeta(meta?: IndexMeta): void {
    const ins = this.db.prepare(
      "INSERT OR REPLACE INTO index_meta(key, value) VALUES (?, ?)",
    );
    const tx = this.db.transaction(() => {
      ins.run("schema_version", String(SCHEMA_VERSION));
      ins.run("built_at", new Date().toISOString());
      if (meta) {
        ins.run("mode", "hybrid");
        ins.run("model_id", meta.modelId);
        ins.run("dim", String(meta.dim));
        ins.run("quant", meta.quant);
        ins.run("model_sha256", meta.modelSha256);
      } else {
        ins.run("mode", "fts-only");
      }
    });
    tx();
  }

  /** Returns every `index_meta` row as a plain key → value object. */
  readMeta(): Record<string, string> {
    const rows = this.db.prepare("SELECT key, value FROM index_meta").all() as {
      key: string;
      value: string;
    }[];
    return Object.fromEntries(rows.map((r) => [r.key, r.value]));
  }

  /** Returns the row counts of `chunks`, `chunks_fts` and `vec_chunks`. */
  counts(): Counts {
    const n = (sql: string) => (this.db.prepare(sql).get() as { c: number }).c;
    return {
      chunks: n("SELECT COUNT(*) c FROM chunks"),
      fts: n("SELECT COUNT(*) c FROM chunks_fts"),
      vec: this.tableExists("vec_chunks") ? n("SELECT COUNT(*) c FROM vec_chunks") : 0,
    };
  }

  /**
   * Nearest-neighbour search over the stored embeddings.
   *
   * @param queryVec Query embedding.
   * @param k Maximum number of hits to return.
   * @param filter Optional metadata filter. It is applied after the
   *   nearest-neighbour selection, so the inner query over-fetches
   *   (`max(k * 8, 64)` candidates) and fewer than `k` hits may come back.
   * @returns Hits ordered by ascending distance; empty when `vec_chunks` is missing.
   */
  vecKnn(queryVec: number[], k: number, filter: MetaFilter = {}): VecHit[] {
    if (!this.tableExists("vec_chunks")) return [];
    const { clause, binds } = buildWhere(filter);
    const knnLimit = clause ? Math.max(k * 8, 64) : k;
    const buf = Buffer.from(new Float32Array(queryVec).buffer);
    const sql = `
      SELECT ${CHUNK_COLS}, k.distance
      FROM (
        SELECT rowid, distance FROM vec_chunks
        WHERE embedding MATCH ? ORDER BY distance LIMIT ?
      ) k
      JOIN chunks c ON c.id = k.rowid
      ${clause ? `WHERE ${clause}` : ""}
      ORDER BY k.distance
      LIMIT ?`;
    return this.db.prepare(sql).all(buf, knnLimit, ...binds, k) as VecHit[];
  }

  /**
   * Dumps every chunk (ordered by id) with its embedding.
   *
   * @returns `rows` and an index-aligned `vectors` array; a chunk without a
   *   stored embedding (or a missing `vec_chunks` table) yields `[]`.
   */
  exportAll(): { rows: ChunkRow[]; vectors: number[][] } {
    const rows = this.db
      .prepare(`SELECT ${CHUNK_COLS} FROM chunks c ORDER BY c.id`)
      .all() as ChunkRow[];
    if (!this.tableExists("vec_chunks") || rows.length === 0) {
      return { rows, vectors: rows.map(() => []) };
    }
    const getVec = this.db.prepare("SELECT embedding FROM vec_chunks WHERE rowid = ?");
    const width = Float32Array.BYTES_PER_ELEMENT;
    const vectors: number[][] = rows.map((r) => {
      const v = getVec.get(BigInt(r.id)) as { embedding: Buffer | Uint8Array } | undefined;
      if (!v) return [];
      const raw = v.embedding;
      // A Float32Array view needs a byteOffset that is a multiple of 4; copy the
      // bytes into a fresh buffer (offset 0) when the blob is not aligned.
      const bytes = raw.byteOffset % width === 0 ? raw : new Uint8Array(raw);
      const f32 = new Float32Array(
        bytes.buffer,
        bytes.byteOffset,
        Math.floor(bytes.byteLength / width),
      );
      return Array.from(f32);
    });
    return { rows, vectors };
  }

  /**
   * Full-text search over chunk text.
   *
   * @param query Passed verbatim as the FTS5 `MATCH` expression.
   * @param k Maximum number of hits to return.
   * @param filter Optional metadata filter, ANDed with the match.
   * @returns Hits ordered by ascending `bm25`.
   */
  ftsSearch(query: string, k: number, filter: MetaFilter = {}): FtsHit[] {
    const { clause, binds } = buildWhere(filter);
    const sql = `
      SELECT ${CHUNK_COLS}, bm25(chunks_fts) AS bm25
      FROM chunks_fts
      JOIN chunks c ON c.id = chunks_fts.rowid
      WHERE chunks_fts MATCH ?${clause ? ` AND ${clause}` : ""}
      ORDER BY bm25
      LIMIT ?`;
    return this.db.prepare(sql).all(query, ...binds, k) as FtsHit[];
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
package/src/vendor-web.ts
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { copyFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
|
|
2
|
+
import { dirname, join, resolve } from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
|
|
5
|
+
/**
 * Default root for resolving dependencies and assets: the directory three
 * levels above the folder that contains this module.
 */
function packageRoot(): string {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  return resolve(moduleDir, "../../..");
}
|
|
8
|
+
|
|
9
|
+
/**
 * Copies a model's quantized ONNX weights and tokenizer/config files into
 * `<outRoot>/models/<modelId>`.
 *
 * @param modelRoot Directory that contains one sub-directory per model.
 * @param modelId Model sub-directory name (also used below `models/` in the output).
 * @param outRoot Output root directory.
 * @returns false (creating nothing) when `onnx/model_quantized.onnx` is missing
 *   for the model; true once the files have been copied.
 */
export function vendorModel(
  modelRoot: string,
  modelId: string,
  outRoot: string,
): boolean {
  const sourceDir = resolve(modelRoot, modelId);
  const weightsPath = join(sourceDir, "onnx", "model_quantized.onnx");
  if (!existsSync(weightsPath)) {
    return false;
  }

  const targetDir = join(outRoot, "models", modelId);
  mkdirSync(join(targetDir, "onnx"), { recursive: true });

  // These companions are optional: each one is copied only if it is present.
  const companions = [
    "config.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
  ];
  for (const fileName of companions) {
    const from = join(sourceDir, fileName);
    if (!existsSync(from)) continue;
    copyFileSync(from, join(targetDir, fileName));
  }

  copyFileSync(weightsPath, join(targetDir, "onnx", "model_quantized.onnx"));
  return true;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Copies the browser runtime bundles (transformers.js web build and the
 * onnxruntime-web WebGPU bundle, plus its optional wasm companions) from
 * `node_modules` into `<outRoot>/assets/search/lib`.
 *
 * @param outRoot Output root directory.
 * @param repoRoot Directory whose `node_modules` is searched; defaults to `packageRoot()`.
 * @returns false (copying nothing) when either required bundle is missing; true otherwise.
 */
export function vendorRuntime(outRoot: string, repoRoot?: string): boolean {
  const base = repoRoot ?? packageRoot();
  const transformersBundle = resolve(
    base,
    "node_modules/@huggingface/transformers/dist/transformers.web.js",
  );
  const ortDist = resolve(base, "node_modules/onnxruntime-web/dist");
  const ortBundle = join(ortDist, "ort.webgpu.bundle.min.mjs");
  if (!(existsSync(transformersBundle) && existsSync(ortBundle))) {
    return false;
  }

  const targetDir = join(outRoot, "assets", "search", "lib");
  mkdirSync(targetDir, { recursive: true });
  copyFileSync(transformersBundle, join(targetDir, "transformers.web.js"));
  copyFileSync(ortBundle, join(targetDir, "ort.webgpu.bundle.min.mjs"));

  // The wasm companions are optional and copied only when present.
  const optionalOrtFiles = [
    "ort-wasm-simd-threaded.asyncify.mjs",
    "ort-wasm-simd-threaded.asyncify.wasm",
  ];
  for (const fileName of optionalOrtFiles) {
    const from = join(ortDist, fileName);
    if (!existsSync(from)) continue;
    copyFileSync(from, join(targetDir, fileName));
  }
  return true;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Copies the browser search script to `<outRoot>/assets/search.mjs`.
 *
 * @param outRoot Output root directory.
 * @param repoRoot Root that contains `packages/search/assets/search.mjs`.
 *   When omitted, `packageRoot()` is tried first and, if the script is not
 *   there, the `assets/` directory next to this module's own folder is used.
 *   The published package ships the script at `assets/search.mjs`, not under a
 *   `packages/search/` tree.
 * @returns false when no script file could be found; true after copying.
 */
export function copySearchScript(outRoot: string, repoRoot?: string): boolean {
  let src = join(repoRoot ?? packageRoot(), "packages/search/assets/search.mjs");
  if (repoRoot == null && !existsSync(src)) {
    // Only the default lookup falls back; an explicit repoRoot is honoured as given.
    src = resolve(dirname(fileURLToPath(import.meta.url)), "../assets/search.mjs");
  }
  if (!existsSync(src)) return false;
  const destDir = join(outRoot, "assets");
  mkdirSync(destDir, { recursive: true });
  copyFileSync(src, join(destDir, "search.mjs"));
  return true;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Returns the source text of the browser search script.
 *
 * @param repoRoot Root that contains `packages/search/assets/search.mjs`.
 *   When omitted, `packageRoot()` is tried first and, if the script is not
 *   there, the `assets/` directory next to this module's own folder is used.
 *   The published package ships the script at `assets/search.mjs`, not under a
 *   `packages/search/` tree.
 * @throws The `readFileSync` error when the script cannot be read.
 */
export function readSearchScript(repoRoot?: string): string {
  const primary = join(repoRoot ?? packageRoot(), "packages/search/assets/search.mjs");
  if (repoRoot != null || existsSync(primary)) {
    return readFileSync(primary, "utf8");
  }
  const bundled = resolve(dirname(fileURLToPath(import.meta.url)), "../assets/search.mjs");
  // If neither location exists, read the primary path so the error is unchanged.
  return readFileSync(existsSync(bundled) ? bundled : primary, "utf8");
}
|
package/src/walk.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { readdirSync, statSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
|
|
4
|
+
/**
 * Recursively collects every file whose name ends in `.md` below `root`.
 *
 * Entries of each directory are visited in sorted name order, so the result is
 * deterministic. Entries that cannot be resolved (for example a dangling
 * symlink) are skipped instead of aborting the walk.
 *
 * @param root Directory to start from.
 * @returns Paths built by joining `root` with the relative location of each file.
 */
export function walkMarkdown(root: string): string[] {
  const out: string[] = [];
  function visit(dir: string): void {
    for (const name of readdirSync(dir).sort()) {
      const abs = join(dir, name);
      // `throwIfNoEntry: false` yields undefined for a vanished or dangling entry.
      const info = statSync(abs, { throwIfNoEntry: false });
      if (info === undefined) continue;
      if (info.isDirectory()) visit(abs);
      else if (name.endsWith(".md")) out.push(abs);
    }
  }
  visit(root);
  return out;
}
|
package/src/web-export.ts
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import { Buffer } from "node:buffer";
|
|
2
|
+
import type { ChunkRow } from "./store.ts";
|
|
3
|
+
|
|
4
|
+
/** `schema_version` written into hybrid web indexes by `buildWebIndex`. */
export const WEB_INDEX_SCHEMA_VERSION = 3;
/** `schema_version` written into FTS-only web indexes by `buildFtsWebIndex`. */
export const FTS_WEB_INDEX_SCHEMA_VERSION = 4;
/** Factor each embedding component is multiplied by before rounding to int8. */
export const INT8_SCALE = 127;
/** Default maximum snippet length used by `toSnippet`. */
export const SNIPPET_LEN = 220;
|
|
8
|
+
|
|
9
|
+
/** Per-chunk record of a web index (snake_case keys, as serialized). */
export interface WebChunk {
  /** Source document the chunk came from. */
  readonly source: string;
  /** URL derived from `source` by the `sourceToUrl` mapper. */
  readonly url: string;
  /** Copied from the chunk row's `headingSlug`. */
  readonly heading_slug: string;
  /** Copied from the chunk row's `headingPath`. */
  readonly heading_path: string;
  /** Copied from the chunk row's `docType`. */
  readonly doc_type: string;
  /** Copied from the chunk row's `title`. */
  readonly title: string;
  /** Copied from the chunk row's `tags`. */
  readonly tags: string;
  /** Whitespace-collapsed, truncated excerpt of the chunk text. */
  readonly snippet: string;
  /** Present only when the disclosure map has an entry for the source. */
  readonly digital_source_type?: string;
}
|
|
20
|
+
|
|
21
|
+
/** Hybrid web index: chunk metadata plus int8-quantized embeddings. */
export interface WebIndex {
  /** Layout version of this index. */
  readonly schema_version: number;
  /** Discriminator for the hybrid (vector-carrying) index. */
  readonly mode: "hybrid";
  /** ISO-8601 timestamp of when the index was built. */
  readonly built_at: string;
  /** Embedding model the vectors were produced with. */
  readonly model: { id: string; dim: number; quant: string; sha256: string };
  /** One entry per exported chunk, in the same order as the packed vectors. */
  readonly chunks: WebChunk[];
  /** Packed embeddings for `chunks`. */
  readonly embeddings: {
    /** Number of components per vector. */
    readonly dim: number;
    /** Storage format of each component. */
    readonly encoding: "int8";
    /** Factor the float components were multiplied by before rounding. */
    readonly scale: number;
    /** Base64 of all vectors concatenated, `dim` int8 values per chunk. */
    readonly vectors_b64: string;
  };
}
|
|
34
|
+
|
|
35
|
+
/** Chunk record of an FTS-only web index: a `WebChunk` that also carries the full text. */
export interface FtsWebChunk extends WebChunk {
  /** Complete, untruncated chunk text. */
  readonly text: string;
}
|
|
38
|
+
|
|
39
|
+
/** Web index without embeddings, for text-only search. */
export interface FtsWebIndex {
  /** Layout version of this index. */
  readonly schema_version: number;
  /** Discriminator for the text-only index. */
  readonly mode: "fts";
  /** ISO-8601 timestamp of when the index was built. */
  readonly built_at: string;
  /** One entry per chunk, including its full text. */
  readonly chunks: FtsWebChunk[];
}
|
|
45
|
+
|
|
46
|
+
/**
 * Produces a single-line excerpt of `text`.
 *
 * Runs of whitespace are collapsed to one space and the result is trimmed. If
 * it is longer than `max` UTF-16 code units it is cut and `…` is appended. The
 * cut never splits a surrogate pair, so the excerpt may be one unit shorter
 * than `max` rather than end in a lone surrogate.
 *
 * @param text Raw chunk text.
 * @param max Maximum excerpt length before the ellipsis (default `SNIPPET_LEN`).
 * @returns The flattened text, truncated with a trailing `…` when needed.
 */
export function toSnippet(text: string, max: number = SNIPPET_LEN): string {
  const flat = text.replace(/\s+/g, " ").trim();
  if (flat.length <= max) return flat;
  let end = max;
  if (end > 0) {
    // A high surrogate at the cut position means its low half would be dropped.
    const lastUnit = flat.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  }
  return flat.slice(0, end) + "…";
}
|
|
50
|
+
|
|
51
|
+
/**
 * Default source → URL mapping: swaps a trailing `.md` extension
 * (case-insensitive) for `.html`; any other path is returned unchanged.
 */
export function defaultSourceUrl(source: string): string {
  const markdownExt = /\.md$/i;
  return source.replace(markdownExt, ".html");
}
|
|
54
|
+
|
|
55
|
+
/**
 * Looks up the disclosure value recorded for `source`.
 *
 * @returns The map entry, or undefined when machine-readable output is
 *   disabled, no map was supplied, or the map has no entry for the source.
 */
function disclosureForSource(
  source: string,
  disclosureMap: ReadonlyMap<string, string> | undefined,
  machineReadable: boolean,
): string | undefined {
  if (machineReadable && disclosureMap) {
    return disclosureMap.get(source);
  }
  return undefined;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Builds the hybrid web index from exported chunk rows and their embeddings.
 *
 * Rows with an empty vector are dropped. The remaining vectors are quantized
 * to int8 (`round(x * INT8_SCALE)`, clamped to [-127, 127]) and packed
 * back-to-back into one base64 string, in the same order as `chunks`.
 *
 * @param rows Chunk rows.
 * @param vectors Embeddings, index-aligned with `rows`; `[]` marks "no vector".
 * @param meta Index metadata; `dim`, `model_id`, `quant` and `model_sha256` are read.
 * @param sourceToUrl Maps a source to its URL (default `defaultSourceUrl`).
 * @param opts.disclosureMap Optional per-source `digital_source_type` values.
 * @param opts.machineReadable Pass false to suppress `digital_source_type`.
 * @throws Error when `rows` and `vectors` differ in length, or when a kept
 *   vector's length differs from the index dimension.
 */
export function buildWebIndex(
  rows: ChunkRow[],
  vectors: number[][],
  meta: Record<string, string>,
  sourceToUrl: (source: string) => string = defaultSourceUrl,
  opts?: {
    readonly disclosureMap?: ReadonlyMap<string, string>;
    readonly machineReadable?: boolean;
  },
): WebIndex {
  const machineReadable = opts?.machineReadable !== false;
  if (rows.length !== vectors.length) {
    throw new Error(`row/vector count mismatch: ${rows.length} != ${vectors.length}`);
  }

  // Without a recorded dimension, infer it from the first vector that has one.
  // The first row may legitimately have no embedding (an empty array).
  const firstVector = vectors.find((v) => v.length > 0);
  const dim = Number(meta.dim) || (firstVector?.length ?? 0);

  const kept: { row: ChunkRow; vec: number[] }[] = [];
  for (let i = 0; i < rows.length; i++) {
    const vec = vectors[i]!;
    if (vec.length === 0) continue;
    kept.push({ row: rows[i]!, vec });
  }

  const buf = new Int8Array(kept.length * dim);
  for (let i = 0; i < kept.length; i++) {
    const v = kept[i]!.vec;
    if (v.length !== dim) throw new Error(`dimension mismatch chunk[${i}]: ${v.length} != ${dim}`);
    for (let j = 0; j < dim; j++) {
      const q = Math.round(v[j]! * INT8_SCALE);
      // Clamp symmetrically; -128 is never produced.
      buf[i * dim + j] = q < -127 ? -127 : q > 127 ? 127 : q;
    }
  }
  const vectors_b64 = Buffer.from(buf.buffer, buf.byteOffset, buf.byteLength).toString("base64");

  const chunks: WebChunk[] = kept.map(({ row: r }) => {
    const chunk: WebChunk = {
      source: r.source,
      url: sourceToUrl(r.source),
      heading_slug: r.headingSlug,
      heading_path: r.headingPath,
      doc_type: r.docType,
      title: r.title,
      tags: r.tags,
      snippet: toSnippet(r.text),
    };
    const dst = disclosureForSource(r.source, opts?.disclosureMap, machineReadable);
    if (dst) return { ...chunk, digital_source_type: dst };
    return chunk;
  });

  return {
    schema_version: WEB_INDEX_SCHEMA_VERSION,
    mode: "hybrid",
    built_at: new Date().toISOString(),
    model: {
      id: meta.model_id ?? "",
      dim,
      quant: meta.quant ?? "",
      sha256: meta.model_sha256 ?? "",
    },
    chunks,
    embeddings: { dim, encoding: "int8", scale: INT8_SCALE, vectors_b64 },
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Builds the text-only web index: every row becomes a chunk record carrying
 * its metadata, a snippet and the full text.
 *
 * @param rows Chunk rows.
 * @param sourceToUrl Maps a source to its URL (default `defaultSourceUrl`).
 * @param opts.disclosureMap Optional per-source `digital_source_type` values.
 * @param opts.machineReadable Pass false to suppress `digital_source_type`.
 */
export function buildFtsWebIndex(
  rows: ChunkRow[],
  sourceToUrl: (source: string) => string = defaultSourceUrl,
  opts?: {
    readonly disclosureMap?: ReadonlyMap<string, string>;
    readonly machineReadable?: boolean;
  },
): FtsWebIndex {
  const emitDisclosure = opts?.machineReadable !== false;

  const chunks: FtsWebChunk[] = [];
  for (const row of rows) {
    const base: FtsWebChunk = {
      source: row.source,
      url: sourceToUrl(row.source),
      heading_slug: row.headingSlug,
      heading_path: row.headingPath,
      doc_type: row.docType,
      title: row.title,
      tags: row.tags,
      snippet: toSnippet(row.text),
      text: row.text,
    };
    const disclosure = disclosureForSource(row.source, opts?.disclosureMap, emitDisclosure);
    // The extra key is attached only when a non-empty disclosure value exists.
    chunks.push(disclosure ? { ...base, digital_source_type: disclosure } : base);
  }

  return {
    schema_version: FTS_WEB_INDEX_SCHEMA_VERSION,
    mode: "fts",
    built_at: new Date().toISOString(),
    chunks,
  };
}
|