@sorane/search 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ import { resolve } from "node:path";
2
+ import { readFileSync, existsSync } from "node:fs";
3
+
4
// Contract for a text-embedding backend. `RuriEmbeddings` implements it and
// `searchHybrid` consumes it through `embed`.
+ export interface EmbeddingProvider {
5
// Expected vector length. `RuriEmbeddings.embed` throws when the model output
// has a different length.
+ readonly dimensions: number;
6
// Optional identifier of the model in use.
+ readonly modelId?: string;
7
// Optional model fingerprint. `RuriEmbeddings` returns the trimmed contents of
// `version.txt` in the model directory, or "" when that file is absent.
+ readonly modelSha256?: string;
8
// Optional quantisation label. `RuriEmbeddings` returns its configured dtype.
+ readonly quant?: string;
9
// Resolves to the embedding vector for a single text.
+ embed(text: string): Promise<number[]>;
10
// Resolves to one vector per input text. `RuriEmbeddings` preserves input order.
+ embedBatch(texts: string[]): Promise<number[][]>;
11
+ }
12
+
13
// Japanese literal meaning "search document: ". Presumably prepended to document
// text before embedding -- its use is not visible here; confirm against the indexer.
+ export const DOC_PREFIX = "検索文書: ";
14
// Japanese literal meaning "search query: ". `searchHybrid` prepends it to the
// user query before calling `embed`.
+ export const QUERY_PREFIX = "検索クエリ: ";
15
+
16
// Constructor options for `RuriEmbeddings`; every field is optional.
+ export interface RuriOptions {
17
// Directory that contains one sub-directory per model. Defaults to
// "vendor/models" and is passed through `path.resolve`.
+ readonly modelRoot?: string;
18
// Name of the model sub-directory under `modelRoot`. Defaults to "ruri-v3-30m".
+ readonly modelId?: string;
19
// dtype handed to the feature-extraction pipeline. Defaults to "q8".
+ readonly dtype?: string;
20
+ }
21
+
22
/** Call signature of the feature-extraction pipeline as this class uses it. */
type FeatureExtractor = (
  text: string,
  opts: object,
) => Promise<{ data: Float32Array | number[] }>;

/**
 * Embedding provider backed by a model stored on local disk and loaded lazily
 * through `@huggingface/transformers`.
 *
 * The pipeline is created on the first `embed` call. Concurrent first calls
 * share a single in-flight load instead of each creating their own pipeline;
 * a failed load is forgotten so a later call can retry.
 *
 * Note: loading assigns `allowRemoteModels`, `localModelPath` and
 * `useBrowserCache` on the library's shared `env` object.
 */
export class RuriEmbeddings implements EmbeddingProvider {
  /** Vector length every embedding must have. */
  readonly dimensions = 256;
  /** Name of the model sub-directory under the model root. */
  readonly modelId: string;
  /** Absolute path of the model directory (`modelRoot/modelId`). */
  readonly modelDir: string;
  private readonly dtype: string;
  private readonly modelRoot: string;
  private extractor: FeatureExtractor | null = null;
  /** In-flight load, shared by callers that arrive before it settles. */
  private loading: Promise<void> | null = null;

  /**
   * @param options Optional overrides; defaults are `vendor/models`,
   *   `ruri-v3-30m` and dtype `q8`.
   */
  constructor(options: RuriOptions = {}) {
    this.modelRoot = resolve(options.modelRoot ?? "vendor/models");
    this.modelId = options.modelId ?? "ruri-v3-30m";
    this.modelDir = resolve(this.modelRoot, this.modelId);
    this.dtype = options.dtype ?? "q8";
  }

  /** Trimmed contents of `version.txt` in the model directory, or "" if absent. */
  get modelSha256(): string {
    const versionFile = resolve(this.modelDir, "version.txt");
    return existsSync(versionFile) ? readFileSync(versionFile, "utf8").trim() : "";
  }

  /** The configured dtype. */
  get quant(): string {
    return this.dtype;
  }

  /** Resolves once the pipeline is ready, starting at most one load at a time. */
  private ensure(): Promise<void> {
    if (this.extractor) return Promise.resolve();
    if (!this.loading) {
      // Clear the memo when the load settles: on success `extractor` short-circuits
      // future calls, on failure the next call gets a fresh attempt.
      this.loading = this.load().finally(() => {
        this.loading = null;
      });
    }
    return this.loading;
  }

  /**
   * Creates the feature-extraction pipeline from the local model directory.
   *
   * @throws Error when the model directory does not exist.
   */
  private async load(): Promise<void> {
    if (!existsSync(this.modelDir)) {
      throw new Error(
        `model not found: ${this.modelDir}\n run: npm run fetch-model`,
      );
    }
    const { env, pipeline } = await import("@huggingface/transformers");
    env.allowRemoteModels = false;
    env.localModelPath = this.modelRoot;
    env.useBrowserCache = false;
    this.extractor = (await pipeline("feature-extraction", this.modelId, {
      dtype: this.dtype as "q8",
    })) as unknown as FeatureExtractor;
  }

  /**
   * Embeds one text with mean pooling and normalisation requested from the pipeline.
   *
   * @throws Error when the model is missing or the output length is not `dimensions`.
   */
  async embed(text: string): Promise<number[]> {
    await this.ensure();
    const extractor = this.extractor;
    if (!extractor) {
      throw new Error("embedding pipeline is not initialised");
    }
    const { data } = await extractor(text, { pooling: "mean", normalize: true });
    if (data.length !== this.dimensions) {
      throw new Error(`dimension mismatch: ${data.length} != ${this.dimensions}`);
    }
    return Array.from(data);
  }

  /** Embeds the texts one after another and returns the vectors in input order. */
  async embedBatch(texts: string[]): Promise<number[][]> {
    const vectors: number[][] = [];
    for (const text of texts) {
      vectors.push(await this.embed(text));
    }
    return vectors;
  }
}
@@ -0,0 +1,85 @@
1
+ import { mkdirSync } from "node:fs";
2
+ import { join, resolve } from "node:path";
3
+ import { deriveWebIndex, type WebSearchMode } from "./derive-web-index.ts";
4
+ import { copySearchScript, vendorModel, vendorRuntime } from "./vendor-web.ts";
5
+
6
// Inputs for `emitSearchAssets`.
+ export interface EmitSearchAssetsOptions {
7
// Base directory against which `modelRoot` is resolved (hybrid mode only).
+ readonly cwd: string;
8
// Output directory; the index is written under `<outDir>/assets` and the
// directory is also handed to the script/model/runtime vendoring helpers.
+ readonly outDir: string;
9
// Path of the source index passed to `deriveWebIndex`.
+ readonly indexPath: string;
10
// Search mode; treated as "fts" when omitted.
+ readonly mode?: WebSearchMode;
11
// Model root directory, resolved against `cwd` in hybrid mode.
+ readonly modelRoot: string;
12
// Model identifier passed to `vendorModel` in hybrid mode.
+ readonly modelId: string;
13
// When exactly `false`, `vendorModel` is not called and the model is reported as not bundled.
+ readonly bundleModel?: boolean;
14
// Only logged by `emitSearchAssets` (hybrid mode); not otherwise used there.
+ readonly assetBaseUrl?: string;
15
// Maps a source identifier to a URL; forwarded to `deriveWebIndex`.
+ readonly sourceToUrl: (source: string) => string;
16
// Forwarded unchanged to `deriveWebIndex`.
+ readonly contentDir?: string;
17
// Forwarded unchanged to `deriveWebIndex`.
+ readonly machineReadable?: boolean;
18
// Forwarded to `copySearchScript` and `vendorRuntime`.
+ readonly repoRoot?: string;
19
// Receives human-readable progress lines; defaults to a no-op.
+ readonly onProgress?: (message: string) => void;
20
+ }
21
+
22
// Outcome reported by `emitSearchAssets`.
+ export interface EmitSearchAssetsResult {
23
// False when `deriveWebIndex` reported that nothing was written; the other
// fields are then 0 / false and `mode` is omitted.
+ readonly written: boolean;
24
// Chunk count reported by `deriveWebIndex`.
+ readonly chunks: number;
25
// Byte size reported by `deriveWebIndex`.
+ readonly bytes: number;
26
// Mode reported by `deriveWebIndex`.
+ readonly mode?: WebSearchMode;
27
// Result of `vendorModel`; always false outside hybrid mode or when bundling is disabled.
+ readonly model: boolean;
28
// Result of `vendorRuntime`; always false outside hybrid mode.
+ readonly runtime: boolean;
29
+ }
30
+
31
/**
 * Writes the web search index under `<outDir>/assets` and copies the files the
 * selected search mode needs.
 *
 * Steps, in order: create the assets directory, derive `search-index.json`,
 * copy the search script, and -- in hybrid mode only -- vendor the model
 * (unless `bundleModel` is `false`) and the runtime. One summary line is sent
 * to `onProgress`; hybrid mode additionally logs `assetBaseUrl` when set.
 *
 * @param opts See `EmitSearchAssetsOptions`.
 * @returns What was written. When the index could not be derived the result
 *   has `written: false` and nothing else is copied.
 */
export async function emitSearchAssets(
  opts: EmitSearchAssetsOptions,
): Promise<EmitSearchAssetsResult> {
  const report = opts.onProgress ?? (() => {});
  const searchMode = opts.mode ?? "fts";
  const status = (ok: boolean): string => (ok ? "ok" : "missing");

  const assetsDir = join(opts.outDir, "assets");
  mkdirSync(assetsDir, { recursive: true });

  const derived = await deriveWebIndex(
    opts.indexPath,
    join(assetsDir, "search-index.json"),
    opts.sourceToUrl,
    searchMode,
    {
      contentDir: opts.contentDir,
      machineReadable: opts.machineReadable,
    },
  );

  // No source index: report it and stop before copying anything else.
  if (!derived.written) {
    report(`search-index.json: skipped (no index at ${opts.indexPath})`);
    return { written: false, chunks: 0, bytes: 0, model: false, runtime: false };
  }

  const scriptCopied = copySearchScript(opts.outDir, opts.repoRoot);

  if (searchMode !== "hybrid") {
    report(
      `search-index.json: ${derived.chunks} chunks, ${(derived.bytes / 1024).toFixed(1)} KB` +
        ` [fts] (script=${status(scriptCopied)})`,
    );
    return {
      written: true,
      chunks: derived.chunks,
      bytes: derived.bytes,
      mode: derived.mode,
      model: false,
      runtime: false,
    };
  }

  const modelRoot = resolve(opts.cwd, opts.modelRoot);
  // Short-circuit keeps `vendorModel` from running when bundling is disabled.
  const modelVendored =
    opts.bundleModel !== false && vendorModel(modelRoot, opts.modelId, opts.outDir);
  const runtimeVendored = vendorRuntime(opts.outDir, opts.repoRoot);
  report(
    `search-index.json: ${derived.chunks} chunks, ${(derived.bytes / 1024 / 1024).toFixed(1)} MB` +
      ` [hybrid] (model=${status(modelVendored)}, runtime=${status(runtimeVendored)}, script=${status(scriptCopied)})`,
  );
  if (opts.assetBaseUrl) {
    report(`search assets base URL: ${opts.assetBaseUrl}`);
  }

  return {
    written: true,
    chunks: derived.chunks,
    bytes: derived.bytes,
    mode: derived.mode,
    model: modelVendored,
    runtime: runtimeVendored,
  };
}
@@ -0,0 +1,24 @@
1
/** Heading text -> slug for anchor ids (duplicates within a page get a numeric suffix). */

/**
 * Characters removed from a slug: anything that is not `\w`, a hyphen, or in
 * one of these ranges -- U+3040-30FF (Hiragana/Katakana), U+3400-9FFF (CJK
 * ideographs), U+F900-FAFF (CJK compatibility ideographs), U+FF66-FF9F
 * (half-width Katakana).
 */
const DISALLOWED_SLUG_CHARS = /[^\w\u3040-\u30FF\u3400-\u9FFF\uF900-\uFAFF\uFF66-\uFF9F-]/g;

/**
 * Converts heading text to an anchor slug.
 *
 * The text is trimmed and lower-cased, Markdown emphasis/code markers
 * (`*`, `_`, backtick, `~`) are dropped, whitespace runs become single
 * hyphens, disallowed characters are removed, and leading/trailing hyphens
 * are stripped.
 *
 * @param text Raw heading text.
 * @returns The slug, or `"section"` when nothing usable remains.
 */
export function slugifyHeading(text: string): string {
  const base = text
    .trim()
    .toLowerCase()
    .replace(/[*_`~]/g, "")
    .replace(/\s+/g, "-")
    .replace(DISALLOWED_SLUG_CHARS, "")
    .replace(/-+/g, "-")
    .replace(/^-+|-+$/g, "");
  return base.length > 0 ? base : "section";
}

/**
 * Hands out unique slugs for the headings of one page.
 *
 * The first occurrence of a slug is returned unchanged; repeats get `-2`,
 * `-3`, ... appended. A suffixed slug is never handed out twice, even when a
 * later heading literally slugifies to it (e.g. "Foo", "Foo", "Foo 2").
 */
export class SlugLedger {
  /** Number of times each base slug has been requested. */
  private readonly used = new Map<string, number>();
  /** Every slug returned so far, suffixed or not. */
  private readonly issued = new Set<string>();

  /**
   * @param text Raw heading text.
   * @returns A slug not previously returned by this ledger.
   */
  next(text: string): string {
    const base = slugifyHeading(text);
    let count = this.used.get(base) ?? 0;
    let slug = count === 0 ? base : `${base}-${count + 1}`;
    // Skip candidates already taken by another heading's base or suffixed slug.
    while (this.issued.has(slug)) {
      count += 1;
      slug = `${base}-${count + 1}`;
    }
    this.used.set(base, count + 1);
    this.issued.add(slug);
    return slug;
  }
}
@@ -0,0 +1,40 @@
1
+ import { createHash } from "node:crypto";
2
+
3
/**
 * Returns the SHA-256 digest of `content`, encoded as UTF-8, as a lowercase hex string.
 */
export function hashContent(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content, "utf8");
  return hasher.digest("hex");
}
6
+
7
// Result of `planIncremental`: every source key sorted into exactly one list.
// Each list is sorted with the default `Array.prototype.sort` order.
+ export interface IncrementalPlan {
8
// Keys present on disk but absent from the index.
+ readonly added: string[];
9
// Keys present in both whose hashes differ.
+ readonly changed: string[];
10
// Keys present in the index but no longer on disk.
+ readonly removed: string[];
11
// Keys present in both with identical hashes.
+ readonly unchanged: string[];
12
+ }
13
+
14
/**
 * Compares the hashes currently on disk with the hashes stored in the index
 * and classifies every source key.
 *
 * @param disk Source key -> content hash for what exists now.
 * @param indexed Source key -> content hash for what was indexed earlier.
 * @returns The four lists of an `IncrementalPlan`, each sorted with the
 *   default string sort.
 */
export function planIncremental(
  disk: Map<string, string>,
  indexed: Map<string, string>,
): IncrementalPlan {
  const buckets: Record<"added" | "changed" | "removed" | "unchanged", string[]> = {
    added: [],
    changed: [],
    removed: [],
    unchanged: [],
  };

  disk.forEach((hash, source) => {
    const previous = indexed.get(source);
    const bucket =
      previous === undefined ? "added" : previous === hash ? "unchanged" : "changed";
    buckets[bucket].push(source);
  });

  indexed.forEach((_hash, source) => {
    if (!disk.has(source)) buckets.removed.push(source);
  });

  for (const list of Object.values(buckets)) {
    list.sort();
  }
  return buckets;
}
package/src/index.ts ADDED
@@ -0,0 +1,60 @@
1
+ export { chunkDocument, MIN_BODY, MAX_BODY, type Chunk } from "./chunker.ts";
2
+ export { hashContent, planIncremental, type IncrementalPlan } from "./incremental.ts";
3
+ export { slugifyHeading, SlugLedger } from "./heading-slug.ts";
4
+ export {
5
+ RuriEmbeddings,
6
+ DOC_PREFIX,
7
+ QUERY_PREFIX,
8
+ type EmbeddingProvider,
9
+ type RuriOptions,
10
+ } from "./embeddings.ts";
11
+ export {
12
+ IndexStore,
13
+ SCHEMA_VERSION,
14
+ type ChunkRow,
15
+ type MetaFilter,
16
+ type FtsHit,
17
+ type VecHit,
18
+ type Counts,
19
+ type IndexMeta,
20
+ } from "./store.ts";
21
+ export {
22
+ buildFtsQuery,
23
+ makeSnippet,
24
+ rrfFuse,
25
+ RRF_K,
26
+ checkModelMismatch,
27
+ searchFts,
28
+ searchHybrid,
29
+ search,
30
+ type SearchOptions,
31
+ type SearchResult,
32
+ } from "./search.ts";
33
+ export { walkMarkdown } from "./walk.ts";
34
+ export { buildSearchIndex, type BuildIndexOptions, type BuildIndexResult } from "./build-index.ts";
35
+ export {
36
+ buildWebIndex,
37
+ buildFtsWebIndex,
38
+ toSnippet,
39
+ defaultSourceUrl,
40
+ WEB_INDEX_SCHEMA_VERSION,
41
+ FTS_WEB_INDEX_SCHEMA_VERSION,
42
+ INT8_SCALE,
43
+ SNIPPET_LEN,
44
+ type WebChunk,
45
+ type WebIndex,
46
+ type FtsWebIndex,
47
+ type FtsWebChunk,
48
+ } from "./web-export.ts";
49
+ export { deriveWebIndex, type DeriveResult, type WebSearchMode } from "./derive-web-index.ts";
50
+ export {
51
+ vendorModel,
52
+ vendorRuntime,
53
+ copySearchScript,
54
+ readSearchScript,
55
+ } from "./vendor-web.ts";
56
+ export {
57
+ emitSearchAssets,
58
+ type EmitSearchAssetsOptions,
59
+ type EmitSearchAssetsResult,
60
+ } from "./emit-search-assets.ts";
package/src/search.ts ADDED
@@ -0,0 +1,131 @@
1
+ import type { EmbeddingProvider } from "./embeddings.ts";
2
+ import { QUERY_PREFIX } from "./embeddings.ts";
3
+ import type { ChunkRow, FtsHit, IndexStore, MetaFilter, VecHit } from "./store.ts";
4
+
5
// Default `k` for `rrfFuse`: a hit at zero-based rank r contributes 1 / (k + r + 1).
+ export const RRF_K = 60;
6
+
7
/**
 * Separators used to cut a query into search terms: runs of Hiragana
 * (U+3040-309F), or runs of whitespace / punctuation. Punctuation covers the
 * ideographic comma, full stop and middle dot (U+3001, U+3002, U+30FB), ASCII
 * `,.:;!?()[]`, their full-width forms (U+FF0C, U+FF0E, U+FF1A, U+FF1B,
 * U+FF01, U+FF1F, U+FF08, U+FF09) and the CJK brackets U+300C-3011.
 */
const FTS_SEPARATORS =
  /[\u3040-\u309F]+|[\s\u3001\u3002\u30FB,.:;!?()\uFF0C\uFF0E\uFF1A\uFF1B\uFF01\uFF1F\uFF08\uFF09\u300C\u300D\u300E\u300F\u3010\u3011\[\]]+/;

/**
 * Turns free text into a full-text-search query string.
 *
 * The text is split on `FTS_SEPARATORS`; double quotes are stripped from each
 * piece, pieces shorter than two characters are dropped, and the rest are
 * emitted as quoted phrases joined with ` OR `.
 *
 * @param query Raw user query.
 * @returns The OR-joined phrase query, or -- when no piece survives -- the
 *   original query with double quotes replaced by spaces and trimmed.
 */
export function buildFtsQuery(query: string): string {
  const terms = query
    .split(FTS_SEPARATORS)
    .map((piece) => piece.replace(/"/g, "").trim())
    .filter((piece) => piece.length >= 2);
  if (terms.length === 0) return query.replace(/"/g, " ").trim();
  return terms.map((term) => `"${term}"`).join(" OR ");
}
15
+
16
/**
 * Combines several ranked id lists into one score per id.
 *
 * Each appearance of an id at zero-based position `p` adds `1 / (k + p + 1)`
 * to that id's total.
 *
 * @param rankings Ranked id lists, best first.
 * @param k Damping constant; defaults to `RRF_K`.
 * @returns Map from id to accumulated score, in first-seen order.
 */
export function rrfFuse(rankings: number[][], k: number = RRF_K): Map<number, number> {
  const totals = new Map<number, number>();
  for (const ranking of rankings) {
    for (const [position, id] of ranking.entries()) {
      const prior = totals.get(id) ?? 0;
      totals.set(id, prior + 1 / (k + position + 1));
    }
  }
  return totals;
}
26
+
27
/**
 * Builds a short single-line excerpt of `text`.
 *
 * Whitespace runs are collapsed to single spaces. Text that fits within `max`
 * UTF-16 code units is returned whole. Otherwise a window of at most `max`
 * code units is taken; when the trimmed query occurs verbatim, the window
 * starts `floor(max / 4)` code units before the first occurrence. An ellipsis
 * marks each side that was cut.
 *
 * Window edges never fall between the two halves of a surrogate pair, so the
 * excerpt cannot begin or end with half of an emoji or supplementary-plane
 * character; the edge moves back by one code unit instead.
 *
 * @param text Source text.
 * @param query Query to centre on (exact, case-sensitive match).
 * @param max Maximum excerpt length in UTF-16 code units, excluding ellipses.
 */
export function makeSnippet(text: string, query: string, max: number = 160): string {
  const flat = text.replace(/\s+/g, " ").trim();
  if (flat.length <= max) return flat;

  // True when cutting at index `i` would separate a high surrogate from its low surrogate.
  const splitsPair = (i: number): boolean => {
    if (i <= 0 || i >= flat.length) return false;
    const before = flat.charCodeAt(i - 1);
    const after = flat.charCodeAt(i);
    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  };

  const needle = query.trim();
  let start = 0;
  if (needle) {
    const idx = flat.indexOf(needle);
    if (idx >= 0) start = Math.max(0, idx - Math.floor(max / 4));
  }
  if (splitsPair(start)) start -= 1;

  let end = Math.min(flat.length, start + max);
  if (splitsPair(end)) end -= 1;

  const body = flat.slice(start, end);
  return (start > 0 ? "…" : "") + body + (end < flat.length ? "…" : "");
}
40
+
41
/**
 * Compares the model recorded in index metadata with the model in use.
 *
 * A field is only compared when the metadata holds a non-empty value for it:
 * `meta.dim` is converted with `Number` and compared to `dim`, `meta.model_id`
 * is compared to `modelId`.
 *
 * @returns A comma-separated description of the differences, or `null` when
 *   there are none.
 */
export function checkModelMismatch(
  meta: Record<string, string>,
  modelId: string,
  dim: number,
): string | null {
  const dimDiffers = Boolean(meta.dim) && Number(meta.dim) !== dim;
  const modelDiffers = Boolean(meta.model_id) && meta.model_id !== modelId;

  const problems = [
    ...(dimDiffers ? [`dim index=${meta.dim} runtime=${dim}`] : []),
    ...(modelDiffers ? [`model index=${meta.model_id} runtime=${modelId}`] : []),
  ];
  return problems.length > 0 ? problems.join(", ") : null;
}
55
+
56
// Options shared by `searchFts`, `searchHybrid` and `search`.
+ export interface SearchOptions {
57
// Maximum number of results; both search functions default it to 10.
+ readonly k?: number;
58
// Metadata filter passed to the store; defaults to an empty object.
+ readonly filter?: MetaFilter;
59
// When truthy, `search` uses `searchFts` even if embeddings and vectors are available.
+ readonly ftsOnly?: boolean;
60
+ }
61
+
62
// A stored chunk row annotated with ranking information.
+ export interface SearchResult extends ChunkRow {
63
// `searchFts` sets 1 / (rank + 1); `searchHybrid` sets the fused score from `rrfFuse`.
+ readonly score: number;
64
// Excerpt produced by `makeSnippet` from the row text and the query.
+ readonly snippet: string;
65
+ }
66
+
67
/**
 * Full-text-only search.
 *
 * The query is rewritten with `buildFtsQuery` and sent to the store. Any
 * exception thrown while building or running the query is deliberately
 * swallowed and treated as "no hits" (best effort).
 *
 * @param store Index to search.
 * @param query Raw user query.
 * @param opts `k` defaults to 10 and `filter` to `{}`.
 * @returns Hits in store order, scored `1 / (rank + 1)`, each with a snippet.
 */
export function searchFts(
  store: IndexStore,
  query: string,
  opts: SearchOptions = {},
): SearchResult[] {
  const limit = opts.k ?? 10;
  const metaFilter = opts.filter ?? {};

  const runFts = (): FtsHit[] => {
    try {
      return store.ftsSearch(buildFtsQuery(query), limit, metaFilter);
    } catch {
      return [];
    }
  };

  const results: SearchResult[] = [];
  runFts().forEach((row, rank) => {
    results.push({
      ...row,
      score: 1 / (rank + 1),
      snippet: makeSnippet(row.text, query),
    });
  });
  return results;
}
86
+
87
/**
 * Hybrid search: vector nearest-neighbour hits and full-text hits are fused
 * with `rrfFuse`.
 *
 * The query is embedded with `QUERY_PREFIX` prepended. Both retrievals ask for
 * `max(k * 5, 50)` candidates. A full-text failure is swallowed and treated as
 * "no hits"; embedding or vector-search errors propagate.
 *
 * @param store Index to search.
 * @param embeddings Provider used to embed the query.
 * @param query Raw user query.
 * @param opts `k` defaults to 10 and `filter` to `{}`.
 * @returns Up to `k` rows ordered by fused score, highest first, each with a snippet.
 */
export async function searchHybrid(
  store: IndexStore,
  embeddings: EmbeddingProvider,
  query: string,
  opts: SearchOptions = {},
): Promise<SearchResult[]> {
  const limit = opts.k ?? 10;
  const metaFilter = opts.filter ?? {};
  const candidates = Math.max(limit * 5, 50);

  const vector = await embeddings.embed(QUERY_PREFIX + query);
  const semantic: VecHit[] = store.vecKnn(vector, candidates, metaFilter);

  let lexical: FtsHit[];
  try {
    lexical = store.ftsSearch(buildFtsQuery(query), candidates, metaFilter);
  } catch {
    lexical = [];
  }

  // Later entries win, so a row found by both retrievals keeps its full-text version.
  const rows = new Map<number, ChunkRow>(
    [...semantic, ...lexical].map((hit): [number, ChunkRow] => [hit.id, hit]),
  );

  const scores = rrfFuse([semantic.map((hit) => hit.id), lexical.map((hit) => hit.id)]);
  const top = Array.from(scores)
    .sort(([, left], [, right]) => right - left)
    .slice(0, limit);

  const results: SearchResult[] = [];
  for (const [id, score] of top) {
    const row = rows.get(id);
    if (row) {
      results.push({ ...row, score, snippet: makeSnippet(row.text, query) });
    }
  }
  return results;
}
119
+
120
/**
 * Entry point that picks the search strategy.
 *
 * Hybrid search is used only when `opts.ftsOnly` is falsy, an embedding
 * provider was supplied, and the store reports that it has vectors. In every
 * other case the full-text search runs.
 *
 * @param store Index to search.
 * @param embeddings Embedding provider, or `null` to force full-text search.
 * @param query Raw user query.
 * @param opts Forwarded unchanged to the chosen search function.
 */
export async function search(
  store: IndexStore,
  embeddings: EmbeddingProvider | null,
  query: string,
  opts: SearchOptions = {},
): Promise<SearchResult[]> {
  // `hasVectors` is consulted only after the two cheaper checks pass.
  const hybridBlocked = opts.ftsOnly || embeddings === null || !store.hasVectors();
  if (hybridBlocked || !embeddings) {
    return searchFts(store, query, opts);
  }
  return searchHybrid(store, embeddings, query, opts);
}