@sorane/search 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ // Browser-side search (sorane SSG).
2
+ //
3
+ // FTS mount: <div data-search data-mode="fts" data-index=".../assets/search-index.json">
4
+ // Hybrid: + data-model-base, data-lib-base
5
+
6
// Model identifier handed to the transformers.js feature-extraction pipeline.
const MODEL_ID = "ruri-v3-30m";
// Text prepended to every query string before it is embedded.
const QUERY_PREFIX = "検索クエリ: ";
// Number of results requested from a search when no other limit is given.
const TOP_K = 10;
9
+
10
/**
 * Break a raw query into search terms.
 *
 * The query is cut at runs of hiragana and at runs of whitespace/punctuation;
 * pieces shorter than two characters are discarded. When nothing survives,
 * the trimmed query itself is used as the single term (or no term at all when
 * the query is blank).
 */
function tokenizeQuery(query) {
  const terms = [];
  for (const piece of query.split(/[぀-ゟ]+|[\s、。・,.::;;!!??()()「」『』【】\[\]]+/)) {
    const term = piece.trim();
    if (term.length >= 2) terms.push(term);
  }
  if (terms.length > 0) return terms;

  // No usable piece: fall back to the whole query so short queries still search.
  const whole = query.trim();
  return whole === "" ? [] : [whole];
}
21
+
22
/**
 * Decode a base64 string into a flat Int8Array of vector components.
 *
 * Each decoded byte is reinterpreted as a signed 8-bit value. Throws when the
 * number of components is not a whole multiple of `dim`.
 */
function decodeVectors(b64, dim) {
  const raw = atob(b64);
  // atob yields one character per byte, so the char code is the byte value.
  const bytes = Uint8Array.from(raw, (ch) => ch.charCodeAt(0));
  const signed = new Int8Array(bytes.buffer);
  if (signed.length % dim !== 0) {
    throw new Error(`vector length ${signed.length} is not a multiple of dim ${dim}`);
  }
  return signed;
}
30
+
31
/**
 * Return the `k` best-scoring vectors for `query`, best first.
 *
 * `vectors` is a flat array holding consecutive vectors of `dim` components;
 * the score of a vector is its dot product with `query`.
 *
 * @param query   Query vector with at least `dim` components.
 * @param vectors Flat array of stored vectors.
 * @param dim     Number of components per vector.
 * @param k       Maximum number of results.
 * @param allow   Optional predicate on the vector index; vectors for which it
 *                returns a falsy value are skipped.
 * @returns Array of `{ index, score }` sorted by descending score. Empty when
 *          `k` or `dim` is not a positive number.
 */
function topK(query, vectors, dim, k, allow) {
  // Without these guards k <= 0 dereferences best[0] on an empty list and
  // dim === 0 makes the vector count Infinity, so the loop never terminates.
  if (!(k > 0) || !(dim > 0)) return [];

  const n = vectors.length / dim;
  // `best` is kept sorted by ascending score, so best[0] is the weakest kept hit.
  const best = [];
  for (let i = 0; i < n; i++) {
    if (allow && !allow(i)) continue;
    let s = 0;
    const off = i * dim;
    for (let j = 0; j < dim; j++) s += query[j] * vectors[off + j];
    if (best.length < k) {
      best.push({ index: i, score: s });
      best.sort((a, b) => a.score - b.score);
    } else if (s > best[0].score) {
      // Evict the weakest hit and restore the ascending order.
      best[0] = { index: i, score: s };
      best.sort((a, b) => a.score - b.score);
    }
  }
  return best.sort((a, b) => b.score - a.score);
}
49
+
50
/**
 * Substring-based full-text search over `index.chunks`.
 *
 * For every query term a chunk earns 1 point when the term occurs anywhere in
 * its text, title, heading path or tags, 2 more points when it occurs in the
 * title, and 1 more when it occurs in the heading path. Matching ignores case.
 *
 * @param index Search index object with a `chunks` array.
 * @param query Raw query string.
 * @param type  When truthy, only chunks whose `doc_type` equals it are searched.
 * @param k     Maximum number of hits to return.
 * @returns Up to `k` `{ index, score, chunk }` hits, highest score first.
 */
function ftsSearch(index, query, type, k = TOP_K) {
  const needles = tokenizeQuery(query).map((term) => term.toLowerCase());
  if (needles.length === 0) return [];

  const matches = [];
  for (const [position, chunk] of index.chunks.entries()) {
    if (type && chunk.doc_type !== type) continue;

    const everything = [chunk.text || "", chunk.title || "", chunk.heading_path || "", chunk.tags || ""]
      .join(" ")
      .toLowerCase();
    const title = (chunk.title || "").toLowerCase();
    const headingPath = (chunk.heading_path || "").toLowerCase();

    let score = 0;
    for (const needle of needles) {
      if (everything.includes(needle)) score += 1;
      if (title.includes(needle)) score += 2;
      if (headingPath.includes(needle)) score += 1;
    }
    if (score > 0) matches.push({ index: position, score, chunk });
  }

  matches.sort((a, b) => b.score - a.score);
  return matches.slice(0, k);
}
76
+
77
/**
 * Build a short excerpt of `text` for display in a result list.
 *
 * Whitespace runs are collapsed to single spaces. Text that fits in `max`
 * characters is returned whole; otherwise a window of `max` characters is
 * taken, positioned a quarter of the window before the first occurrence of
 * the query when it occurs, and an ellipsis marks each truncated side.
 *
 * @param text  Source text.
 * @param query Query string used to position the window.
 * @param max   Maximum excerpt length, not counting the ellipses.
 */
function makeSnippet(text, query, max = 160) {
  const flat = text.replace(/\s+/g, " ").trim();
  if (flat.length <= max) return flat;

  // Normalise the query the same way as the text so inner whitespace runs
  // cannot prevent a match.
  const q = query.replace(/\s+/g, " ").trim();
  let start = 0;
  if (q) {
    let idx = flat.indexOf(q);
    if (idx < 0) {
      // Search matching is case-insensitive, so retry ignoring case. The
      // lower-cased index is only usable when lower-casing kept the length.
      const lowered = flat.toLowerCase();
      if (lowered.length === flat.length) idx = lowered.indexOf(q.toLowerCase());
    }
    if (idx >= 0) start = Math.max(0, idx - Math.floor(max / 4));
  }
  const end = Math.min(flat.length, start + max);
  const body = flat.slice(start, end);
  return (start > 0 ? "…" : "") + body + (end < flat.length ? "…" : "");
}
90
+
91
/**
 * Wire up one search widget.
 *
 * Reads its configuration from attributes on `root` (`data-index`,
 * `data-mode`, `data-model-base`, `data-lib-base`), and runs a search each
 * time the widget's form is submitted. Does nothing when the form, the input
 * or the index URL is missing.
 *
 * @param root Element carrying the `data-search` mount attributes.
 */
function setup(root) {
  const form = root.querySelector(".search-form");
  const input = root.querySelector(".search-input");
  const facet = root.querySelector(".search-facet");
  const status = root.querySelector("[data-search-status]");
  const resultsEl = root.querySelector("[data-search-results]");
  const indexUrl = root.getAttribute("data-index");
  const mode = root.getAttribute("data-mode") || "fts";
  const modelBase = root.getAttribute("data-model-base");
  const libBase = root.getAttribute("data-lib-base");
  if (!form || !input || !indexUrl) return;

  let index = null; // parsed search index, fetched on first use
  let embed = null; // query embedding function, created on first hybrid search
  let busy = false; // true while a search is running

  // Show a message in the status element, when the widget has one.
  const setStatus = (msg) => {
    if (!status) return;
    status.textContent = msg;
  };

  // Fetch and decode the index once; later calls return the cached object.
  async function loadIndex() {
    if (index) return index;
    setStatus("検索インデックスを読み込み中…");
    const res = await fetch(indexUrl);
    if (!res.ok) throw new Error(`failed to fetch search-index.json (${res.status})`);
    const json = await res.json();
    // Only treat the index as hybrid when it actually ships embeddings.
    const resolvedMode = json.mode === "hybrid" && json.embeddings ? "hybrid" : "fts";
    if (resolvedMode === "hybrid") {
      const dim = json.embeddings.dim;
      index = { ...json, mode: "hybrid", vectors: decodeVectors(json.embeddings.vectors_b64, dim) };
    } else {
      index = { ...json, mode: "fts" };
    }
    return index;
  }

  // Import transformers.js and build the query embedder once.
  async function loadEmbedder() {
    if (embed) return embed;
    setStatus("埋め込みモデルを読み込み中…(初回のみ)");
    const libHref = new URL(libBase, document.baseURI).href;
    // The module and wasm paths are appended to this URL, so it must end in "/".
    const libUrl = libHref.endsWith("/") ? libHref : `${libHref}/`;
    const tjs = await import(`${libUrl}transformers.web.js`);
    const { env, pipeline } = tjs;
    env.useBrowserCache = false;
    const isRemoteBase = /^https?:\/\//i.test(modelBase || "");
    if (isRemoteBase) {
      env.allowLocalModels = false;
      env.allowRemoteModels = true;
      env.remoteHost = modelBase;
      env.remotePathTemplate = "{model}/";
    } else {
      env.allowRemoteModels = false;
      env.allowLocalModels = true;
      env.localModelPath = modelBase;
    }
    const onnxWasm = env.backends?.onnx?.wasm;
    if (onnxWasm) onnxWasm.wasmPaths = libUrl;
    const extractor = await pipeline("feature-extraction", MODEL_ID, { dtype: "q8" });
    embed = async (query) => {
      const out = await extractor(QUERY_PREFIX + query, { pooling: "mean", normalize: true });
      return new Float32Array(out.data);
    };
    return embed;
  }

  // Replace the result list with one <li> per hit and update the status line.
  function render(hits, query) {
    resultsEl.replaceChildren();
    if (hits.length === 0) {
      setStatus("該当なし。語を変えてお試しください。");
      return;
    }
    setStatus(`${hits.length} 件`);
    for (const { chunk, score } of hits) {
      const li = document.createElement("li");
      li.className = "search-hit";

      const a = document.createElement("a");
      a.className = "search-hit-title";
      const anchor = chunk.heading_slug ? `#${chunk.heading_slug}` : "";
      // Chunks without an explicit url link to the .html sibling of their source;
      // a chunk lacking both must not abort rendering of the remaining hits.
      const page = chunk.url || (chunk.source || "").replace(/\.md$/i, ".html");
      a.href = `${page}${anchor}`;
      a.textContent = chunk.heading_path || chunk.title || chunk.source;
      li.appendChild(a);

      const meta = document.createElement("p");
      meta.className = "search-hit-meta";
      // Fractional (vector) scores get three decimals; integer scores print as-is.
      const scoreLabel = typeof score === "number" && score % 1 !== 0 ? score.toFixed(3) : String(score);
      meta.textContent = `${chunk.doc_type || "-"} · ${chunk.source} · ${scoreLabel}`;
      li.appendChild(meta);

      const snippet = chunk.snippet || (chunk.text ? makeSnippet(chunk.text, query) : "");
      if (snippet) {
        const snip = document.createElement("p");
        snip.className = "search-hit-snippet";
        snip.textContent = snippet;
        li.appendChild(snip);
      }
      resultsEl.appendChild(li);
    }
  }

  // Keyword search, optionally restricted to the doc type chosen in the facet.
  async function runFts(query) {
    const idx = await loadIndex();
    setStatus("検索中…");
    const type = facet ? facet.value : "";
    const ranked = ftsSearch(idx, query, type);
    render(
      ranked.map((r) => ({ chunk: r.chunk, score: r.score })),
      query,
    );
  }

  // Vector search: embed the query and rank the stored vectors by dot product.
  async function runHybrid(query) {
    const idx = await loadIndex();
    const e = await loadEmbedder();
    setStatus("検索中…");
    const qv = await e(query);
    const type = facet ? facet.value : "";
    const allow = type ? (i) => idx.chunks[i].doc_type === type : null;
    const ranked = topK(qv, idx.vectors, idx.embeddings.dim, TOP_K, allow);
    const scale = idx.embeddings.scale || 1;
    render(
      ranked.map((r) => ({ chunk: idx.chunks[r.index], score: r.score / scale })),
      query,
    );
  }

  // Run one search; submissions made while another search is running are ignored.
  async function run(query) {
    if (busy || !query.trim()) return;
    busy = true;
    try {
      const idx = await loadIndex();
      const wantsHybrid = (idx.mode || mode) === "hybrid";
      // The embedder cannot be loaded without both base URLs. A mount that
      // lacks them (a plain FTS mount pointed at a hybrid index) falls back to
      // keyword search instead of failing on a URL built from `null`.
      const useHybrid = wantsHybrid && Boolean(libBase) && Boolean(modelBase);
      if (useHybrid) await runHybrid(query);
      else await runFts(query);
    } catch (err) {
      setStatus(`エラー: ${err && err.message ? err.message : err}`);
    } finally {
      busy = false;
    }
  }

  form.addEventListener("submit", (e) => {
    e.preventDefault();
    run(input.value);
  });
}
238
+
239
// Initialise every search mount present in the document.
document.querySelectorAll("[data-search]").forEach((root) => {
  setup(root);
});
package/package.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "name": "@sorane/search",
3
+ "version": "0.2.0",
4
+ "description": "FTS and hybrid search indexing for sorane sites",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/masanork/sorane.git",
10
+ "directory": "packages/search"
11
+ },
12
+ "homepage": "https://sorane.dev",
13
+ "bugs": "https://github.com/masanork/sorane/issues",
14
+ "files": [
15
+ "src",
16
+ "assets"
17
+ ],
18
+ "exports": {
19
+ ".": "./src/index.ts"
20
+ },
21
+ "engines": {
22
+ "node": ">=23.6"
23
+ },
24
+ "publishConfig": {
25
+ "access": "public"
26
+ },
27
+ "dependencies": {
28
+ "@huggingface/transformers": "^4.2.0",
29
+ "@sorane/okf": "0.2.0",
30
+ "better-sqlite3": "^12.11.1",
31
+ "remark-gfm": "^4.0.1",
32
+ "remark-parse": "^11.0.0",
33
+ "sqlite-vec": "^0.1.9",
34
+ "unified": "^11.0.0"
35
+ },
36
+ "devDependencies": {
37
+ "@types/better-sqlite3": "^7.6.13",
38
+ "@types/mdast": "^4.0.4"
39
+ }
40
+ }
@@ -0,0 +1,115 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { relative, resolve } from "node:path";
3
+ import { chunkDocument } from "./chunker.ts";
4
+ import type { EmbeddingProvider } from "./embeddings.ts";
5
+ import { DOC_PREFIX } from "./embeddings.ts";
6
+ import { hashContent, planIncremental } from "./incremental.ts";
7
+ import { IndexStore } from "./store.ts";
8
+ import { walkMarkdown } from "./walk.ts";
9
+
10
/** Options accepted by `buildSearchIndex`. */
export interface BuildIndexOptions {
  /** Directory walked for Markdown files; resolved to an absolute path before use. */
  readonly contentDir: string;
  /** Location passed to `IndexStore` when opening the index. */
  readonly indexPath: string;
  /** When true, the store is opened fresh and previously stored source hashes are ignored. */
  readonly force?: boolean;
  /** Embedding provider; a non-null value selects hybrid mode, otherwise the build is FTS-only. */
  readonly embeddings?: EmbeddingProvider | null;
  /** Receives progress messages while the build runs. */
  readonly onProgress?: (message: string) => void;
}
17
+
18
/** Summary returned by `buildSearchIndex`. */
export interface BuildIndexResult {
  /** Number of files the incremental plan classified as added. */
  readonly added: number;
  /** Number of files the incremental plan classified as changed. */
  readonly changed: number;
  /** Number of files the incremental plan classified as removed. */
  readonly removed: number;
  /** Number of files the incremental plan classified as unchanged. */
  readonly unchanged: number;
  /** Chunk count reported by the store after the build. */
  readonly chunks: number;
  /** FTS row count reported by the store after the build. */
  readonly fts: number;
  /** Vector row count reported by the store after the build. */
  readonly vec: number;
  /** "hybrid" when an embedding provider was supplied, otherwise "fts-only". */
  readonly mode: "hybrid" | "fts-only";
}
28
+
29
/**
 * Build or incrementally update the search index for a content directory.
 *
 * Every Markdown file under `opts.contentDir` is hashed and compared with the
 * hashes stored in the index. Removed files are deleted from the store; added
 * and changed files are re-chunked (and embedded when an embedding provider
 * is supplied) and written back together with their new hash.
 *
 * @param opts Build options, see `BuildIndexOptions`.
 * @returns Counts describing the incremental plan and the resulting index.
 * @throws Error when the store's chunk count disagrees with its FTS count, or
 *         with its vector count in hybrid mode.
 */
export async function buildSearchIndex(opts: BuildIndexOptions): Promise<BuildIndexResult> {
  const contentDir = resolve(opts.contentDir);
  const log = opts.onProgress ?? (() => {});
  const embeddings = opts.embeddings ?? null;
  const hybrid = embeddings !== null;
  const store = new IndexStore(opts.indexPath, {
    fresh: opts.force === true,
    // FTS-only builds still hand a dimension to the store; 256 is the fixed default.
    dim: embeddings ? embeddings.dimensions : 256,
  });

  // Everything that touches the open store runs inside try/finally so the
  // store is closed even when reading, chunking or embedding throws.
  try {
    const files = walkMarkdown(contentDir);
    const disk = new Map<string, string>();
    const content = new Map<string, string>();
    for (const abs of files) {
      // Keys are content-relative paths with forward slashes on every platform.
      const rel = relative(contentDir, abs).replace(/\\/g, "/");
      const text = readFileSync(abs, "utf8");
      disk.set(rel, hashContent(text));
      content.set(rel, text);
    }

    // A forced build compares against an empty hash map instead of the stored one.
    const indexed = opts.force ? new Map<string, string>() : store.sourceHashes();
    const plan = planIncremental(disk, indexed);

    log(
      `incremental: added ${plan.added.length} / changed ${plan.changed.length} / ` +
        `removed ${plan.removed.length} / unchanged ${plan.unchanged.length} (${files.length} files)` +
        (hybrid ? " [hybrid]" : " [fts-only]"),
    );

    for (const rel of plan.removed) {
      store.deleteBySource(rel);
      log(`removed ${rel}`);
    }

    const targets = [...plan.added, ...plan.changed].sort();
    let totalChunks = 0;
    for (let i = 0; i < targets.length; i++) {
      const rel = targets[i]!;
      const text = content.get(rel)!;
      const sha = disk.get(rel)!;
      // Drop any existing rows for this source before re-adding its chunks.
      store.deleteBySource(rel);
      const chunks = chunkDocument(text, rel);
      if (chunks.length === 0) {
        // Record the hash anyway so the file counts as indexed next time.
        store.setSourceHash(rel, sha);
        log(`[${i + 1}/${targets.length}] ${rel}: 0 chunks`);
        continue;
      }
      const vectors = embeddings
        ? await embeddings.embedBatch(chunks.map((c) => DOC_PREFIX + c.text))
        : undefined;
      store.addChunks(chunks, vectors);
      store.setSourceHash(rel, sha);
      totalChunks += chunks.length;
      log(`[${i + 1}/${targets.length}] ${rel}: ${chunks.length} chunks (${totalChunks} total)`);
    }

    if (embeddings) {
      store.setMeta({
        modelId: embeddings.modelId ?? "ruri-v3-30m",
        dim: embeddings.dimensions,
        quant: embeddings.quant ?? "q8",
        modelSha256: embeddings.modelSha256 ?? "",
      });
    } else {
      store.setMeta();
    }

    const counts = store.counts();

    if (counts.chunks !== counts.fts) {
      throw new Error(`index mismatch: chunks=${counts.chunks} fts=${counts.fts}`);
    }
    if (hybrid && counts.chunks !== counts.vec) {
      throw new Error(`index mismatch: chunks=${counts.chunks} vec=${counts.vec}`);
    }

    return {
      added: plan.added.length,
      changed: plan.changed.length,
      removed: plan.removed.length,
      unchanged: plan.unchanged.length,
      chunks: counts.chunks,
      fts: counts.fts,
      vec: counts.vec,
      mode: hybrid ? "hybrid" : "fts-only",
    };
  } finally {
    store.close();
  }
}
package/src/chunker.ts ADDED
@@ -0,0 +1,174 @@
1
+ import { unified } from "unified";
2
+ import remarkParse from "remark-parse";
3
+ import remarkGfm from "remark-gfm";
4
+ import type { Root, RootContent, Heading } from "mdast";
5
+ import { extract } from "@sorane/okf";
6
+ import { parseYaml } from "@sorane/okf";
7
+ import { SlugLedger } from "./heading-slug.ts";
8
+
9
/** Section bodies and split parts shorter than this string length are not emitted as chunks. */
export const MIN_BODY = 50;
/** Section bodies longer than this string length are split at paragraph boundaries. */
export const MAX_BODY = 800;
11
+
12
/** One searchable chunk produced from a document by `chunkDocument`. */
export interface Chunk {
  /** Relative path of the document the chunk came from. */
  readonly source: string;
  /** Zero-based position of the chunk within its document. */
  readonly chunkIndex: number;
  /** Plain text of the chunk. */
  readonly text: string;
  /** Document title and enclosing headings joined with " / ". */
  readonly headingPath: string;
  /** Slug of the section heading, or "" for text outside any level 2/3 heading. */
  readonly headingSlug: string;
  /** Frontmatter `type` value as a string ("" when absent). */
  readonly docType: string;
  /** Frontmatter `title` value as a string ("" when absent). */
  readonly title: string;
  /** Frontmatter `timestamp` value as a string ("" when absent). */
  readonly timestamp: string;
  /** Slugified frontmatter tags joined with ",". */
  readonly tags: string;
}
23
+
24
/** Frontmatter values used while chunking a document. */
interface Meta {
  /** Frontmatter `type` as a string ("" when absent). */
  docType: string;
  /** Frontmatter `title` as a string ("" when absent). */
  title: string;
  /** Frontmatter `timestamp` as a string ("" when absent). */
  timestamp: string;
  /** Slugified tags joined with ",". */
  tags: string;
  /** True when the frontmatter sets `isSystem: true`; such documents yield no chunks. */
  skip: boolean;
}
31
+
32
/**
 * Collect the plain text of an mdast node.
 *
 * `text` and `inlineCode` nodes contribute their value; any other node
 * contributes the concatenated text of its children, or "" when it has none.
 */
function nodeToText(node: RootContent | Heading): string {
  const { type, value, children } = node as {
    type: string;
    value?: string;
    children?: RootContent[];
  };
  if (type === "text" || type === "inlineCode") return value ?? "";
  if (!children) return "";

  let text = "";
  for (const child of children) text += nodeToText(child);
  return text;
}
38
+
39
/**
 * Convert one block-level mdast node to indexable text.
 *
 * List items are rendered one per line. Code blocks, tables, raw HTML and
 * thematic breaks contribute nothing. Every other block yields its plain text.
 */
function blockToText(node: RootContent): string {
  if (node.type === "list") {
    return node.children.map((item) => nodeToText(item as RootContent)).join("\n");
  }
  const ignored =
    node.type === "code" ||
    node.type === "table" ||
    node.type === "html" ||
    node.type === "thematicBreak";
  return ignored ? "" : nodeToText(node);
}
56
+
57
/**
 * Split a section body that exceeds MAX_BODY into smaller parts.
 *
 * The body is cut at blank lines and consecutive paragraphs are packed
 * together while the packed text (including the blank-line separator) stays
 * within MAX_BODY. A single paragraph that is itself longer than MAX_BODY is
 * kept whole. Bodies within the limit are returned unchanged as one part.
 */
function splitOversized(body: string): string[] {
  if (body.length <= MAX_BODY) return [body];

  const pieces: string[] = [];
  let open: string | undefined; // part currently being filled
  for (const rawParagraph of body.split(/\n\s*\n/)) {
    const paragraph = rawParagraph.trim();
    if (paragraph === "") continue;
    if (open !== undefined && open.length + paragraph.length + 2 <= MAX_BODY) {
      open = open + "\n\n" + paragraph;
    } else {
      if (open !== undefined) pieces.push(open);
      open = paragraph;
    }
  }
  if (open !== undefined) pieces.push(open);

  // A body made only of whitespace produces no paragraph; hand it back as-is.
  return pieces.length > 0 ? pieces : [body];
}
69
+
70
/**
 * Normalise a tag into a slug: trimmed, lower-cased, whitespace runs turned
 * into hyphens, characters outside the allowed set removed, hyphen runs
 * collapsed and leading/trailing hyphens stripped.
 */
function slugifyTag(tag: string): string {
  const hyphenated = tag.trim().toLowerCase().replace(/\s+/g, "-");
  // Keep ASCII word characters, the listed Japanese/CJK ranges and hyphens.
  const allowedOnly = hyphenated.replace(/[^\w぀-ヿ㐀-鿿豈-﫿ヲ-゚-]/g, "");
  const collapsed = allowedOnly.replace(/-+/g, "-");
  return collapsed.replace(/^-+|-+$/g, "");
}
79
+
80
/**
 * Extract the chunking metadata from a document's raw frontmatter.
 *
 * Missing frontmatter, or frontmatter that parses to null/undefined, yields
 * empty strings and `skip: false`. `tags` may be an array or a single
 * string; each tag is slugified and empty slugs are dropped.
 */
function readMeta(frontmatter: string | null): Meta {
  const parsed = frontmatter ? (parseYaml(frontmatter) as Record<string, unknown>) : {};
  const fm = parsed ?? {};

  // Read a field as a string, mapping null/undefined to "".
  const field = (key: string): string => {
    const value = fm[key];
    return value == null ? "" : String(value);
  };

  let tagSlugs: string[] = [];
  const rawTags = fm.tags;
  if (Array.isArray(rawTags)) {
    tagSlugs = rawTags.map((t) => slugifyTag(String(t))).filter(Boolean);
  } else if (typeof rawTags === "string") {
    const slug = slugifyTag(rawTags);
    if (slug) tagSlugs = [slug];
  }

  return {
    docType: field("type"),
    title: field("title"),
    timestamp: field("timestamp"),
    tags: tagSlugs.join(","),
    skip: fm.isSystem === true,
  };
}
101
+
102
/**
 * Split a Markdown body into text chunks, one or more per section.
 *
 * Level 2 and level 3 headings open a new section carrying that heading; a
 * level 1 heading opens a new section without a heading. All other nodes,
 * deeper headings included, become part of the current section's body. Each
 * chunk records the section's heading path and heading slug.
 */
function chunkBody(body: string, meta: Meta): { text: string; path: string; slug: string }[] {
  interface Section {
    heading: Heading | null;
    body: RootContent[];
  }

  const tree = unified().use(remarkParse).use(remarkGfm).parse(body) as Root;
  const ledger = new SlugLedger();

  // Pass 1: group the top-level nodes into sections.
  const sections: Section[] = [];
  let open: Section = { heading: null, body: [] };
  const closeOpenSection = (): void => {
    // Sections with neither a heading nor any content are not recorded.
    if (open.heading || open.body.length) sections.push(open);
  };
  for (const node of tree.children) {
    if (node.type !== "heading" || node.depth > 3) {
      open.body.push(node);
      continue;
    }
    closeOpenSection();
    open = { heading: node.depth === 1 ? null : node, body: [] };
  }
  closeOpenSection();

  // Pass 2: turn each section into chunks.
  const chunks: { text: string; path: string; slug: string }[] = [];
  let parentH2 = "";
  for (const { heading, body: blocks } of sections) {
    const headingText = heading ? nodeToText(heading).trim() : "";
    // The slug is requested for every titled section, including ones dropped
    // below for being too short.
    const slug = headingText ? ledger.next(headingText) : "";

    const crumbs = [meta.title];
    if (heading?.depth === 2) {
      parentH2 = headingText;
      crumbs.push(headingText);
    } else if (heading?.depth === 3) {
      crumbs.push(parentH2, headingText);
    }
    const path = crumbs.filter(Boolean).join(" / ");

    const text = blocks.map(blockToText).filter(Boolean).join("\n\n").trim();
    if (text.length < MIN_BODY) continue;

    for (const part of splitOversized(text)) {
      const trimmed = part.trim();
      if (trimmed.length >= MIN_BODY) chunks.push({ text: trimmed, path, slug });
    }
  }
  return chunks;
}
150
+
151
/** True when the file name of `relPath`, ignoring a trailing `.md`, is exactly "404". */
function isNotFoundPath(relPath: string): boolean {
  const normalized = relPath.replace(/\\/g, "/");
  // Everything after the last slash, or the whole path when there is none.
  const fileName = normalized.slice(normalized.lastIndexOf("/") + 1);
  return fileName.replace(/\.md$/i, "") === "404";
}
155
+
156
/**
 * Turn one document into its list of search chunks.
 *
 * Documents flagged as system pages in their frontmatter, and files named
 * 404, produce no chunks.
 *
 * @param source  Full document text, frontmatter included.
 * @param relPath Path stored as each chunk's `source`.
 */
export function chunkDocument(source: string, relPath: string): Chunk[] {
  const parsed = extract(source);
  const meta = readMeta(parsed.frontmatter);
  if (meta.skip || isNotFoundPath(relPath)) return [];

  const { docType, title, timestamp, tags } = meta;
  const chunks: Chunk[] = [];
  for (const piece of chunkBody(parsed.body, meta)) {
    chunks.push({
      source: relPath,
      chunkIndex: chunks.length,
      text: piece.text,
      headingPath: piece.path,
      headingSlug: piece.slug,
      docType,
      title,
      timestamp,
      tags,
    });
  }
  return chunks;
}
@@ -0,0 +1,59 @@
1
+ import { existsSync, writeFileSync } from "node:fs";
2
+ import { buildSourceDisclosureMap } from "./disclosure-map.ts";
3
+ import { buildFtsWebIndex, buildWebIndex, defaultSourceUrl } from "./web-export.ts";
4
+
5
/** Kind of web index to derive: keyword-only ("fts") or keyword plus vectors ("hybrid"). */
export type WebSearchMode = "fts" | "hybrid";
6
+
7
/** Outcome of `deriveWebIndex`. */
export interface DeriveResult {
  /** False when the database is missing or holds no chunks, so no file was written. */
  readonly written: boolean;
  /** Number of chunks in the written index (0 when nothing was written). */
  readonly chunks: number;
  /** Size reported for the serialised JSON (0 when nothing was written). */
  readonly bytes: number;
  /** Mode of the index that was written; absent when nothing was written. */
  readonly mode?: WebSearchMode;
}
13
+
14
/**
 * Export the search database at `dbPath` as a JSON web index at `outPath`.
 *
 * A hybrid index is written only when `mode` is "hybrid" and the store holds
 * vectors; otherwise an FTS index is written. Nothing is written when the
 * database file does not exist or contains no chunks.
 *
 * @param dbPath      Path of the search database.
 * @param outPath     Destination of the JSON file.
 * @param sourceToUrl Maps a chunk's source path to its page URL.
 * @param mode        Requested index mode.
 * @param opts        `contentDir` enables the source disclosure map (unless
 *                    `machineReadable` is false); `machineReadable` is passed
 *                    on to the index builders.
 * @returns What was written; `bytes` is the UTF-8 size of the JSON file.
 */
export async function deriveWebIndex(
  dbPath: string,
  outPath: string,
  sourceToUrl: (source: string) => string = defaultSourceUrl,
  mode: WebSearchMode = "fts",
  opts?: {
    readonly contentDir?: string;
    readonly machineReadable?: boolean;
  },
): Promise<DeriveResult> {
  if (!existsSync(dbPath)) return { written: false, chunks: 0, bytes: 0 };
  // Loaded lazily so the store module is only imported when a database exists.
  const { IndexStore } = await import("./store.ts");
  const store = new IndexStore(dbPath);
  try {
    const counts = store.counts();
    if (counts.chunks === 0) return { written: false, chunks: 0, bytes: 0 };

    const { rows, vectors } = store.exportAll();
    const disclosureMap =
      opts?.contentDir && opts.machineReadable !== false
        ? buildSourceDisclosureMap(
            opts.contentDir,
            rows.map((r) => r.source),
          )
        : undefined;
    const exportOpts = {
      disclosureMap,
      machineReadable: opts?.machineReadable,
    };

    // Serialise an index, write it, and describe the result.
    const emit = (
      index: { readonly chunks: { readonly length: number } },
      written: WebSearchMode,
    ): DeriveResult => {
      const json = JSON.stringify(index);
      writeFileSync(outPath, json, "utf8");
      return {
        written: true,
        chunks: index.chunks.length,
        // json.length counts UTF-16 code units and under-reports non-ASCII
        // content; report the size of the UTF-8 file actually written.
        bytes: Buffer.byteLength(json, "utf8"),
        mode: written,
      };
    };

    if (mode === "hybrid" && store.hasVectors()) {
      const meta = store.readMeta();
      return emit(buildWebIndex(rows, vectors, meta, sourceToUrl, exportOpts), "hybrid");
    }
    return emit(buildFtsWebIndex(rows, sourceToUrl, exportOpts), "fts");
  } finally {
    store.close();
  }
}
@@ -0,0 +1,42 @@
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import { resolveDigitalSourceType } from "@sorane/okf";
4
+
5
// Matches a leading `---` fenced frontmatter block (LF or CRLF) and captures its contents.
const FRONTMATTER_RE = /^---\r?\n([\s\S]*?)\r?\n---/;
6
+
7
/**
 * Read a single-line scalar `key: value` from a frontmatter block.
 *
 * Only the first line that starts with `key:` is considered. Surrounding
 * single or double quotes are removed, and a trailing ` # comment` is dropped.
 *
 * @param block Frontmatter text without the `---` fences.
 * @param key   Key to look up; matched literally.
 * @returns The value, or undefined when the key is absent or has no value on
 *          its own line.
 */
function parseFrontmatterScalar(block: string, key: string): string | undefined {
  // Escape the key so characters such as "." are not read as regex syntax.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // [ \t]* rather than \s*: \s also matches newlines, which made an empty
  // value swallow the following line.
  const m = block.match(new RegExp(`^${escapedKey}:[ \\t]*(.+)$`, "m"));
  if (!m) return undefined;
  const raw = m[1]!.trim();
  if (raw.startsWith("#")) return undefined; // the line holds only a comment

  const quote = raw[0];
  if (quote === '"' || quote === "'") {
    // Quoted value, optionally followed by a comment: keep the quoted part.
    const close = raw.indexOf(quote, 1);
    if (close > 0 && /^\s*(#.*)?$/.test(raw.slice(close + 1))) return raw.slice(1, close);
    // Otherwise strip the outermost pair of matching quotes.
    if (raw.length >= 2 && raw.endsWith(quote)) return raw.slice(1, -1);
  }

  // Unquoted value: a "#" preceded by whitespace starts a comment.
  const value = raw.replace(/\s+#.*$/, "").trim();
  return value.length > 0 ? value : undefined;
}
20
+
21
/**
 * Build the disclosure map (source path to IPTC URI) for source paths inside
 * `contentDir`.
 *
 * Each distinct source is read once. Sources that do not exist, have no
 * frontmatter, declare no `digitalSourceType`, or declare one that cannot be
 * resolved are left out of the map.
 */
export function buildSourceDisclosureMap(
  contentDir: string,
  sources: readonly string[],
): ReadonlyMap<string, string> {
  const uriBySource = new Map<string, string>();
  // The Set visits each distinct source once, in first-seen order.
  for (const source of new Set(sources)) {
    const filePath = join(contentDir, source);
    if (!existsSync(filePath)) continue;

    const frontmatter = FRONTMATTER_RE.exec(readFileSync(filePath, "utf8"));
    const declared = frontmatter
      ? parseFrontmatterScalar(frontmatter[1]!, "digitalSourceType")
      : undefined;
    if (!declared) continue;

    const resolved = resolveDigitalSourceType(declared);
    if (resolved) uriBySource.set(source, resolved.uri);
  }
  return uriBySource;
}