@astrofoundry/grimoire 3.13.0 → 3.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/admin-HA6FNUV4.js +1516 -0
- package/dist/admin-HA6FNUV4.js.map +7 -0
- package/dist/chunk-BRS6X3AE.js +12 -0
- package/dist/chunk-BRS6X3AE.js.map +7 -0
- package/dist/cli.js +255 -713
- package/dist/cli.js.map +7 -1
- package/package.json +11 -12
- package/dist/apikey.d.ts +0 -5
- package/dist/apikey.d.ts.map +0 -1
- package/dist/apikey.js +0 -84
- package/dist/apikey.js.map +0 -1
- package/dist/chunker.d.ts +0 -7
- package/dist/chunker.d.ts.map +0 -1
- package/dist/chunker.js +0 -158
- package/dist/chunker.js.map +0 -1
- package/dist/cli.d.ts +0 -3
- package/dist/cli.d.ts.map +0 -1
- package/dist/config.d.ts +0 -23
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js +0 -89
- package/dist/config.js.map +0 -1
- package/dist/consumer-config.d.ts +0 -11
- package/dist/consumer-config.d.ts.map +0 -1
- package/dist/consumer-config.js +0 -60
- package/dist/consumer-config.js.map +0 -1
- package/dist/consumer.d.ts +0 -11
- package/dist/consumer.d.ts.map +0 -1
- package/dist/consumer.js +0 -84
- package/dist/consumer.js.map +0 -1
- package/dist/converter.d.ts +0 -12
- package/dist/converter.d.ts.map +0 -1
- package/dist/converter.js +0 -95
- package/dist/converter.js.map +0 -1
- package/dist/embedder.d.ts +0 -9
- package/dist/embedder.d.ts.map +0 -1
- package/dist/embedder.js +0 -108
- package/dist/embedder.js.map +0 -1
- package/dist/format.d.ts +0 -5
- package/dist/format.d.ts.map +0 -1
- package/dist/format.js +0 -6
- package/dist/format.js.map +0 -1
- package/dist/llms-ingest.d.ts +0 -3
- package/dist/llms-ingest.d.ts.map +0 -1
- package/dist/llms-ingest.js +0 -85
- package/dist/llms-ingest.js.map +0 -1
- package/dist/reranker.d.ts +0 -6
- package/dist/reranker.d.ts.map +0 -1
- package/dist/reranker.js +0 -21
- package/dist/reranker.js.map +0 -1
- package/dist/scraper.d.ts +0 -9
- package/dist/scraper.d.ts.map +0 -1
- package/dist/scraper.js +0 -98
- package/dist/scraper.js.map +0 -1
- package/dist/search.d.ts +0 -8
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js +0 -43
- package/dist/search.js.map +0 -1
- package/dist/store.d.ts +0 -15
- package/dist/store.d.ts.map +0 -1
- package/dist/store.js +0 -149
- package/dist/store.js.map +0 -1
- package/dist/types.d.ts +0 -26
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js +0 -2
- package/dist/types.js.map +0 -1
|
@@ -0,0 +1,1516 @@
|
|
|
1
|
+
import {
|
|
2
|
+
bold,
|
|
3
|
+
cyan,
|
|
4
|
+
yellow
|
|
5
|
+
} from "./chunk-BRS6X3AE.js";
|
|
6
|
+
|
|
7
|
+
// src/admin.ts
|
|
8
|
+
import { parseArgs } from "node:util";
|
|
9
|
+
import { readFile as readFile3, writeFile as writeFile4, readdir, rm, mkdir as mkdir4 } from "node:fs/promises";
|
|
10
|
+
import { existsSync } from "node:fs";
|
|
11
|
+
import { join as join4, resolve } from "node:path";
|
|
12
|
+
import { createInterface } from "node:readline";
|
|
13
|
+
import { stringify } from "yaml";
|
|
14
|
+
|
|
15
|
+
// src/config.ts
|
|
16
|
+
import { readFile } from "node:fs/promises";
|
|
17
|
+
import { parse } from "yaml";
|
|
18
|
+
/** Fields every source entry must define as a non-empty string. */
const REQUIRED_SOURCE_FIELDS = ["name", "start_url"];

/** Extra non-empty string fields required when a source has no `llms_full_url`. */
const SCRAPE_REQUIRED_FIELDS = ["nav_selector", "content_selector"];

/**
 * Validate a parsed config document and return a normalized copy of it.
 *
 * Each entry under `sources` must be an object with non-empty string `name`
 * and `start_url` (a parseable URL). When `llms_full_url` is falsy,
 * `nav_selector` and `content_selector` are required too. Optional list and
 * number fields are type-checked when present.
 *
 * @param {unknown} data - Parsed config document.
 * @returns {{ sources: Record<string, object> }} Config holding only the known
 *   source fields (absent fields are copied as `undefined`).
 * @throws {Error} When the document or any source entry is malformed.
 */
function validateConfig(data) {
  if (typeof data !== "object" || data === null || !("sources" in data)) {
    throw new Error("Config must have a 'sources' key");
  }
  const { sources } = data;
  // Arrays pass a `typeof === "object"` check, but a list has no source keys:
  // entries would end up keyed "0", "1", ... instead of by source name.
  if (typeof sources !== "object" || sources === null || Array.isArray(sources)) {
    throw new Error("'sources' must be an object");
  }
  const entries = Object.entries(sources);
  if (entries.length === 0) {
    throw new Error("'sources' must contain at least one source");
  }

  const arrayFields = ["remove_selectors", "remove_text_patterns", "include_patterns", "exclude_patterns"];
  const numberFields = ["rate_limit_ms", "concurrency"];
  const requireStrings = (key, source, fields) => {
    for (const field of fields) {
      if (typeof source[field] !== "string" || source[field] === "") {
        throw new Error(`Source '${key}' is missing required field '${field}'`);
      }
    }
  };

  const validated = {};
  for (const [key, value] of entries) {
    if (typeof value !== "object" || value === null) {
      throw new Error(`Source '${key}' must be an object`);
    }
    const source = value;
    requireStrings(key, source, REQUIRED_SOURCE_FIELDS);
    if (!source.llms_full_url) {
      // Selectors are only demanded when there is no llms_full_url to use instead.
      requireStrings(key, source, SCRAPE_REQUIRED_FIELDS);
    }
    try {
      new URL(source.start_url);
    } catch {
      throw new Error(`Source '${key}' has invalid start_url: ${source.start_url}`);
    }
    for (const field of arrayFields) {
      if (source[field] !== undefined && !Array.isArray(source[field])) {
        throw new Error(`Source '${key}': ${field} must be an array`);
      }
    }
    for (const field of numberFields) {
      if (source[field] !== undefined && typeof source[field] !== "number") {
        throw new Error(`Source '${key}': ${field} must be a number`);
      }
    }
    validated[key] = {
      name: source.name,
      version: source.version,
      start_url: source.start_url,
      nav_selector: source.nav_selector,
      content_selector: source.content_selector,
      remove_selectors: source.remove_selectors,
      remove_text_patterns: source.remove_text_patterns,
      include_patterns: source.include_patterns,
      exclude_patterns: source.exclude_patterns,
      rate_limit_ms: source.rate_limit_ms,
      concurrency: source.concurrency,
      headed: source.headed,
      sitemap_url: source.sitemap_url,
      llms_full_url: source.llms_full_url,
      // scrapeSource reads source.user_agent; without this passthrough a
      // configured user agent was dropped here and never reached the browser.
      user_agent: source.user_agent,
    };
  }
  return { sources: validated };
}
|
|
98
|
+
/**
 * Read a YAML config file and validate it.
 *
 * The file is parsed with the `merge` option enabled before being handed to
 * `validateConfig`.
 *
 * @param {string} path - Location of the YAML config file.
 * @returns {Promise<object>} Whatever `validateConfig` returns for the parsed document.
 */
async function loadConfig(path) {
  const yamlText = await readFile(path, "utf-8");
  return validateConfig(parse(yamlText, { merge: true }));
}
|
|
103
|
+
|
|
104
|
+
// src/scraper.ts
|
|
105
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
106
|
+
import { join } from "node:path";
|
|
107
|
+
import { chromium } from "playwright";
|
|
108
|
+
/**
 * Turn a URL's path into a flat slug.
 *
 * One leading and one trailing slash are dropped, remaining slashes become
 * hyphens, and every character other than ASCII letters, digits and hyphens
 * is removed. Query string and fragment are ignored.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Slug derived from the pathname (empty for a root path).
 */
function slugifyUrl(url) {
  const { pathname } = new URL(url);
  const withoutEdgeSlashes = pathname.replace(/^\//, "").replace(/\/$/, "");
  const hyphenated = withoutEdgeSlashes.replace(/\//g, "-");
  return hyphenated.replace(/[^a-zA-Z0-9-]/g, "");
}
|
|
112
|
+
/**
 * Filter, de-duplicate and sort a list of discovered URLs.
 *
 * A URL is kept when it starts with "http", does not contain "?hl=", does not
 * end with "#", contains at least one include pattern (if any are given) and
 * contains no exclude pattern (if any are given). Patterns are plain
 * substrings.
 *
 * @param {string[]} urls - Candidate URLs.
 * @param {string[]} [includePatterns] - Substrings of which one must be present.
 * @param {string[]} [excludePatterns] - Substrings that disqualify a URL.
 * @returns {string[]} Unique surviving URLs in default sort order.
 */
function filterUrls(urls, includePatterns, excludePatterns) {
  const mustInclude = Boolean(includePatterns && includePatterns.length > 0);
  const mustExclude = Boolean(excludePatterns && excludePatterns.length > 0);
  const kept = new Set();
  for (const url of urls) {
    if (!url.startsWith("http") || url.includes("?hl=") || url.endsWith("#")) {
      continue;
    }
    if (mustInclude && !includePatterns.some((pattern) => url.includes(pattern))) {
      continue;
    }
    if (mustExclude && excludePatterns.some((pattern) => url.includes(pattern))) {
      continue;
    }
    kept.add(url);
  }
  return [...kept].sort();
}
|
|
128
|
+
/**
 * Download a sitemap and return the URLs listed in its `<loc>` elements.
 *
 * When the document contains `<sitemapindex`, each `<loc>` is treated as a
 * nested sitemap, fetched recursively, and the results are flattened.
 *
 * @param {string} sitemapUrl - URL of the sitemap or sitemap index.
 * @returns {Promise<string[]>} Page URLs in document order.
 * @throws {Error} When the HTTP response is not OK.
 */
async function fetchSitemapUrls(sitemapUrl) {
  const response = await fetch(sitemapUrl);
  // Without this check an error page parses to "zero URLs" and the failure
  // goes unnoticed.
  if (!response.ok) {
    throw new Error(`Failed to fetch sitemap ${sitemapUrl}: ${response.status} ${response.statusText}`);
  }
  const xml = await response.text();
  // URLs are XML-escaped in sitemaps; `&amp;` is decoded last so an escaped
  // ampersand cannot form a second entity.
  const decodeEntities = (value) =>
    value
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, "&");
  const locs = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)]
    // Pretty-printed sitemaps put whitespace around the URL.
    .map((match) => decodeEntities(match[1].trim()))
    .filter((loc) => loc !== "");
  if (xml.includes("<sitemapindex")) {
    const nested = await Promise.all(locs.map((loc) => fetchSitemapUrls(loc)));
    return nested.flat();
  }
  return locs;
}
|
|
138
|
+
/**
 * Collect URLs from a sitemap and apply the source's include/exclude filters.
 *
 * @param {string} sitemapUrl - Sitemap (or sitemap index) URL.
 * @param {object} source - Source config; `include_patterns` and `exclude_patterns` are read.
 * @returns {Promise<string[]>} Result of `filterUrls` over the sitemap entries.
 */
async function discoverFromSitemap(sitemapUrl, source) {
  const sitemapEntries = await fetchSitemapUrls(sitemapUrl);
  const { include_patterns: includePatterns, exclude_patterns: excludePatterns } = source;
  return filterUrls(sitemapEntries, includePatterns, excludePatterns);
}
|
|
142
|
+
/**
 * Work out which URLs belong to a source.
 *
 * If the source has a `sitemap_url`, discovery is delegated to the sitemap.
 * Otherwise the start page is opened, every `a[href]` under `nav_selector` is
 * collected and filtered, and the start URL is placed first when the filtered
 * list does not already contain it.
 *
 * @param {object} page - Page-like object providing `goto` and `$$eval`.
 * @param {object} source - Source config.
 * @returns {Promise<string[]>} URLs to fetch.
 */
async function discoverUrls(page, source) {
  if (source.sitemap_url) {
    return discoverFromSitemap(source.sitemap_url, source);
  }
  const waitUntil = source.headed ? "networkidle" : "domcontentloaded";
  await page.goto(source.start_url, { waitUntil });
  const linkSelector = `${source.nav_selector} a[href]`;
  const hrefs = await page.$$eval(linkSelector, (anchors) => anchors.map((anchor) => anchor.href));
  const urls = filterUrls(hrefs, source.include_patterns, source.exclude_patterns);
  return urls.includes(source.start_url) ? urls : [source.start_url, ...urls];
}
|
|
157
|
+
/**
 * Navigate a page to a URL and return its HTML.
 *
 * @param {object} page - Page-like object providing `goto` and `content`.
 * @param {string} url - Address to load.
 * @param {boolean} [headed] - Truthy waits for "networkidle", otherwise "domcontentloaded".
 * @returns {Promise<string>} Whatever `page.content()` yields after navigation.
 */
async function fetchPage(page, url, headed) {
  const waitUntil = headed ? "networkidle" : "domcontentloaded";
  await page.goto(url, { waitUntil });
  return page.content();
}
|
|
161
|
+
/** Concurrency used by `scrapeSource` when the source does not set one. */
const DEFAULT_CONCURRENCY = 50;

/**
 * Run an async function over every item with a bounded number of workers.
 *
 * Workers share one cursor, so each item is handed to `fn` exactly once. The
 * returned promise rejects as soon as any call rejects.
 *
 * @param {Array} items - Items to process.
 * @param {number} concurrency - Desired number of workers. Values below 1 or
 *   NaN are treated as 1; fractions are rounded down.
 * @param {(item: any, index: number) => Promise<void>} fn - Work to perform per item.
 * @returns {Promise<void>}
 */
async function runPool(items, concurrency, fn) {
  // A concurrency of 0, a negative number or NaN used to start zero workers,
  // so the pool "finished" without processing anything.
  const limit = concurrency >= 1 ? Math.floor(concurrency) : 1;
  let nextIndex = 0;
  const worker = async () => {
    while (nextIndex < items.length) {
      const index = nextIndex++;
      await fn(items[index], index);
    }
  };
  const workers = Array.from({ length: Math.min(limit, items.length) }, () => worker());
  await Promise.all(workers);
}
|
|
173
|
+
/**
 * Scrape every page of a source and save the raw HTML to disk.
 *
 * Pages are written to `<dataDir>/raw/<sourceName>/<slug>.html`, followed by a
 * `urls.json` holding the discovered URL list.
 *
 * @param {object} source - Source config (`concurrency`, `headed`, `user_agent` and the discovery fields are read).
 * @param {string} sourceName - Key of the source, used as directory name.
 * @param {string} dataDir - Root data directory.
 * @param {(completed: number, total: number, url: string) => void} [onProgress] - Called after each page is saved.
 * @returns {Promise<string[]>} The discovered URLs.
 */
async function scrapeSource(source, sourceName, dataDir, onProgress) {
  const rawDir = join(dataDir, "raw", sourceName);
  await mkdir(rawDir, { recursive: true });
  const concurrency = source.concurrency ?? DEFAULT_CONCURRENCY;
  const browser = await chromium.launch({ channel: "chrome", headless: !source.headed });
  // Everything after launch sits inside this try so the browser is closed even
  // when context creation or URL discovery fails. Those steps used to run
  // before the try and a failure there left the browser open.
  try {
    const context = await browser.newContext(source.user_agent ? { userAgent: source.user_agent } : {});
    const discoveryPage = await context.newPage();
    let urls;
    try {
      urls = await discoverUrls(discoveryPage, source);
    } finally {
      await discoveryPage.close();
    }
    let completed = 0;
    await runPool(urls, concurrency, async (url) => {
      const page = await context.newPage();
      try {
        const html = await fetchPage(page, url, source.headed);
        const slug = slugifyUrl(url);
        await writeFile(join(rawDir, `${slug}.html`), html, "utf-8");
        completed++;
        onProgress?.(completed, urls.length, url);
      } finally {
        await page.close();
      }
    });
    await writeFile(join(rawDir, "urls.json"), JSON.stringify(urls), "utf-8");
    return urls;
  } finally {
    await browser.close();
  }
}
|
|
202
|
+
/**
 * Launch a browser through the "chrome" channel with otherwise default options.
 *
 * @returns {Promise<object>} The launched browser.
 */
async function createBrowser() {
  const launchOptions = { channel: "chrome" };
  return chromium.launch(launchOptions);
}
|
|
205
|
+
|
|
206
|
+
// src/converter.ts
|
|
207
|
+
import { readFile as readFile2, writeFile as writeFile2, mkdir as mkdir2 } from "node:fs/promises";
|
|
208
|
+
import { join as join2 } from "node:path";
|
|
209
|
+
import { JSDOM } from "jsdom";
|
|
210
|
+
import TurndownService from "turndown";
|
|
211
|
+
// Shared HTML-to-Markdown converter used by extractContent. It is configured
// for ATX ("#") headings, fenced code blocks and "-" bullet markers.
var turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
  bulletListMarker: "-"
});
// Selectors extractContent always removes from the content element, in
// addition to any per-source remove selectors.
var GENERIC_REMOVE = [
  "style",
  "script",
  "noscript",
  "iframe",
  "svg"
];
|
|
223
|
+
/**
 * Tidy converted Markdown.
 *
 * Heading markers with no text are removed and runs of three or more newlines
 * are collapsed to two. Each entry of `textPatterns` is then compiled as a
 * regular expression with the "gm" flags and its matches deleted, after which
 * newline runs are collapsed again. The result is trimmed.
 *
 * @param {string} md - Markdown to clean.
 * @param {string[]} [textPatterns] - Regular-expression sources to strip.
 * @returns {string} Cleaned Markdown.
 */
function cleanMarkdown(md, textPatterns) {
  const collapseBlankRuns = (text) => text.replace(/\n{3,}/g, "\n\n");
  let result = collapseBlankRuns(md.replace(/^(#+)\s*$/gm, ""));
  if (textPatterns) {
    const stripped = textPatterns.reduce(
      (text, pattern) => text.replace(new RegExp(pattern, "gm"), ""),
      result,
    );
    result = collapseBlankRuns(stripped);
  }
  return result.trim();
}
|
|
233
|
+
/**
 * Extract the main content of an HTML page as cleaned Markdown.
 *
 * The element matching `contentSelector` is used, falling back to `<body>`
 * when nothing matches. Elements matching `GENERIC_REMOVE` or `removeSelectors`
 * are removed from that root before conversion.
 *
 * @param {string} html - Full page HTML.
 * @param {string} contentSelector - CSS selector of the content container.
 * @param {string[]} [removeSelectors] - Extra CSS selectors to remove from the root.
 * @param {string[]} [removeTextPatterns] - Passed through to `cleanMarkdown`.
 * @returns {string} Markdown for the page content.
 */
function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
  const { document } = new JSDOM(html).window;
  // The <body> fallback used to skip the removal pass, so script/style content
  // and the configured boilerplate selectors ended up in the Markdown. Both
  // roots now go through the same removal.
  const root = document.querySelector(contentSelector) ?? document.body;
  const selectors = [...GENERIC_REMOVE, ...(removeSelectors ?? [])];
  for (const selector of selectors) {
    for (const el of root.querySelectorAll(selector)) {
      el.remove();
    }
  }
  return cleanMarkdown(turndown.turndown(root.innerHTML), removeTextPatterns);
}
|
|
248
|
+
/**
 * Derive a page title from the `<title>` element.
 *
 * Everything from the first site-name separator onward is dropped. A separator
 * is `|`, an en dash or an em dash (with or without surrounding whitespace),
 * or a hyphen surrounded by whitespace.
 *
 * @param {string} html - Full page HTML.
 * @returns {string} The cleaned title, or "Untitled" when there is no usable title.
 */
function extractTitle(html) {
  const titleEl = new JSDOM(html).window.document.querySelector("title");
  if (!titleEl) return "Untitled";
  // A bare "-" used to count as a separator, so hyphenated words were cut
  // short ("Server-side rendering" became "Server"). A hyphen now needs
  // whitespace on both sides to count.
  const cleaned = (titleEl.textContent ?? "").replace(/\s*(?:[|–—]|\s-\s)\s*.+$/, "").trim();
  // An empty <title>, or one that begins with a separator, leaves nothing.
  return cleaned || "Untitled";
}
|
|
254
|
+
/**
 * Build the YAML frontmatter block placed at the top of a converted page.
 *
 * @param {string} source - Source key, written unquoted.
 * @param {string} url - Page URL.
 * @param {string} title - Page title.
 * @returns {string} Frontmatter from the opening `---` to the closing `---`, without a trailing newline.
 */
function buildFrontmatter(source, url, title) {
  // JSON string syntax is valid as a YAML double-quoted scalar and escapes
  // backslashes, quotes and newlines. Escaping only `"` by hand produced
  // invalid frontmatter for titles containing "\" or a line break.
  const quote = (value) => JSON.stringify(String(value));
  return [
    "---",
    `source: ${source}`,
    `url: ${quote(url)}`,
    `title: ${quote(title)}`,
    `fetched_at: "${new Date().toISOString()}"`,
    "---",
  ].join("\n");
}
|
|
264
|
+
/**
 * Convert one HTML page into a Markdown document with frontmatter.
 *
 * @param {string} html - Full page HTML.
 * @param {string} source - Source key recorded in the frontmatter.
 * @param {string} url - Page URL recorded in the frontmatter.
 * @param {string} contentSelector - CSS selector of the content container.
 * @param {string[]} [removeSelectors] - Extra selectors to strip from the content.
 * @param {string[]} [removeTextPatterns] - Regular-expression sources to strip from the Markdown.
 * @returns {{ source: string, url: string, title: string, markdown: string }}
 *   The page record; `markdown` is the frontmatter, a blank line, then the content.
 */
function convertPage(html, source, url, contentSelector, removeSelectors, removeTextPatterns) {
  const title = extractTitle(html);
  const body = extractContent(html, contentSelector, removeSelectors, removeTextPatterns);
  const header = buildFrontmatter(source, url, title);
  const markdown = [header, body].join("\n\n");
  return { source, url, title, markdown };
}
|
|
273
|
+
/** Number of pages converted in parallel when the caller does not specify one. */
const DEFAULT_CONCURRENCY2 = 10;

/**
 * Convert every scraped page of a source from raw HTML to Markdown.
 *
 * For each URL, `<dataDir>/raw/<sourceName>/<slug>.html` is read and
 * `<dataDir>/markdown/<sourceName>/<slug>.md` is written.
 *
 * @param {string} sourceName - Source key, used for directories and frontmatter.
 * @param {string[]} urls - URLs whose raw HTML has already been saved.
 * @param {string} contentSelector - CSS selector of the content container.
 * @param {string[]} [removeSelectors] - Extra selectors to strip.
 * @param {string[]} [removeTextPatterns] - Regular-expression sources to strip.
 * @param {string} dataDir - Root data directory.
 * @param {number} [concurrency] - Parallel conversions. Values below 1 or NaN
 *   are treated as 1.
 * @param {(completed: number, total: number, url: string) => void} [onProgress] - Called after each page is written.
 * @returns {Promise<object[]>} Converted pages in the same order as `urls`.
 */
async function convertSource(sourceName, urls, contentSelector, removeSelectors, removeTextPatterns, dataDir, concurrency = DEFAULT_CONCURRENCY2, onProgress) {
  const rawDir = join2(dataDir, "raw", sourceName);
  const mdDir = join2(dataDir, "markdown", sourceName);
  await mkdir2(mdDir, { recursive: true });
  const pages = new Array(urls.length);
  let completed = 0;
  let nextIndex = 0;
  const worker = async () => {
    while (nextIndex < urls.length) {
      const i = nextIndex++;
      const url = urls[i];
      const slug = slugifyUrl(url);
      const html = await readFile2(join2(rawDir, `${slug}.html`), "utf-8");
      const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
      await writeFile2(join2(mdDir, `${slug}.md`), page.markdown, "utf-8");
      pages[i] = page;
      completed++;
      onProgress?.(completed, urls.length, url);
    }
  };
  // A concurrency of 0, a negative number or NaN used to start zero workers,
  // returning an array of empty slots without converting anything.
  const limit = concurrency >= 1 ? Math.floor(concurrency) : 1;
  const workers = Array.from({ length: Math.min(limit, urls.length) }, () => worker());
  await Promise.all(workers);
  return pages;
}
|
|
302
|
+
|
|
303
|
+
// src/chunker.ts
|
|
304
|
+
/** Largest estimated token count a chunk may have before chunkMarkdown splits it. */
const MAX_TOKENS = 500;

/**
 * Estimate the token count of a text as its length divided by four, rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated number of tokens.
 */
function estimateTokens(text) {
  const charsPerToken = 4;
  return Math.ceil(text.length / charsPerToken);
}
|
|
308
|
+
/**
 * Turn a heading into a lowercase, hyphen-separated slug.
 *
 * Characters other than a-z, 0-9, whitespace and hyphens are dropped,
 * whitespace runs become hyphens, repeated hyphens are merged, and one
 * leading and one trailing hyphen are removed.
 *
 * @param {string} heading - Heading text.
 * @returns {string} Slug for the heading (may be empty).
 */
function slugifyHeading(heading) {
  const lowered = heading.toLowerCase();
  const allowedOnly = lowered.replace(/[^a-z0-9\s-]/g, "");
  const hyphenated = allowedOnly.replace(/\s+/g, "-").replace(/-+/g, "-");
  return hyphenated.replace(/^-|-$/g, "");
}
|
|
311
|
+
/**
 * Build the identifier of a chunk as `<source>::<url slug>::<heading slug>`,
 * with `-<index>` appended when an index is given.
 *
 * The heading slug is cut to `1500 - byteLength(prefix) - 10` bytes when it is
 * longer than that.
 *
 * @param {string} source - Source key.
 * @param {string} url - Page URL; reduced to a slug with `slugifyUrl`.
 * @param {string} headingSlug - Slug of the section heading.
 * @param {number} [index] - Disambiguating suffix.
 * @returns {string} Chunk identifier.
 */
function buildChunkId(source, url, headingSlug, index) {
  const prefix = `${source}::${slugifyUrl(url)}::`;
  const maxSlugBytes = 1500 - Buffer.byteLength(prefix) - 10;
  let slug = headingSlug;
  if (Buffer.byteLength(headingSlug) > maxSlugBytes) {
    slug = Buffer.from(headingSlug).subarray(0, maxSlugBytes).toString();
  }
  const base = prefix + slug;
  if (index === undefined) {
    return base;
  }
  return `${base}-${index}`;
}
|
|
319
|
+
/**
 * Split Markdown into sections, one per ATX heading.
 *
 * Text before the first heading becomes a section with level 0 and an empty
 * heading. Each section records its heading, its level, the chain of ancestor
 * headings (`headingPath`) and its body lines. Lines inside fenced code blocks
 * are never treated as headings.
 *
 * @param {string} markdown - Markdown without frontmatter.
 * @returns {Array<{ level: number, heading: string, headingPath: string[], lines: string[] }>}
 *   Sections in document order.
 */
function parseHeadingSections(markdown) {
  const sections = [];
  const headingStack = [];
  const levelStack = [];
  let currentSection = { level: 0, heading: "", headingPath: [], lines: [] };
  // Marker of the code fence currently open (e.g. "```"), or null outside one.
  let openFence = null;

  const pushCurrent = () => {
    if (currentSection.lines.length > 0 || currentSection.heading !== "") {
      sections.push(currentSection);
    }
  };

  for (const line of markdown.split("\n")) {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    if (fence) {
      const [, marker, rest] = fence;
      if (openFence === null) {
        openFence = marker;
      } else if (marker[0] === openFence[0] && marker.length >= openFence.length && rest.trim() === "") {
        // A fence closes only on the same character, at least as long as the
        // opener, with nothing after it.
        openFence = null;
      }
      currentSection.lines.push(line);
      continue;
    }

    // Inside a fence, "# ..." is code (a shell or Python comment, say), not a
    // heading. Treating it as one split code samples across sections.
    const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)$/) : null;
    if (!headingMatch) {
      currentSection.lines.push(line);
      continue;
    }

    pushCurrent();
    const level = headingMatch[1].length;
    const heading = headingMatch[2].trim();
    // Drop headings at the same or a deeper level so the stack holds only ancestors.
    while (levelStack.length > 0 && levelStack[levelStack.length - 1] >= level) {
      levelStack.pop();
      headingStack.pop();
    }
    headingStack.push(heading);
    levelStack.push(level);
    currentSection = { level, heading, headingPath: [...headingStack], lines: [] };
  }
  pushCurrent();
  return sections;
}
|
|
359
|
+
/**
 * Pack the paragraphs of a text into parts of at most `maxTokens` estimated tokens.
 *
 * Paragraphs are separated by blank lines and are never split, so a single
 * paragraph above the limit becomes a part of its own.
 *
 * @param {string} text - Text to split.
 * @param {number} maxTokens - Token budget per part.
 * @returns {string[]} Parts, each a run of paragraphs joined by a blank line.
 */
function splitAtParagraphBoundaries(text, maxTokens) {
  const parts = [];
  let pending = [];
  let pendingTokens = 0;
  for (const para of text.split(/\n\n+/)) {
    const cost = estimateTokens(para);
    const overflows = pendingTokens + cost > maxTokens;
    if (overflows && pending.length > 0) {
      // Close the current part; this paragraph starts the next one.
      parts.push(pending.join("\n\n"));
      pending = [];
      pendingTokens = 0;
    }
    pending.push(para);
    pendingTokens += cost;
  }
  if (pending.length > 0) {
    parts.push(pending.join("\n\n"));
  }
  return parts;
}
|
|
380
|
+
/**
 * Remove a leading YAML frontmatter block from a Markdown document.
 *
 * The block must open with a `---` line at the very start of the document and
 * close with a line consisting of `---`. If either delimiter is missing the
 * input is returned unchanged.
 *
 * @param {string} markdown - Document that may start with frontmatter.
 * @returns {string} The trimmed document body, or the original input when no frontmatter is found.
 */
function stripFrontmatter(markdown) {
  // The closing delimiter must be a whole line. Searching for the first "---"
  // anywhere cut the document inside a frontmatter value that contains three
  // hyphens (a URL like ".../a---b", for example).
  const block = /^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/.exec(markdown);
  if (!block) {
    return markdown;
  }
  return markdown.slice(block[0].length).trim();
}
|
|
389
|
+
/**
 * Split a Markdown document into chunks, one or more per heading section.
 *
 * Frontmatter is stripped first. A section whose estimated token count is
 * within `MAX_TOKENS` becomes one chunk; a larger one is split at paragraph
 * boundaries and, when that yields several parts, each part's slug gets a
 * `-<part index>` suffix. Chunk ids are unique within the document: an id
 * already in use gets the first free numeric index appended.
 *
 * @param {string} markdown - Markdown document, optionally with frontmatter.
 * @param {string} source - Source key stored on every chunk.
 * @param {string} url - Page URL stored on every chunk.
 * @param {string} title - Page title stored on every chunk.
 * @returns {Array<object>} Chunks with `id`, `source`, `url`, `title`,
 *   `heading_path`, `content` and `token_count`.
 */
function chunkMarkdown(markdown, source, url, title) {
  const usedIds = new Set();

  // Reserve and return the first unused id for the given slug.
  const claimId = (slug) => {
    let candidate = buildChunkId(source, url, slug);
    for (let suffix = 1; usedIds.has(candidate); suffix++) {
      candidate = buildChunkId(source, url, slug, suffix);
    }
    usedIds.add(candidate);
    return candidate;
  };

  const makeChunk = (slug, section, content) => ({
    id: claimId(slug),
    source,
    url,
    title,
    heading_path: section.headingPath,
    content,
    token_count: estimateTokens(content),
  });

  const chunks = [];
  for (const section of parseHeadingSections(stripFrontmatter(markdown))) {
    const headingLine = section.heading ? `${"#".repeat(section.level)} ${section.heading}\n\n` : "";
    const content = headingLine + section.lines.join("\n").trim();
    if (!content.trim()) {
      continue;
    }
    const headingSlug = section.heading ? slugifyHeading(section.heading) : "intro";
    if (estimateTokens(content) <= MAX_TOKENS) {
      chunks.push(makeChunk(headingSlug, section, content));
      continue;
    }
    const parts = splitAtParagraphBoundaries(content, MAX_TOKENS);
    parts.forEach((part, i) => {
      const partContent = part.trim();
      if (!partContent) {
        return;
      }
      const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
      chunks.push(makeChunk(partSlug, section, partContent));
    });
  }
  return chunks;
}
|
|
446
|
+
|
|
447
|
+
// src/embedder.ts
|
|
448
|
+
import { GoogleGenerativeAI } from "@google/generative-ai";
|
|
449
|
+
// Number of texts embedTexts sends per batchEmbedContents request.
var BATCH_SIZE = 50;
// Model name passed to getGenerativeModel.
var MODEL = "gemini-embedding-001";
// Value sent as outputDimensionality with every embedding request.
var OUTPUT_DIMENSIONALITY = 768;
// Maximum attempts embedTexts makes per batch.
var MAX_RETRIES = 5;
// Base backoff after a "rate_limit" error; doubled on each further attempt.
var RATE_LIMIT_BASE_DELAY_MS = 6e4;
// Base backoff after a "network" error; doubled on each further attempt.
var NETWORK_BASE_DELAY_MS = 1e4;
// Pause between consecutive batches.
var BATCH_DELAY_MS = 2500;
// Batches between onCheckpoint calls when the caller does not override it.
var DEFAULT_CHECKPOINT_EVERY_BATCHES = 20;
// Substrings of an error message that make classifyError report "network".
var NETWORK_ERROR_PATTERNS = [
  "fetch failed",
  "ECONNRESET",
  "ETIMEDOUT",
  "ECONNREFUSED",
  "EAI_AGAIN",
  "ENOTFOUND",
  "socket hang up",
  "UND_ERR_"
];
|
|
467
|
+
/** Lazily created client; stays undefined until getClient first succeeds. */
let genAI;

/**
 * Return the shared GoogleGenerativeAI client, creating it on first use from
 * the `GEMINI_API_KEY` environment variable.
 *
 * @returns {object} The cached client instance.
 * @throws {Error} When no client exists yet and `GEMINI_API_KEY` is unset or empty.
 */
function getClient() {
  if (genAI) {
    return genAI;
  }
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new Error("GEMINI_API_KEY environment variable is not set");
  }
  genAI = new GoogleGenerativeAI(apiKey);
  return genAI;
}
|
|
478
|
+
/**
 * Classify an error message for the retry logic.
 *
 * @param {string} message - Error message text.
 * @returns {"rate_limit" | "network" | "other"} "rate_limit" when the message
 *   contains "429" or "503", "network" when it contains any entry of
 *   `NETWORK_ERROR_PATTERNS`, otherwise "other".
 */
function classifyError(message) {
  const mentions = (needle) => message.includes(needle);
  if (["429", "503"].some((code) => mentions(code))) {
    return "rate_limit";
  }
  return NETWORK_ERROR_PATTERNS.some((pattern) => mentions(pattern)) ? "network" : "other";
}
|
|
487
|
+
/**
 * Embed a list of texts in batches, with retries, checkpoints and resume support.
 *
 * @param {string[]} texts - Texts to embed.
 * @param {object} [options]
 * @param {(done: number, total: number) => void} [options.onProgress] - Called after each batch.
 * @param {(embeddings: number[][]) => Promise<void> | void} [options.onCheckpoint] -
 *   Called with all embeddings so far every `checkpointEveryBatches` batches
 *   (except after the final batch) and once more at the end.
 * @param {number[][]} [options.resumeFrom] - Embeddings from an earlier run.
 *   Only whole batches are reused; a trailing partial batch is embedded again.
 * @param {number} [options.checkpointEveryBatches] - Batches between checkpoints.
 * @returns {Promise<number[][]>} One embedding per text, in input order.
 * @throws {Error} When a batch still fails after retries, fails with an error
 *   that is not retried, or comes back with the wrong number of embeddings.
 */
async function embedTexts(texts, options = {}) {
  const client = getClient();
  const model = client.getGenerativeModel({ model: MODEL });
  const { onProgress, onCheckpoint, resumeFrom } = options;
  const checkpointEveryBatches = options.checkpointEveryBatches ?? DEFAULT_CHECKPOINT_EVERY_BATCHES;
  const embeddings = resumeFrom ? [...resumeFrom] : [];
  // Resume on a batch boundary: discard any partially completed batch.
  const startIndex = Math.floor(embeddings.length / BATCH_SIZE) * BATCH_SIZE;
  if (embeddings.length > startIndex) {
    embeddings.length = startIndex;
  }
  if (startIndex > 0) {
    console.log(` Resuming from chunk ${startIndex} of ${texts.length} (${embeddings.length} cached).`);
  }
  if (startIndex >= texts.length) {
    return embeddings.slice(0, texts.length);
  }
  let batchesSinceCheckpoint = 0;
  for (let i = startIndex; i < texts.length; i += BATCH_SIZE) {
    const batch = texts.slice(i, i + BATCH_SIZE);
    const batchNumber = i / BATCH_SIZE + 1;
    let result;
    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
      try {
        result = await model.batchEmbedContents({
          requests: batch.map((text) => ({
            content: { role: "user", parts: [{ text }] },
            outputDimensionality: OUTPUT_DIMENSIONALITY
          }))
        });
        break;
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        const kind = classifyError(message);
        if (kind !== "other" && attempt < MAX_RETRIES - 1) {
          const baseDelay = kind === "rate_limit" ? RATE_LIMIT_BASE_DELAY_MS : NETWORK_BASE_DELAY_MS;
          const delay = baseDelay * Math.pow(2, attempt);
          const label = kind === "rate_limit" ? "Rate limited" : "Network error";
          console.log(` ${label} (batch ${batchNumber}), retrying in ${delay / 1e3}s...`);
          await new Promise((resolve2) => setTimeout(resolve2, delay));
          continue;
        }
        console.error(` Embedding failed at batch ${batchNumber} (chunks ${i + 1}-${i + batch.length}): ${message}`);
        throw err;
      }
    }
    // Callers pair embeddings with texts by position, so a short or missing
    // response must fail here rather than shift every later vector onto the
    // wrong text.
    const vectors = result?.embeddings ?? [];
    if (vectors.length !== batch.length) {
      throw new Error(`Embedding API returned ${vectors.length} embeddings for batch ${batchNumber}, expected ${batch.length}`);
    }
    for (const embedding of vectors) {
      embeddings.push(embedding.values);
    }
    onProgress?.(Math.min(i + BATCH_SIZE, texts.length), texts.length);
    batchesSinceCheckpoint++;
    if (onCheckpoint && batchesSinceCheckpoint >= checkpointEveryBatches && i + BATCH_SIZE < texts.length) {
      await onCheckpoint(embeddings);
      batchesSinceCheckpoint = 0;
    }
    if (i + BATCH_SIZE < texts.length) {
      // Pause between batches.
      await new Promise((resolve2) => setTimeout(resolve2, BATCH_DELAY_MS));
    }
  }
  if (onCheckpoint) {
    await onCheckpoint(embeddings);
  }
  return embeddings;
}
|
|
550
|
+
/**
 * Embed a single text by passing it to `embedTexts` as a one-element list.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The first embedding returned by `embedTexts`.
 */
async function embedText(text) {
  const vectors = await embedTexts([text]);
  return vectors[0];
}
|
|
554
|
+
|
|
555
|
+
// src/store.ts
|
|
556
|
+
import { initializeApp, applicationDefault, getApps } from "firebase-admin/app";
|
|
557
|
+
import {
|
|
558
|
+
getFirestore,
|
|
559
|
+
FieldValue
|
|
560
|
+
} from "firebase-admin/firestore";
|
|
561
|
+
/** Number of writes or deletes grouped into one batch commit by the store functions. */
const BATCH_SIZE2 = 500;
/** Maximum attempts `retryOnQuota` makes. */
const MAX_RETRIES2 = 5;
/** Delay before the first quota retry; doubled on each later retry. */
const BASE_DELAY_MS = 5e3;

/**
 * Run an async operation, retrying with exponential backoff on quota errors.
 *
 * An error counts as a quota error when its message contains
 * "RESOURCE_EXHAUSTED" or "Quota exceeded". Any other error, or a quota error
 * on the final attempt, is rethrown unchanged.
 *
 * @param {() => Promise<any>} fn - Operation to run.
 * @returns {Promise<any>} The operation's result.
 */
async function retryOnQuota(fn) {
  let attempt = 0;
  while (attempt < MAX_RETRIES2) {
    try {
      return await fn();
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      const quotaHit = message.includes("RESOURCE_EXHAUSTED") || message.includes("Quota exceeded");
      const isLastAttempt = attempt >= MAX_RETRIES2 - 1;
      if (!quotaHit || isLastAttempt) {
        throw err;
      }
      const delay = BASE_DELAY_MS * 2 ** attempt;
      console.log(` Quota exceeded, retrying in ${delay / 1e3}s...`);
      await new Promise((wake) => setTimeout(wake, delay));
    }
    attempt++;
  }
  throw new Error("Max retries exceeded");
}
|
|
581
|
+
/** Lazily created Firestore handle; stays undefined until getDb first runs. */
let db;

/**
 * Return the shared Firestore handle, creating it on first use.
 *
 * If no app has been initialized yet, one is initialized with application
 * default credentials before the Firestore handle is obtained.
 *
 * @returns {object} The cached Firestore instance.
 */
function getDb() {
  if (db) {
    return db;
  }
  const noAppYet = getApps().length === 0;
  if (noAppYet) {
    initializeApp({ credential: applicationDefault() });
  }
  db = getFirestore();
  return db;
}
|
|
591
|
+
/**
 * Reference to the "grimoire_chunks" collection.
 *
 * @returns {object} Collection reference obtained from `getDb()`.
 */
function chunksCol() {
  const database = getDb();
  return database.collection("grimoire_chunks");
}
|
|
594
|
+
/**
 * Reference to the "grimoire_sources" collection.
 *
 * @returns {object} Collection reference obtained from `getDb()`.
 */
function sourcesCol() {
  const database = getDb();
  return database.collection("grimoire_sources");
}
|
|
597
|
+
/**
 * Write chunks and their embeddings to the chunks collection in batches.
 *
 * Each chunk is stored under its `id`, with the matching embedding saved as a
 * vector field and the current time as `embedded_at`. Every batch commit goes
 * through `retryOnQuota`.
 *
 * @param {Array<object>} chunks - Chunks as produced by `chunkMarkdown`.
 * @param {number[][]} embeddings - Embedding for each chunk, matched by index.
 * @param {(done: number, total: number) => void} [onProgress] - Called after each committed batch.
 * @returns {Promise<void>}
 * @throws {Error} When there are fewer embeddings than chunks.
 */
async function storeChunks(chunks, embeddings, onProgress) {
  // Check up front: a missing embedding used to surface only when its batch
  // was being built, after earlier batches had already been committed.
  if (embeddings.length < chunks.length) {
    throw new Error(`storeChunks received ${chunks.length} chunks but only ${embeddings.length} embeddings`);
  }
  const database = getDb();
  const col = chunksCol();
  for (let i = 0; i < chunks.length; i += BATCH_SIZE2) {
    const batch = database.batch();
    const slice = chunks.slice(i, i + BATCH_SIZE2);
    slice.forEach((chunk, j) => {
      batch.set(col.doc(chunk.id), {
        source: chunk.source,
        url: chunk.url,
        title: chunk.title,
        heading_path: chunk.heading_path,
        content: chunk.content,
        token_count: chunk.token_count,
        embedded_at: new Date().toISOString(),
        embedding: FieldValue.vector(embeddings[i + j]),
      });
    });
    await retryOnQuota(() => batch.commit());
    onProgress?.(Math.min(i + BATCH_SIZE2, chunks.length), chunks.length);
  }
}
|
|
621
|
+
/**
 * Delete every stored chunk that belongs to a source.
 *
 * @param {string} sourceName - Source key to purge.
 * @returns {Promise<number>} Number of chunk documents deleted (0 when none matched).
 */
async function purgeSource(sourceName) {
  const database = getDb();
  // Only document references are needed for deletion. select() with no fields
  // avoids loading each chunk's content and vector, matching getSourceChunkIds.
  const snapshot = await chunksCol().where("source", "==", sourceName).select().get();
  if (snapshot.empty) return 0;
  const { docs } = snapshot;
  for (let i = 0; i < docs.length; i += BATCH_SIZE2) {
    const batch = database.batch();
    for (const doc of docs.slice(i, i + BATCH_SIZE2)) {
      batch.delete(doc.ref);
    }
    await retryOnQuota(() => batch.commit());
  }
  return docs.length;
}
|
|
641
|
+
/**
 * Overwrite the metadata document of a source.
 *
 * The document records the source key, the version (only when truthy), the
 * current time as `last_refreshed`, and the chunk and URL counts.
 *
 * @param {string} sourceName - Source key; also the document id.
 * @param {number} chunkCount - Stored as `chunk_count`.
 * @param {number} urlCount - Stored as `url_count`.
 * @param {string} [version] - Stored as `version` when truthy.
 * @returns {Promise<void>}
 */
async function updateSourceMeta(sourceName, chunkCount, urlCount, version) {
  const docRef = sourcesCol().doc(sourceName);
  const versionField = version ? { version } : {};
  const meta = {
    source: sourceName,
    ...versionField,
    last_refreshed: new Date().toISOString(),
    chunk_count: chunkCount,
    url_count: urlCount,
  };
  await docRef.set(meta);
}
|
|
650
|
+
/**
 * Load the metadata document of a source.
 *
 * @param {string} sourceName - Source key / document id.
 * @returns {Promise<object | null>} The document data, or null when the document does not exist.
 */
async function getSourceMeta(sourceName) {
  const snapshot = await sourcesCol().doc(sourceName).get();
  return snapshot.exists ? snapshot.data() : null;
}
|
|
655
|
+
/**
 * Load the metadata documents of all sources.
 *
 * @returns {Promise<object[]>} Data of every document in the sources collection.
 */
async function getAllSourcesMeta() {
  const { docs } = await sourcesCol().get();
  const metas = [];
  for (const doc of docs) {
    metas.push(doc.data());
  }
  return metas;
}
|
|
659
|
+
/**
 * Delete the metadata document of a source.
 *
 * @param {string} sourceName - Source key / document id.
 * @returns {Promise<void>}
 */
async function deleteSourceMeta(sourceName) {
  const docRef = sourcesCol().doc(sourceName);
  await docRef.delete();
}
|
|
662
|
+
/**
 * Delete chunk documents by id, in batches of `BATCH_SIZE2`.
 *
 * Nothing is touched when `ids` is empty. Every batch commit goes through
 * `retryOnQuota`.
 *
 * @param {string[]} ids - Chunk document ids.
 * @param {(done: number, total: number) => void} [onProgress] - Called after each committed batch.
 * @returns {Promise<void>}
 */
async function deleteChunksByIds(ids, onProgress) {
  if (ids.length === 0) {
    return;
  }
  const database = getDb();
  const col = chunksCol();
  let offset = 0;
  while (offset < ids.length) {
    const end = offset + BATCH_SIZE2;
    const batch = database.batch();
    ids.slice(offset, end).forEach((id) => {
      batch.delete(col.doc(id));
    });
    await retryOnQuota(() => batch.commit());
    onProgress?.(Math.min(end, ids.length), ids.length);
    offset = end;
  }
}
|
|
676
|
+
/**
 * Collect the ids of all stored chunks that belong to a source.
 *
 * The query calls `select()` with no field paths before fetching.
 *
 * @param {string} sourceName - Source key to look up.
 * @returns {Promise<Set<string>>} Ids of the matching chunk documents.
 */
async function getSourceChunkIds(sourceName) {
  const idsOnlyQuery = chunksCol().where("source", "==", sourceName).select();
  const { docs } = await idsOnlyQuery.get();
  const ids = new Set();
  for (const doc of docs) {
    ids.add(doc.id);
  }
  return ids;
}
|
|
681
|
+
/**
 * Find the stored chunks nearest to a query embedding by cosine distance.
 *
 * @param {number[]} queryEmbedding - Embedding of the query.
 * @param {number} limit - Maximum number of results.
 * @param {string} [source] - When truthy, only chunks of this source are searched.
 * @returns {Promise<Array<{ id: string, data: object, distance: number }>>}
 *   Matches with the `_distance` field moved out of `data` into `distance`
 *   (0 when the field is absent).
 */
async function vectorSearch(queryEmbedding, limit, source) {
  const collection = chunksCol();
  const scoped = source ? collection.where("source", "==", source) : collection;
  const snapshot = await scoped
    .findNearest({
      vectorField: "embedding",
      queryVector: FieldValue.vector(queryEmbedding),
      limit,
      distanceMeasure: "COSINE",
      distanceResultField: "_distance",
    })
    .get();
  return snapshot.docs.map((doc) => {
    const { _distance, ...data } = doc.data();
    return { id: doc.id, data, distance: _distance ?? 0 };
  });
}
|
|
701
|
+
|
|
702
|
+
// src/reranker.ts
|
|
703
|
+
/**
 * Returns the reranker base URL from the RERANKER_URL environment variable.
 *
 * @returns {string} The configured URL.
 * @throws {Error} When RERANKER_URL is unset or empty.
 */
function getRerankerUrl() {
  const { RERANKER_URL: configured } = process.env;
  if (configured) {
    return configured;
  }
  throw new Error("RERANKER_URL environment variable is not set");
}
|
|
710
|
+
/**
 * Sends candidate documents to the reranker service and returns its ranking.
 *
 * POSTs `{ query, documents, top_n }` as JSON to `<RERANKER_URL>/v1/rerank`.
 * Trailing slashes on the configured base URL are stripped so the request
 * path never contains a double slash.
 *
 * @param {string} query - The user's search query.
 * @param {string[]} documents - Candidate texts to rank.
 * @param {number} [topN=5] - Number of results requested from the service.
 * @returns {Promise<object[]>} The `results` array from the response body.
 * @throws {Error} When the URL is not configured, the HTTP status is not OK,
 *   or the response body has no `results` array.
 */
async function rerank(query, documents, topN = 5) {
  const baseUrl = getRerankerUrl().replace(/\/+$/, "");
  const response = await fetch(`${baseUrl}/v1/rerank`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ query, documents, top_n: topN })
  });
  if (!response.ok) {
    throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
  }
  const data = await response.json();
  // Fail here with a clear message instead of letting callers crash on
  // `undefined.map(...)` when the service answers with an unexpected shape.
  if (!Array.isArray(data?.results)) {
    throw new Error("Reranker response did not contain a results array");
  }
  return data.results;
}
|
|
723
|
+
|
|
724
|
+
// src/search.ts
|
|
725
|
+
/**
 * Reports whether a reranker is configured.
 *
 * @returns {boolean} True when RERANKER_URL is set to a non-empty value.
 */
function hasReranker() {
  return Boolean(process.env.RERANKER_URL);
}
|
|
728
|
+
/**
 * Searches the chunk store for a query.
 *
 * The query is embedded, `candidates` nearest chunks are fetched, and then:
 * - with a reranker configured, the candidates' contents are reranked and
 *   the reranker's entries are mapped back to chunks via their `index`;
 * - otherwise the first `topN` candidates are returned with a score of
 *   `max(0, 1 - distance / 2)`.
 *
 * @param {string} query - Search text.
 * @param {{source?: string, candidates?: number, topN?: number}} [options]
 *   `source` restricts the search; `candidates` defaults to 20 and `topN` to 5.
 * @returns {Promise<Array<{id: string, source: *, url: *, title: *,
 *   heading_path: *, content: *, relevance_score: number}>>}
 */
async function search(query, options = {}) {
  const { source, candidates = 20, topN = 5 } = options;

  // Shapes one stored chunk plus its score into the public result record.
  const toResult = (hit, relevance_score) => ({
    id: hit.id,
    source: hit.data.source,
    url: hit.data.url,
    title: hit.data.title,
    heading_path: hit.data.heading_path,
    content: hit.data.content,
    relevance_score
  });

  const queryEmbedding = await embedText(query);
  const rawResults = await vectorSearch(queryEmbedding, candidates, source);
  if (rawResults.length === 0) return [];

  if (!hasReranker()) {
    return rawResults
      .slice(0, topN)
      .map((hit) => toResult(hit, Math.max(0, 1 - hit.distance / 2)));
  }

  const documents = rawResults.map((hit) => hit.data.content);
  const reranked = await rerank(query, documents, topN);
  return reranked.map((entry) => toResult(rawResults[entry.index], entry.relevance_score));
}
|
|
763
|
+
|
|
764
|
+
// src/apikey.ts
|
|
765
|
+
import { randomBytes, createHash } from "node:crypto";
|
|
766
|
+
import {
|
|
767
|
+
getFirestore as getFirestore2
|
|
768
|
+
} from "firebase-admin/firestore";
|
|
769
|
+
import { initializeApp as initializeApp2, applicationDefault as applicationDefault2, getApps as getApps2 } from "firebase-admin/app";
|
|
770
|
+
// Cached Firestore handle, created on the first getDb2() call.
var db2;

/**
 * Returns the Firestore handle used for API-key storage, creating it on
 * first use. If no Firebase app is registered yet, one is initialised with
 * application-default credentials before the handle is obtained.
 *
 * @returns {object} The cached Firestore instance.
 */
function getDb2() {
  if (db2) {
    return db2;
  }
  const noAppYet = getApps2().length === 0;
  if (noAppYet) {
    initializeApp2({ credential: applicationDefault2() });
  }
  db2 = getFirestore2();
  return db2;
}
|
|
780
|
+
/**
 * Computes the SHA-256 digest of an API key.
 *
 * @param {string} key - Raw API key.
 * @returns {string} Lower-case hexadecimal digest.
 */
function hashKey(key) {
  const digest = createHash("sha256");
  digest.update(key);
  return digest.digest("hex");
}
|
|
783
|
+
/**
 * Returns a reference to the `grimoire_api_keys` collection.
 *
 * @returns {object} Collection reference obtained from getDb2().
 */
function apiKeysCol() {
  const firestore = getDb2();
  return firestore.collection("grimoire_api_keys");
}
|
|
786
|
+
/**
 * Generates a new API key and stores its record under the key's hash.
 *
 * The key is `grim_` followed by 32 random bytes in base64url. Only the hash
 * (used as the document id), the name, the creation time and a null
 * `last_used_at` are written; the raw key itself is not stored.
 *
 * @param {string} name - Human-readable label for the key.
 * @returns {Promise<string>} The raw key.
 */
async function createApiKey(name) {
  const secret = randomBytes(32).toString("base64url");
  const raw = `grim_${secret}`;
  const record = {
    name,
    created_at: new Date().toISOString(),
    last_used_at: null
  };
  await apiKeysCol().doc(hashKey(raw)).set(record);
  return raw;
}
|
|
796
|
+
/**
 * Prints every stored API key record (name, creation time, last use) to
 * stdout, or a notice when there are none. A null/undefined `last_used_at`
 * is shown as "never".
 *
 * @returns {Promise<void>}
 */
async function listApiKeys() {
  const snapshot = await apiKeysCol().get();
  if (snapshot.empty) {
    console.log("No API keys found.");
    return;
  }
  console.log("\nAPI Keys:\n");
  snapshot.docs.forEach((entry) => {
    const { name, created_at, last_used_at } = entry.data();
    console.log(` ${bold(name)}`);
    console.log(` Created: ${created_at}`);
    console.log(` Last used: ${last_used_at ?? "never"}`);
  });
}
|
|
811
|
+
/**
 * Deletes every API key record whose `name` field equals the given name,
 * one document after another, then prints a confirmation.
 *
 * @param {string} name - Label of the key(s) to delete.
 * @returns {Promise<void>}
 * @throws {Error} When no record has that name.
 */
async function deleteApiKey(name) {
  const matches = await apiKeysCol().where("name", "==", name).get();
  if (!matches.empty) {
    for (const { ref } of matches.docs) {
      await ref.delete();
    }
    console.log(`API key "${name}" deleted.`);
    return;
  }
  throw new Error(`No API key found with name "${name}".`);
}
|
|
821
|
+
/**
 * CLI handler for `grimoire apikey <create|list|delete> [name]`.
 *
 * Reads the subcommand from `process.argv[3]` and its argument from
 * `process.argv[4]`. Prints usage to stderr and exits with status 1 when the
 * subcommand is unknown or a required name is missing.
 *
 * @returns {Promise<void>}
 */
async function cmdApiKey() {
  const [, , , subcommand, arg] = process.argv;
  switch (subcommand) {
    case "create": {
      if (!arg) {
        console.error("Usage: grimoire apikey create <name>");
        process.exit(1);
      }
      const key = await createApiKey(arg);
      console.log(`
API key created for "${arg}":
`);
      console.log(` ${key}
`);
      console.log("Save this key \u2014 it will not be shown again.");
      break;
    }
    case "list":
      await listApiKeys();
      break;
    case "delete":
      if (!arg) {
        console.error("Usage: grimoire apikey delete <name>");
        process.exit(1);
      }
      await deleteApiKey(arg);
      break;
    default:
      console.error("Usage: grimoire apikey <create|list|delete> [name]");
      process.exit(1);
  }
}
|
|
849
|
+
|
|
850
|
+
// src/llms-ingest.ts
|
|
851
|
+
import { writeFile as writeFile3, mkdir as mkdir3 } from "node:fs/promises";
|
|
852
|
+
import { join as join3 } from "node:path";
|
|
853
|
+
// Page-chrome fragments that splitPages() strips from every page body.
// Each pattern carries the `g` and `m` flags and is anchored with ^...$, so
// it removes whole lines wherever they occur in the body.
var BOILERPLATE_PATTERNS = [
  // "[Skip to content](...)" link line.
  /^\[Skip to content\]\([^)]*\)\s*$/gm,
  // Feedback widget prompt.
  /^Was this helpful\?\s*$/gm,
  // Feedback widget buttons, rendered as the run-together text "YesNo".
  /^YesNo\s*$/gm,
  // "[ Edit page ](...) [ Report issue ](...)" link pair.
  /^\[ Edit page \]\([^)]+\) \[ Report issue \]\([^)]+\)\s*$/gm,
  // "Copy page" button label.
  /^Copy page\s*$/gm,
  // Fenced JSON block holding the schema.org BreadcrumbList; extractUrl()
  // reads the page URL from it before the body is cleaned.
  /^```json\n\{"@context":"https:\/\/schema\.org","@type":"BreadcrumbList"[^`]*```\s*$/gm
];
|
|
861
|
+
/**
 * Splits a concatenated llms-full document into individual pages.
 *
 * A page starts at every line sequence `---\ntitle: <title>\n` and runs up
 * to the next such start (or the end of the input). A page is skipped when
 * extractUrl() finds no URL in it, when no second `---` closes its
 * frontmatter, or when nothing is left after cleaning. Cleaning removes all
 * BOILERPLATE_PATTERNS matches, collapses runs of three or more newlines to
 * two, and trims the result.
 *
 * @param {string} content - Full text of the llms-full document.
 * @returns {Array<{title: string, url: string, markdown: string}>}
 */
function splitPages(content) {
  const boundaries = Array.from(
    content.matchAll(/^---\ntitle: (.+)\n/gm),
    (found) => ({ index: found.index, title: found[1] })
  );
  const pages = [];
  boundaries.forEach(({ index: start, title }, position) => {
    const following = boundaries[position + 1];
    const end = following ? following.index : content.length;
    const raw = content.slice(start, end).trimEnd();

    const url = extractUrl(raw);
    if (!url) return;

    // Search from offset 3 to step over the opening "---" and find the
    // fence that closes the frontmatter.
    const closingFence = raw.indexOf("---", 3);
    if (closingFence === -1) return;

    const body = raw.slice(raw.indexOf("\n", closingFence) + 1);
    const stripped = BOILERPLATE_PATTERNS.reduce(
      (text, pattern) => text.replace(pattern, ""),
      body
    );
    const markdown = stripped.replace(/\n{3,}/g, "\n\n").trim();
    if (!markdown) return;

    pages.push({ title, url, markdown });
  });
  return pages;
}
|
|
892
|
+
/**
 * Derives a page's URL from the schema.org BreadcrumbList JSON block
 * embedded in it.
 *
 * The `@id` of the last breadcrumb item is appended to `origin`.
 *
 * @param {string} pageContent - Raw text of one page.
 * @param {string} [origin="https://developers.cloudflare.com"] - Prefix put
 *   in front of the breadcrumb's `@id`; the default keeps the behaviour
 *   callers that pass a single argument already rely on.
 * @returns {string|null} The page URL, or null when the page has no
 *   breadcrumb block, the block is not valid JSON, or its last item has no
 *   `@id`.
 */
function extractUrl(pageContent, origin = "https://developers.cloudflare.com") {
  const match = pageContent.match(
    /```json\n\{"@context":"https:\/\/schema\.org","@type":"BreadcrumbList","itemListElement":\[(.+?)\]\}\n```/
  );
  if (!match) return null;

  let items;
  try {
    items = JSON.parse(`[${match[1]}]`);
  } catch {
    // A malformed breadcrumb is treated like a missing one so a single bad
    // page is skipped instead of aborting the whole ingest.
    return null;
  }

  const id = items.at(-1)?.item?.["@id"];
  if (!id) return null;
  return `${origin}${id}`;
}
|
|
902
|
+
/**
 * Downloads an llms-full document, splits it into pages and writes one
 * markdown file per page under `<dataDir>/markdown/<sourceName>/`.
 *
 * Each file is the frontmatter from buildFrontmatter(), a blank line, then
 * the cleaned page body; the file name is `<slugifyUrl(url)>.md`.
 * `onProgress` is called after every 100th page and after the last one.
 *
 * @param {string} llmsFullUrl - URL of the llms-full text document.
 * @param {string} sourceName - Source key; names the output directory.
 * @param {string} baseUrl - Accepted for callers; not read by this function.
 * @param {string} dataDir - Root data directory.
 * @param {(done: number, total: number) => void} [onProgress] - Progress callback.
 * @returns {Promise<Array<{source: string, url: string, title: string, markdown: string}>>}
 *   One record per written page; `markdown` includes the frontmatter.
 * @throws {Error} When the download does not return an OK status.
 */
async function ingestLlmsFull(llmsFullUrl, sourceName, baseUrl, dataDir, onProgress) {
  const response = await fetch(llmsFullUrl);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${llmsFullUrl}: ${response.status} ${response.statusText}`);
  }
  const pages = splitPages(await response.text());
  const mdDir = join3(dataDir, "markdown", sourceName);
  await mkdir3(mdDir, { recursive: true });

  const results = [];
  let done = 0;
  for (const { url, title, markdown } of pages) {
    const fullMarkdown = `${buildFrontmatter(sourceName, url, title)}

${markdown}`;
    const target = join3(mdDir, `${slugifyUrl(url)}.md`);
    await writeFile3(target, fullMarkdown, "utf-8");
    results.push({ source: sourceName, url, title, markdown: fullMarkdown });
    done += 1;
    if (onProgress && (done % 100 === 0 || done === pages.length)) {
      onProgress(done, pages.length);
    }
  }
  return results;
}
|
|
932
|
+
|
|
933
|
+
// src/admin.ts
|
|
934
|
+
// Directory one level above this module's own directory
// (assuming `resolve` is node:path's resolve — confirm against the imports).
var PROJECT_ROOT = resolve(import.meta.dirname, "..");
// Source definitions file: loaded by cmdRefresh and appended to by cmdAdd.
var CONFIG_PATH = join4(PROJECT_ROOT, "config", "sources.yaml");
// Root of the on-disk caches; the commands below use its "raw" and
// "markdown" subdirectories.
var DATA_DIR = join4(PROJECT_ROOT, "data");
|
|
937
|
+
/**
 * Promise wrapper around `rl.question`.
 *
 * @param {{question: Function}} rl - Object exposing a callback-style `question` method.
 * @param {string} question - Text shown to the user.
 * @returns {Promise<string>} Resolves with the value passed to the callback.
 */
function prompt(rl, question) {
  return new Promise((settle) => {
    rl.question(question, settle);
  });
}
|
|
940
|
+
/**
 * CLI handler for `grimoire add <name> --url <start_url>`.
 *
 * Opens the start URL in a browser, lists the page's navigation elements,
 * asks the user which one to use and which URL patterns to include/exclude,
 * detects a content selector, removable elements and a sitemap, then appends
 * the resulting source entry to CONFIG_PATH.
 *
 * Exits with status 1 on missing arguments, when the page has no navigation
 * elements, or on an invalid menu choice. The browser and the readline
 * interface are released even when a step throws.
 *
 * @returns {Promise<void>}
 */
async function cmdAdd() {
  const args = parseArgs({
    args: process.argv.slice(3),
    options: {
      url: { type: "string" }
    },
    allowPositionals: true
  });
  const name = args.positionals[0];
  const url = args.values.url;
  if (!name || !url) {
    console.error("Usage: grimoire add <name> --url <start_url>");
    process.exit(1);
  }
  console.log("Scanning page...\n");
  const browser = await createBrowser();
  // Declared outside the try so the finally block can close it on errors.
  let rl;
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    await page.goto(url, { waitUntil: "domcontentloaded" });

    // Runs inside the page: one candidate per distinct navigation element,
    // ordered by how many links it contains (most first).
    const navCandidates = await page.evaluate(() => {
      const selectors = ["nav", "[role='navigation']"];
      const results = [];
      const seen = new Set();
      for (const sel of selectors) {
        for (const el of document.querySelectorAll(sel)) {
          if (seen.has(el)) continue;
          seen.add(el);
          const links = el.querySelectorAll("a[href]");
          const label = el.getAttribute("aria-label") || el.getAttribute("class") || el.tagName.toLowerCase();
          results.push({
            selector: sel,
            label,
            linkCount: links.length
          });
        }
      }
      return results.sort((a, b) => b.linkCount - a.linkCount);
    });
    if (navCandidates.length === 0) {
      console.error("No navigation elements found on this page.");
      process.exit(1);
    }
    console.log("Navigation candidates:");
    navCandidates.forEach((c, i) => {
      console.log(` [${i + 1}] ${c.selector} (${c.label}) \u2014 ${c.linkCount} links`);
    });

    rl = createInterface({ input: process.stdin, output: process.stdout });
    const navChoice = await prompt(rl, "\nSelect navigation: ");
    const navIndex = Number.parseInt(navChoice, 10) - 1;
    if (Number.isNaN(navIndex) || navIndex < 0 || navIndex >= navCandidates.length) {
      console.error("Invalid selection.");
      rl.close();
      process.exit(1);
    }
    const selectedNav = navCandidates[navIndex];

    // Parsed once; reused below for the sitemap origin.
    const parsedUrl = new URL(url);
    const defaultPattern = parsedUrl.pathname.replace(/\/$/, "");
    const allLinks = await page.$$eval(
      `${selectedNav.selector} a[href]`,
      (links, pattern) => {
        return [...new Set(
          links.map((a) => a.href).filter((h) => h.startsWith("http") && !h.includes("?hl=") && !h.endsWith("#") && h.includes(pattern))
        )];
      },
      defaultPattern
    );
    console.log(`
Found ${allLinks.length} links matching ${defaultPattern}`);
    const patternInput = await prompt(rl, `Include pattern [default: ${defaultPattern}]: `);
    const includePattern = patternInput.trim() || defaultPattern;
    const excludeInput = await prompt(rl, "Exclude patterns (comma-separated, optional): ");
    // Drop empty entries such as the middle one in "a,,b".
    const excludePatterns = excludeInput.trim()
      ? excludeInput.split(",").map((p) => p.trim()).filter(Boolean)
      : void 0;
    rl.close();
    rl = void 0;

    const contentSelector = await page.evaluate(() => {
      if (document.querySelector("article")) return "article";
      if (document.querySelector("main")) return "main";
      return "body";
    });

    // Report exactly the selectors that matched something on the page, so
    // what is written to remove_selectors is guaranteed to match. (Grouped
    // candidates previously reported a fixed label, e.g. ".breadcrumbs",
    // even when only ".breadcrumb" was present.)
    const removeSelectors = await page.evaluate(() => {
      const candidates = [
        "nav",
        "footer",
        "[role='complementary']",
        "[role='banner']",
        ".breadcrumbs",
        ".breadcrumb",
        ".pagination-nav",
        ".pagination"
      ];
      return candidates.filter((selector) => document.querySelector(selector) !== null);
    });
    if (removeSelectors.length > 0) {
      console.log(`
Detected removable elements: ${removeSelectors.join(", ")}`);
    }

    let sitemapUrl;
    try {
      const candidateSitemap = `${parsedUrl.origin}/sitemap.xml`;
      const sitemapCheck = await page.goto(candidateSitemap, { waitUntil: "domcontentloaded", timeout: 1e4 });
      if (sitemapCheck && sitemapCheck.status() === 200) {
        const body = await page.textContent("body");
        if (body && (body.includes("<urlset") || body.includes("<sitemapindex"))) {
          sitemapUrl = candidateSitemap;
          console.log(`
Sitemap found: ${sitemapUrl}`);
        }
      }
    } catch {
      // Best effort: a missing or unreachable sitemap just means the entry
      // is written without sitemap_url.
    }

    const source = {
      name: name.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase()),
      start_url: url,
      ...sitemapUrl ? { sitemap_url: sitemapUrl } : {},
      nav_selector: selectedNav.selector,
      content_selector: contentSelector,
      include_patterns: [includePattern],
      ...excludePatterns ? { exclude_patterns: excludePatterns } : {},
      ...removeSelectors.length > 0 ? { remove_selectors: removeSelectors } : {}
    };

    let existingContent = "";
    try {
      existingContent = await readFile3(CONFIG_PATH, "utf-8");
    } catch {
      // No readable config yet: start a new document.
      existingContent = "sources:\n";
    }
    const newEntry = stringify({ [name]: source }, { indent: 2 });
    // NOTE(review): the entry is nested under `sources:` by prefixing each
    // non-empty line; confirm the prefix width matches the indentation of
    // existing entries in sources.yaml.
    const indented = newEntry.split("\n").map((line) => line.trim() ? ` ${line}` : "").join("\n");
    await writeFile4(CONFIG_PATH, existingContent.trimEnd() + "\n" + indented, "utf-8");
    console.log(`
Source "${name}" added to config/sources.yaml`);
    console.log(`Run "grimoire refresh ${name}" to start scraping.`);
  } finally {
    rl?.close();
    await browser.close();
  }
}
|
|
1075
|
+
/**
 * Loads a JSON embeddings cache from disk.
 *
 * @param {string} cachePath - Path of the cache file.
 * @returns {Promise<*|null>} The parsed JSON, or null when the file cannot
 *   be read or does not contain valid JSON.
 */
async function loadEmbeddingsCache(cachePath) {
  try {
    return JSON.parse(await readFile3(cachePath, "utf-8"));
  } catch {
    return null;
  }
}
|
|
1083
|
+
/**
 * Embeds texts via embedTexts(), checkpointing to a cache file.
 *
 * Ensures `rawDir` exists, then loads the cache. The cached embeddings are
 * handed to embedTexts as `resumeFrom` only when the cache is non-empty and
 * shorter than `texts`; otherwise `resumeFrom` is undefined. Every checkpoint
 * value is written to the cache file as JSON and each progress event is
 * logged.
 *
 * @param {string[]} texts - Texts to embed.
 * @param {string} rawDir - Directory to create before embedding.
 * @param {string} embeddingsCachePath - Path of the checkpoint file.
 * @returns {Promise<*>} Whatever embedTexts() resolves with.
 */
async function embedWithCheckpoint(texts, rawDir, embeddingsCachePath) {
  await mkdir4(rawDir, { recursive: true });
  const partialCache = await loadEmbeddingsCache(embeddingsCachePath);
  const canResume = partialCache && partialCache.length > 0 && partialCache.length < texts.length;

  const logProgress = (done, total) => {
    console.log(` [${done}/${total}] embedded`);
  };
  const saveCheckpoint = async (current) => {
    await writeFile4(embeddingsCachePath, JSON.stringify(current), "utf-8");
  };

  return embedTexts(texts, {
    onProgress: logProgress,
    onCheckpoint: saveCheckpoint,
    resumeFrom: canResume ? partialCache : void 0
  });
}
|
|
1097
|
+
/**
 * Writes a source's chunks and embeddings to the store, then updates the
 * source metadata.
 *
 * With `diff` truthy, chunk ids currently stored for the source but absent
 * from `allChunks` are deleted and all of `allChunks` are upserted. Otherwise
 * the source is purged first and everything is stored from scratch.
 *
 * @param {string} sourceName - Source key.
 * @param {Array<{id: string}>} allChunks - Chunks to store.
 * @param {Array} embeddings - Embeddings passed alongside the chunks to storeChunks().
 * @param {number} urlCount - Page count recorded in the metadata.
 * @param {string} [version] - Version label recorded in the metadata.
 * @param {boolean} [diff] - Selects the diff strategy when truthy.
 * @returns {Promise<void>}
 */
async function storeWithStrategy(sourceName, allChunks, embeddings, urlCount, version, diff) {
  const reportStored = (cur, total) => {
    console.log(` [${cur}/${total}] stored`);
  };

  if (!diff) {
    console.log(" Purging old chunks...");
    await purgeSource(sourceName);
    console.log(" Storing in Firestore...");
  } else {
    console.log(" Computing diff...");
    const existingIds = await getSourceChunkIds(sourceName);
    const incomingIds = new Set(allChunks.map((chunk) => chunk.id));
    const stale = [...existingIds].filter((id) => !incomingIds.has(id));
    console.log(` Diff: ${stale.length} to delete, ${allChunks.length} to upsert (${existingIds.size} existing)`);
    if (stale.length > 0) {
      console.log(" Deleting removed chunks...");
      await deleteChunksByIds(stale, (cur, total) => {
        console.log(` [${cur}/${total}] deleted`);
      });
    }
    console.log(" Upserting chunks...");
  }

  await storeChunks(allChunks, embeddings, reportStored);
  await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
  console.log(` Done. ${allChunks.length} chunks stored for "${sourceName}".`);
}
|
|
1125
|
+
/**
 * Reads every cached `.md` page of a source from `mdDir`.
 *
 * The page URL and title are taken from the `url: "..."` and `title: "..."`
 * lines of each file, falling back to "" and "Untitled". A directory that
 * cannot be listed yields an empty array.
 *
 * @param {string} mdDir - Markdown cache directory of one source.
 * @returns {Promise<Array<{markdown: string, url: string, title: string}>>}
 */
async function readCachedMarkdownPages(mdDir) {
  const entries = await readdir(mdDir).catch(() => []);
  const pages = [];
  for (const file of entries.filter((f) => f.endsWith(".md"))) {
    const content = await readFile3(join4(mdDir, file), "utf-8");
    const urlMatch = content.match(/^url: "(.+)"$/m);
    const titleMatch = content.match(/^title: "(.+)"$/m);
    pages.push({
      markdown: content,
      url: urlMatch?.[1] ?? "",
      title: titleMatch?.[1] ?? "Untitled"
    });
  }
  return pages;
}

/**
 * Rebuilds the URL list of a source from its cached `.html` files when no
 * urls.json is available.
 *
 * For each file, the canonical link and then the og:url meta tag are tried;
 * a candidate is accepted only when slugifyUrl() of it equals the file name.
 * Otherwise a placeholder `https://recovered/<slug>` URL is used.
 *
 * @param {string} rawDir - Raw HTML cache directory of one source.
 * @returns {Promise<string[]>} One URL per cached HTML file.
 */
async function recoverCachedHtmlUrls(rawDir) {
  const rawFiles = await readdir(rawDir);
  const recovered = [];
  for (const f of rawFiles.filter((entry) => entry.endsWith(".html"))) {
    const fileSlug = f.replace(/\.html$/, "");
    const html = await readFile3(join4(rawDir, f), "utf-8");
    const match = html.match(/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/);
    if (match && slugifyUrl(match[1]) === fileSlug) {
      recovered.push(match[1]);
      continue;
    }
    const ogMatch = html.match(/<meta[^>]+property="og:url"[^>]+content="([^"]+)"/);
    if (ogMatch && slugifyUrl(ogMatch[1]) === fileSlug) {
      recovered.push(ogMatch[1]);
      continue;
    }
    recovered.push(`https://recovered/${fileSlug}`);
  }
  return recovered;
}

/**
 * CLI handler for `grimoire refresh`.
 *
 * For each selected source (one positional name, or every configured source
 * with `--all`) the pipeline is: obtain pages, chunk, embed, store.
 * Flags choose where the pipeline starts:
 * - `--from-embeddings`: reuse cached markdown and the cached embeddings;
 * - `--from-markdown`: reuse cached markdown, re-embed;
 * - `--from-html`: reuse cached HTML, re-convert;
 * - default: fetch llms-full.txt when the source defines `llms_full_url`,
 *   otherwise scrape.
 * `--full` purges stored chunks and both caches first, `--skip-store` stops
 * before writing to Firestore, `--diff` selects the diff store strategy,
 * `--limit` caps the URL list and `--concurrency` overrides the source's
 * concurrency.
 *
 * Exits with status 1 on usage errors, unknown sources, or missing/mismatched
 * caches.
 *
 * @returns {Promise<void>}
 */
async function cmdRefresh() {
  const args = parseArgs({
    args: process.argv.slice(3),
    options: {
      full: { type: "boolean", default: false },
      all: { type: "boolean", default: false },
      diff: { type: "boolean", default: false },
      concurrency: { type: "string" },
      limit: { type: "string" },
      "from-html": { type: "boolean", default: false },
      "from-markdown": { type: "boolean", default: false },
      "from-embeddings": { type: "boolean", default: false },
      "skip-store": { type: "boolean", default: false }
    },
    allowPositionals: true
  });
  const config = await loadConfig(CONFIG_PATH);
  const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
  if (!args.values.all && !sourcesToRefresh[0]) {
    console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--from-embeddings] [--skip-store] [--limit <n>] [--concurrency <n>]");
    process.exit(1);
  }
  // A non-numeric value parses to NaN, which is falsy and therefore ignored
  // by the checks below.
  const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
  const urlLimit = args.values.limit ? parseInt(args.values.limit, 10) : void 0;

  for (const sourceName of sourcesToRefresh) {
    const source = config.sources[sourceName];
    if (!source) {
      console.error(`Source "${sourceName}" not found in config.`);
      process.exit(1);
    }
    if (concurrencyOverride) {
      source.concurrency = concurrencyOverride;
    }
    const rawDir = join4(DATA_DIR, "raw", sourceName);
    const mdDir = join4(DATA_DIR, "markdown", sourceName);
    const embeddingsCachePath = join4(rawDir, "embeddings.json");
    console.log(`
Refreshing "${sourceName}"...`);

    if (args.values.full) {
      console.log(" Purging existing chunks...");
      const deleted = await purgeSource(sourceName);
      console.log(` Deleted ${deleted} chunks.`);
      await rm(rawDir, { recursive: true, force: true });
      await rm(mdDir, { recursive: true, force: true });
    }

    if (args.values["from-embeddings"]) {
      console.log(" Loading cached embeddings...");
      const cached = await loadEmbeddingsCache(embeddingsCachePath);
      if (!cached) {
        console.error(" No cached embeddings found. Run without --from-embeddings first.");
        process.exit(1);
      }
      const allPages = await readCachedMarkdownPages(mdDir);
      // Same guard as --from-markdown: without cached pages there is nothing
      // to pair the embeddings with, and continuing could purge the source
      // and store zero chunks.
      if (allPages.length === 0) {
        console.error(" No cached markdown found. Run with --from-html first.");
        process.exit(1);
      }
      console.log(" Chunking...");
      // NOTE(review): cached embeddings are matched to chunks purely by
      // position; this presumes the directory listing yields pages in the
      // order used when the cache was written — confirm.
      const allChunks2 = allPages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
      console.log(` Created ${allChunks2.length} chunks.`);
      if (cached.length !== allChunks2.length) {
        console.error(` Embeddings cache (${cached.length}) doesn't match chunk count (${allChunks2.length}). Re-embed with --from-html.`);
        process.exit(1);
      }
      if (args.values["skip-store"]) {
        console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
        continue;
      }
      await storeWithStrategy(sourceName, allChunks2, cached, allPages.length, source.version, args.values.diff);
      continue;
    }

    if (args.values["from-markdown"]) {
      console.log(" Reading cached markdown...");
      const pages2 = await readCachedMarkdownPages(mdDir);
      if (pages2.length === 0) {
        console.error(" No cached markdown found. Run with --from-html first.");
        process.exit(1);
      }
      console.log(` Found ${pages2.length} cached pages.`);
      console.log(" Chunking...");
      const allChunks2 = pages2.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
      console.log(` Created ${allChunks2.length} chunks.`);
      console.log(" Embedding chunks...");
      const texts2 = allChunks2.map((c) => c.content);
      const embeddings2 = await embedWithCheckpoint(texts2, rawDir, embeddingsCachePath);
      if (args.values["skip-store"]) {
        console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
        continue;
      }
      await storeWithStrategy(sourceName, allChunks2, embeddings2, pages2.length, source.version, args.values.diff);
      continue;
    }

    let pages;
    if (source.llms_full_url && !args.values["from-html"]) {
      console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
      pages = await ingestLlmsFull(
        source.llms_full_url,
        sourceName,
        source.start_url,
        DATA_DIR,
        (cur, total) => {
          console.log(` [${cur}/${total}] pages processed`);
        }
      );
      console.log(` Extracted ${pages.length} pages.`);
    } else {
      let urls;
      if (args.values["from-html"]) {
        console.log(" Reading URLs from cached HTML...");
        const urlsJsonPath = join4(rawDir, "urls.json");
        try {
          urls = JSON.parse(await readFile3(urlsJsonPath, "utf-8"));
        } catch {
          // No usable urls.json: reconstruct the list from the HTML files.
          urls = await recoverCachedHtmlUrls(rawDir);
        }
        console.log(` Found ${urls.length} cached pages.`);
      } else {
        console.log(" Scraping URLs...");
        urls = await scrapeSource(source, sourceName, DATA_DIR, (cur, total, url) => {
          console.log(` [${cur}/${total}] ${url}`);
        });
        console.log(` Found ${urls.length} pages.`);
      }
      if (urlLimit && urls.length > urlLimit) {
        urls = urls.slice(0, urlLimit);
        console.log(` Limited to ${urlLimit} pages.`);
      }
      console.log(" Converting to markdown...");
      pages = await convertSource(
        sourceName,
        urls,
        source.content_selector,
        source.remove_selectors,
        source.remove_text_patterns,
        DATA_DIR,
        source.concurrency,
        (cur, total) => {
          if (cur % 10 === 0 || cur === total) console.log(` [${cur}/${total}] converted`);
        }
      );
    }

    console.log(" Chunking...");
    const allChunks = pages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
    console.log(` Created ${allChunks.length} chunks.`);
    console.log(" Embedding chunks...");
    const texts = allChunks.map((c) => c.content);
    const embeddings = await embedWithCheckpoint(texts, rawDir, embeddingsCachePath);
    if (args.values["skip-store"]) {
      console.log(` Done. ${allChunks.length} chunks ready (skipped Firestore).`);
      continue;
    }
    await storeWithStrategy(sourceName, allChunks, embeddings, pages.length, source.version, args.values.diff);
  }
}
|
|
1316
|
+
/**
 * CLI handler for `grimoire search "<query>" [--source <name>] [--top <n>] [--compact]`.
 *
 * All positional arguments are joined with spaces to form the query.
 * `--compact` prints one pipe-separated line per result; otherwise each
 * result is printed as a small block (title and score, URL, heading path,
 * content on one line).
 *
 * Exits with status 1 when the query is empty or `--top` is not a positive
 * integer.
 *
 * @returns {Promise<void>}
 */
async function cmdSearch() {
  const args = parseArgs({
    args: process.argv.slice(3),
    options: {
      source: { type: "string" },
      top: { type: "string" },
      compact: { type: "boolean", default: false }
    },
    allowPositionals: true
  });
  const query = args.positionals.join(" ");
  if (!query) {
    console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--compact]');
    process.exit(1);
  }

  let topN;
  if (args.values.top) {
    topN = Number.parseInt(args.values.top, 10);
    // Reject bad values up front: a NaN would bypass search()'s default and
    // silently produce an empty result list.
    if (!Number.isInteger(topN) || topN < 1) {
      console.error(`Invalid --top value "${args.values.top}": expected a positive integer.`);
      process.exit(1);
    }
  }

  const results = await search(query, { source: args.values.source, topN });
  if (results.length === 0) {
    console.log("No results found.");
    return;
  }
  if (args.values.compact) {
    for (const r of results) {
      console.log(`${r.relevance_score.toFixed(4)} | ${r.source} | ${r.title} | ${r.heading_path.join(" > ")} | ${r.url}`);
    }
    return;
  }
  results.forEach((r, i) => {
    console.log(`
${bold(`[${i + 1}] ${r.title}`)} (${r.relevance_score.toFixed(4)})`);
    console.log(` ${cyan(r.url)}`);
    console.log(` ${yellow(r.heading_path.join(" > "))}`);
    console.log(` ${r.content.replace(/\n/g, " ")}`);
  });
}
|
|
1352
|
+
/**
 * CLI handler for `grimoire list [--names]`.
 *
 * Prints every source that has stored metadata: just the source keys with
 * `--names`, otherwise the key, optional version, chunk and URL counts and
 * the last refresh time.
 *
 * @returns {Promise<void>}
 */
async function cmdList() {
  const { values } = parseArgs({
    args: process.argv.slice(3),
    options: {
      names: { type: "boolean", default: false }
    },
    allowPositionals: true
  });
  const metas = await getAllSourcesMeta();
  if (metas.length === 0) {
    console.log("No sources have been refreshed yet.");
    return;
  }
  if (values.names) {
    metas.forEach((meta) => {
      console.log(meta.source);
    });
    return;
  }
  console.log("\nSources:\n");
  for (const { source, version, chunk_count, url_count, last_refreshed } of metas) {
    const ver = version ? ` v${version}` : "";
    console.log(` ${bold(source)}${ver}`);
    console.log(` ${chunk_count} chunks, ${url_count} URLs, last refreshed ${last_refreshed}`);
  }
}
|
|
1378
|
+
/**
 * CLI handler for `grimoire stats`.
 *
 * Prints chunk count, URL count and last refresh time for every source with
 * stored metadata, followed by the totals across all of them.
 *
 * @returns {Promise<void>}
 */
async function cmdStats() {
  const metas = await getAllSourcesMeta();
  if (metas.length === 0) {
    console.log("No sources have been refreshed yet.");
    return;
  }
  console.log("\nSource Statistics:\n");
  metas.forEach((meta) => {
    const ver = meta.version ? ` v${meta.version}` : "";
    console.log(` ${bold(meta.source)}${ver}`);
    console.log(` Chunks: ${meta.chunk_count}`);
    console.log(` URLs: ${meta.url_count}`);
    console.log(` Last refreshed: ${meta.last_refreshed}`);
  });
  const totalChunks = metas.reduce((sum, meta) => sum + meta.chunk_count, 0);
  const totalUrls = metas.reduce((sum, meta) => sum + meta.url_count, 0);
  console.log(`
Total: ${totalChunks} chunks across ${totalUrls} URLs from ${metas.length} sources`);
}
|
|
1399
|
+
/**
 * `export <source> [--format json]` command: dump a source's markdown pages.
 *
 * Reads every `.md` file directly inside `<DATA_DIR>/markdown/<source>` and
 * writes a JSON array of `{ file, content }` objects (2-space indented) to
 * stdout.
 *
 * Exits the process with status 1 when:
 *   - no source name is given,
 *   - `--format` is anything other than `json` (the only implemented format),
 *   - the source's markdown directory cannot be read.
 *
 * @returns {Promise<void>}
 */
async function cmdExport() {
  const args = parseArgs({
    args: process.argv.slice(3),
    options: {
      format: { type: "string", default: "json" }
    },
    allowPositionals: true
  });
  const sourceName = args.positionals[0];
  if (!sourceName) {
    console.error("Usage: grimoire export <source> [--format json]");
    process.exit(1);
  }

  // Only JSON output exists. Reject other values instead of silently
  // ignoring the flag and emitting JSON anyway.
  const format = String(args.values.format).toLowerCase();
  if (format !== "json") {
    console.error(`Unsupported export format "${args.values.format}". Supported formats: json.`);
    process.exit(1);
  }

  const mdDir = join4(DATA_DIR, "markdown", sourceName);
  let files;
  try {
    files = await readdir(mdDir);
  } catch (err) {
    const code = err && typeof err === "object" ? err.code : undefined;
    if (code === void 0 || code === "ENOENT" || code === "ENOTDIR") {
      console.error(`No markdown data found for source "${sourceName}".`);
    } else {
      // The directory exists but could not be read (e.g. permissions):
      // report the real cause rather than claiming the data is missing.
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`Failed to read markdown data for source "${sourceName}": ${reason}`);
    }
    process.exit(1);
  }

  // Files are read one at a time so a large source cannot exhaust file
  // descriptors; output order follows the directory listing.
  const pages = [];
  for (const file of files.filter((f) => f.endsWith(".md"))) {
    const content = await readFile3(join4(mdDir, file), "utf-8");
    pages.push({ file, content });
  }
  console.log(JSON.stringify(pages, null, 2));
}
|
|
1427
|
+
/**
 * `delete <source>` command: remove a source's chunks and its metadata.
 *
 * The source name is taken verbatim from `process.argv[3]`. The command
 * exits with status 1 when the name is missing or when `getSourceMeta()`
 * returns nothing for it. Otherwise it calls `purgeSource()` first and
 * `deleteSourceMeta()` second, then reports the count `purgeSource()` returned.
 *
 * @returns {Promise<void>}
 */
async function cmdDelete() {
  const [, , , sourceName] = process.argv;
  if (!sourceName) {
    console.error("Usage: grimoire delete <source>");
    process.exit(1);
  }

  const existing = await getSourceMeta(sourceName);
  if (!existing) {
    console.error(`Source "${sourceName}" not found in Firestore.`);
    process.exit(1);
  }

  const { chunk_count: expectedChunks } = existing;
  console.log(`Deleting "${sourceName}" (${expectedChunks} chunks)...`);

  // Chunks are purged before the metadata record is removed.
  const purgedCount = await purgeSource(sourceName);
  await deleteSourceMeta(sourceName);

  console.log(`Deleted ${purgedCount} chunks and source metadata for "${sourceName}".`);
}
|
|
1443
|
+
/**
 * `scrape-urls <source> [--concurrency <n>]` command: fetch the raw HTML for
 * every URL of a source that is not cached on disk yet.
 *
 * The URL list is read from `<DATA_DIR>/raw/<source>/urls.json`. A URL counts
 * as cached when `<slugifyUrl(url)>.html` already exists in that directory.
 * Missing URLs are fetched in batches of `concurrency` pages: the value comes
 * from `--concurrency`, else the source's `concurrency` config entry, else 20.
 * A failure to load a single URL is logged and does not stop the run.
 *
 * Exits the process with status 1 when the source name is missing, the
 * source is not in the config, `urls.json` cannot be read/parsed or is not
 * an array, or the resolved concurrency is not a number >= 1.
 *
 * @returns {Promise<void>}
 */
async function cmdScrapeUrls() {
  const args = parseArgs({
    args: process.argv.slice(3),
    options: {
      concurrency: { type: "string" }
    },
    allowPositionals: true
  });
  const sourceName = args.positionals[0];
  if (!sourceName) {
    console.error("Usage: grimoire scrape-urls <source> [--concurrency <n>]");
    process.exit(1);
  }
  const config = await loadConfig(CONFIG_PATH);
  const source = config.sources[sourceName];
  if (!source) {
    console.error(`Source "${sourceName}" not found in config.`);
    process.exit(1);
  }
  const rawDir = join4(DATA_DIR, "raw", sourceName);
  const urlsPath = join4(rawDir, "urls.json");
  let urls;
  try {
    urls = JSON.parse(await readFile3(urlsPath, "utf-8"));
  } catch {
    console.error(`No urls.json found for "${sourceName}". Run 'grimoire refresh ${sourceName} --skip-store' first.`);
    process.exit(1);
  }
  // A file that parses but is not a list would otherwise crash on `.filter`.
  if (!Array.isArray(urls)) {
    console.error(`Invalid urls.json for "${sourceName}": expected a JSON array of URLs.`);
    process.exit(1);
  }
  const missing = urls.filter((url) => !existsSync(join4(rawDir, `${slugifyUrl(url)}.html`)));
  console.log(`\nTotal: ${urls.length}, Cached: ${urls.length - missing.length}, Missing: ${missing.length}`);
  if (missing.length === 0) {
    console.log("Nothing to scrape.");
    return;
  }

  // The batch loop advances by `concurrency`. Zero or a negative value would
  // never terminate and NaN would skip every URL, so validate before any
  // browser is launched.
  const cliConcurrency = args.values.concurrency;
  const concurrency = cliConcurrency ? Number.parseInt(cliConcurrency, 10) : source.concurrency ?? 20;
  if (!Number.isFinite(concurrency) || concurrency < 1) {
    console.error(`Invalid concurrency "${cliConcurrency || source.concurrency}": expected a positive integer.`);
    process.exit(1);
  }

  const browser = await createBrowser();
  let done = 0;
  try {
    const context = await browser.newContext(source.user_agent ? { userAgent: source.user_agent } : {});
    for (let i = 0; i < missing.length; i += concurrency) {
      const batch = missing.slice(i, i + concurrency);
      await Promise.all(batch.map(async (url) => {
        const page = await context.newPage();
        try {
          await page.goto(url, { waitUntil: source.headed ? "networkidle" : "domcontentloaded", timeout: 3e4 });
          const html = await page.content();
          await writeFile4(join4(rawDir, `${slugifyUrl(url)}.html`), html, "utf-8");
          done++;
          if (done % 10 === 0 || done === missing.length) console.log(` [${done}/${missing.length}]`);
        } catch (e) {
          // Per-URL failures are reported and skipped; the batch continues.
          console.error(` FAILED: ${url} - ${e instanceof Error ? e.message : String(e)}`);
        } finally {
          await page.close();
        }
      }));
    }
    console.log(`Done. Fetched ${done} pages.`);
  } finally {
    // Always release the browser, even if creating a context/page throws.
    await browser.close();
  }
}
|
|
1502
|
+
/**
 * Dispatch table for the admin CLI: maps each sub-command name to the async
 * handler function that implements it. Exported for use by other modules.
 */
var ADMIN_COMMANDS = {
  // Commands that modify a source.
  add: cmdAdd,
  refresh: cmdRefresh,
  delete: cmdDelete,
  "scrape-urls": cmdScrapeUrls,

  // Commands that query or dump stored data.
  search: cmdSearch,
  list: cmdList,
  stats: cmdStats,
  export: cmdExport,

  // API key handling.
  apikey: cmdApiKey
};
|
|
1513
|
+
export {
|
|
1514
|
+
ADMIN_COMMANDS
|
|
1515
|
+
};
|
|
1516
|
+
//# sourceMappingURL=admin-HA6FNUV4.js.map
|