@astrofoundry/grimoire 3.30.0 → 3.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{admin-AKV4CA2O.js → admin-WUNBDKBC.js} +782 -277
- package/dist/admin-WUNBDKBC.js.map +7 -0
- package/dist/chunk-R46N6C3C.js +40 -0
- package/dist/{chunk-BRS6X3AE.js.map → chunk-R46N6C3C.js.map} +1 -1
- package/dist/cli.js +38 -22
- package/dist/cli.js.map +2 -2
- package/package.json +2 -1
- package/dist/admin-AKV4CA2O.js.map +0 -7
- package/dist/chunk-BRS6X3AE.js +0 -12
|
@@ -1,12 +1,372 @@
|
|
|
1
1
|
import {
|
|
2
|
+
__commonJS,
|
|
3
|
+
__toESM,
|
|
2
4
|
bold,
|
|
3
5
|
cyan,
|
|
4
6
|
yellow
|
|
5
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-R46N6C3C.js";
|
|
8
|
+
|
|
9
|
+
// node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
|
|
10
|
+
var require_turndown_plugin_gfm_cjs = __commonJS({
|
|
11
|
+
"node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js"(exports) {
|
|
12
|
+
"use strict";
|
|
13
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
14
|
+
var highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/;
|
|
15
|
+
function highlightedCodeBlock(turndownService) {
|
|
16
|
+
turndownService.addRule("highlightedCodeBlock", {
|
|
17
|
+
filter: function(node) {
|
|
18
|
+
var firstChild = node.firstChild;
|
|
19
|
+
return node.nodeName === "DIV" && highlightRegExp.test(node.className) && firstChild && firstChild.nodeName === "PRE";
|
|
20
|
+
},
|
|
21
|
+
replacement: function(content, node, options) {
|
|
22
|
+
var className = node.className || "";
|
|
23
|
+
var language = (className.match(highlightRegExp) || [null, ""])[1];
|
|
24
|
+
return "\n\n" + options.fence + language + "\n" + node.firstChild.textContent + "\n" + options.fence + "\n\n";
|
|
25
|
+
}
|
|
26
|
+
});
|
|
27
|
+
}
|
|
28
|
+
function strikethrough(turndownService) {
|
|
29
|
+
turndownService.addRule("strikethrough", {
|
|
30
|
+
filter: ["del", "s", "strike"],
|
|
31
|
+
replacement: function(content) {
|
|
32
|
+
return "~~" + content + "~~";
|
|
33
|
+
}
|
|
34
|
+
});
|
|
35
|
+
}
|
|
36
|
+
var indexOf = Array.prototype.indexOf;
|
|
37
|
+
var every = Array.prototype.every;
|
|
38
|
+
var rules = {};
|
|
39
|
+
var alignMap = { left: ":---", right: "---:", center: ":---:" };
|
|
40
|
+
var isCodeBlock_ = null;
|
|
41
|
+
var options_ = null;
|
|
42
|
+
var tableShouldBeSkippedCache_ = /* @__PURE__ */ new WeakMap();
|
|
43
|
+
function getAlignment(node) {
|
|
44
|
+
return node ? (node.getAttribute("align") || node.style.textAlign || "").toLowerCase() : "";
|
|
45
|
+
}
|
|
46
|
+
function getBorder(alignment) {
|
|
47
|
+
return alignment ? alignMap[alignment] : "---";
|
|
48
|
+
}
|
|
49
|
+
function getColumnAlignment(table, columnIndex) {
|
|
50
|
+
var votes = {
|
|
51
|
+
left: 0,
|
|
52
|
+
right: 0,
|
|
53
|
+
center: 0,
|
|
54
|
+
"": 0
|
|
55
|
+
};
|
|
56
|
+
var align = "";
|
|
57
|
+
for (var i = 0; i < table.rows.length; ++i) {
|
|
58
|
+
var row = table.rows[i];
|
|
59
|
+
if (columnIndex < row.childNodes.length) {
|
|
60
|
+
var cellAlignment = getAlignment(row.childNodes[columnIndex]);
|
|
61
|
+
++votes[cellAlignment];
|
|
62
|
+
if (votes[cellAlignment] > votes[align]) {
|
|
63
|
+
align = cellAlignment;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
return align;
|
|
68
|
+
}
|
|
69
|
+
rules.tableCell = {
|
|
70
|
+
filter: ["th", "td"],
|
|
71
|
+
replacement: function(content, node) {
|
|
72
|
+
if (tableShouldBeSkipped(nodeParentTable(node))) return content;
|
|
73
|
+
return cell(content, node);
|
|
74
|
+
}
|
|
75
|
+
};
|
|
76
|
+
rules.tableRow = {
|
|
77
|
+
filter: "tr",
|
|
78
|
+
replacement: function(content, node) {
|
|
79
|
+
const parentTable = nodeParentTable(node);
|
|
80
|
+
if (tableShouldBeSkipped(parentTable)) return content;
|
|
81
|
+
var borderCells = "";
|
|
82
|
+
if (isHeadingRow(node)) {
|
|
83
|
+
const colCount = tableColCount(parentTable);
|
|
84
|
+
for (var i = 0; i < colCount; i++) {
|
|
85
|
+
const childNode = i < node.childNodes.length ? node.childNodes[i] : null;
|
|
86
|
+
var border = getBorder(getColumnAlignment(parentTable, i));
|
|
87
|
+
borderCells += cell(border, childNode, i);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
return "\n" + content + (borderCells ? "\n" + borderCells : "");
|
|
91
|
+
}
|
|
92
|
+
};
|
|
93
|
+
rules.table = {
|
|
94
|
+
filter: function(node, options) {
|
|
95
|
+
return node.nodeName === "TABLE";
|
|
96
|
+
},
|
|
97
|
+
replacement: function(content, node) {
|
|
98
|
+
if (tableShouldBeHtml(node, options_)) {
|
|
99
|
+
let html = node.outerHTML;
|
|
100
|
+
let divParent = nodeParentDiv(node);
|
|
101
|
+
if (divParent === null || !divParent.classList.contains("joplin-table-wrapper")) {
|
|
102
|
+
return `
|
|
103
|
+
|
|
104
|
+
<div class="joplin-table-wrapper">${html}</div>
|
|
105
|
+
|
|
106
|
+
`;
|
|
107
|
+
} else {
|
|
108
|
+
return html;
|
|
109
|
+
}
|
|
110
|
+
} else {
|
|
111
|
+
if (tableShouldBeSkipped(node)) return content;
|
|
112
|
+
content = content.replace(/\n+/g, "\n");
|
|
113
|
+
var secondLine = content.trim().split("\n");
|
|
114
|
+
if (secondLine.length >= 2) secondLine = secondLine[1];
|
|
115
|
+
var secondLineIsDivider = /\| :?---/.test(secondLine);
|
|
116
|
+
var columnCount = tableColCount(node);
|
|
117
|
+
var emptyHeader = "";
|
|
118
|
+
if (columnCount && !secondLineIsDivider) {
|
|
119
|
+
emptyHeader = "|" + " |".repeat(columnCount) + "\n|";
|
|
120
|
+
for (var columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
|
|
121
|
+
emptyHeader += " " + getBorder(getColumnAlignment(node, columnIndex)) + " |";
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
const captionNode = node.querySelector ? node.querySelector("caption") : node.caption;
|
|
125
|
+
const captionContent = captionNode ? captionNode.textContent || "" : "";
|
|
126
|
+
const caption = captionContent ? `${captionContent}
|
|
127
|
+
|
|
128
|
+
` : "";
|
|
129
|
+
const tableContent = `${emptyHeader}${content}`.trimStart();
|
|
130
|
+
return `
|
|
131
|
+
|
|
132
|
+
${caption}${tableContent}
|
|
133
|
+
|
|
134
|
+
`;
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
};
|
|
138
|
+
rules.tableCaption = {
|
|
139
|
+
filter: ["caption"],
|
|
140
|
+
replacement: () => ""
|
|
141
|
+
};
|
|
142
|
+
rules.tableColgroup = {
|
|
143
|
+
filter: ["colgroup", "col"],
|
|
144
|
+
replacement: () => ""
|
|
145
|
+
};
|
|
146
|
+
rules.tableSection = {
|
|
147
|
+
filter: ["thead", "tbody", "tfoot"],
|
|
148
|
+
replacement: function(content) {
|
|
149
|
+
return content;
|
|
150
|
+
}
|
|
151
|
+
};
|
|
152
|
+
function isHeadingRow(tr) {
|
|
153
|
+
var parentNode = tr.parentNode;
|
|
154
|
+
return parentNode.nodeName === "THEAD" || parentNode.firstChild === tr && (parentNode.nodeName === "TABLE" || isFirstTbody(parentNode)) && every.call(tr.childNodes, function(n) {
|
|
155
|
+
return n.nodeName === "TH";
|
|
156
|
+
});
|
|
157
|
+
}
|
|
158
|
+
function isFirstTbody(element) {
|
|
159
|
+
var previousSibling = element.previousSibling;
|
|
160
|
+
return element.nodeName === "TBODY" && (!previousSibling || previousSibling.nodeName === "THEAD" && /^\s*$/i.test(previousSibling.textContent));
|
|
161
|
+
}
|
|
162
|
+
function cell(content, node = null, index = null) {
|
|
163
|
+
if (index === null) index = indexOf.call(node.parentNode.childNodes, node);
|
|
164
|
+
var prefix = " ";
|
|
165
|
+
if (index === 0) prefix = "| ";
|
|
166
|
+
let filteredContent = content.trim().replace(/\n\r/g, "<br>").replace(/\n/g, "<br>");
|
|
167
|
+
filteredContent = filteredContent.replace(/\|+/g, "\\|");
|
|
168
|
+
while (filteredContent.length < 3) filteredContent += " ";
|
|
169
|
+
if (node) filteredContent = handleColSpan(filteredContent, node, " ");
|
|
170
|
+
return prefix + filteredContent + " |";
|
|
171
|
+
}
|
|
172
|
+
function nodeContainsTable(node) {
|
|
173
|
+
if (!node.childNodes) return false;
|
|
174
|
+
for (let i = 0; i < node.childNodes.length; i++) {
|
|
175
|
+
const child = node.childNodes[i];
|
|
176
|
+
if (child.nodeName === "TABLE") return true;
|
|
177
|
+
if (nodeContainsTable(child)) return true;
|
|
178
|
+
}
|
|
179
|
+
return false;
|
|
180
|
+
}
|
|
181
|
+
var nodeContains = (node, types) => {
|
|
182
|
+
if (!node.childNodes) return false;
|
|
183
|
+
for (let i = 0; i < node.childNodes.length; i++) {
|
|
184
|
+
const child = node.childNodes[i];
|
|
185
|
+
if (types === "code" && isCodeBlock_ && isCodeBlock_(child)) return true;
|
|
186
|
+
if (types.includes(child.nodeName)) return true;
|
|
187
|
+
if (nodeContains(child, types)) return true;
|
|
188
|
+
}
|
|
189
|
+
return false;
|
|
190
|
+
};
|
|
191
|
+
var customStyleProperties = [
|
|
192
|
+
"background-color",
|
|
193
|
+
"background",
|
|
194
|
+
"border-color",
|
|
195
|
+
"border",
|
|
196
|
+
"border-top",
|
|
197
|
+
"border-right",
|
|
198
|
+
"border-bottom",
|
|
199
|
+
"border-left",
|
|
200
|
+
"border-style",
|
|
201
|
+
"border-width",
|
|
202
|
+
"padding",
|
|
203
|
+
"padding-top",
|
|
204
|
+
"padding-right",
|
|
205
|
+
"padding-bottom",
|
|
206
|
+
"padding-left",
|
|
207
|
+
"float",
|
|
208
|
+
"margin-left",
|
|
209
|
+
"margin-right"
|
|
210
|
+
];
|
|
211
|
+
var customAttributeNames = [
|
|
212
|
+
"bgcolor",
|
|
213
|
+
"bordercolor",
|
|
214
|
+
"background"
|
|
215
|
+
];
|
|
216
|
+
var nodeHasCustomStyle = (node) => {
|
|
217
|
+
if (!node || !node.getAttribute) return false;
|
|
218
|
+
const styleAttr = node.getAttribute("style");
|
|
219
|
+
if (!styleAttr) return false;
|
|
220
|
+
const properties = styleAttr.split(";").map((s) => s.split(":")[0].trim().toLowerCase()).filter((s) => s.length > 0);
|
|
221
|
+
for (let i = 0; i < properties.length; i++) {
|
|
222
|
+
if (customStyleProperties.includes(properties[i])) return true;
|
|
223
|
+
}
|
|
224
|
+
return false;
|
|
225
|
+
};
|
|
226
|
+
var hasNonDefaultSpacingAttribute = (node, name) => {
|
|
227
|
+
if (!node || !node.getAttribute) return false;
|
|
228
|
+
const value = node.getAttribute(name);
|
|
229
|
+
if (value === null) return false;
|
|
230
|
+
const normalisedValue = `${value}`.trim().toLowerCase();
|
|
231
|
+
if (!normalisedValue) return false;
|
|
232
|
+
if (normalisedValue === "0" || normalisedValue === "0px") return false;
|
|
233
|
+
return true;
|
|
234
|
+
};
|
|
235
|
+
var nodeHasCustomAttributes = (node) => {
|
|
236
|
+
if (!node || !node.getAttribute) return false;
|
|
237
|
+
for (let i = 0; i < customAttributeNames.length; i++) {
|
|
238
|
+
const value = node.getAttribute(customAttributeNames[i]);
|
|
239
|
+
if (value !== null && `${value}`.trim() !== "") return true;
|
|
240
|
+
}
|
|
241
|
+
if (node.nodeName === "TABLE") {
|
|
242
|
+
if (hasNonDefaultSpacingAttribute(node, "cellpadding")) return true;
|
|
243
|
+
if (hasNonDefaultSpacingAttribute(node, "cellspacing")) return true;
|
|
244
|
+
}
|
|
245
|
+
return false;
|
|
246
|
+
};
|
|
247
|
+
var nodeHasCustomFormatting = (node) => {
|
|
248
|
+
return nodeHasCustomStyle(node) || nodeHasCustomAttributes(node);
|
|
249
|
+
};
|
|
250
|
+
var tableHasCustomStyles = (tableNode) => {
|
|
251
|
+
if (nodeHasCustomFormatting(tableNode)) return true;
|
|
252
|
+
const rows = tableNode.rows;
|
|
253
|
+
if (!rows) return false;
|
|
254
|
+
for (let i = 0; i < rows.length; i++) {
|
|
255
|
+
const row = rows[i];
|
|
256
|
+
if (nodeHasCustomFormatting(row)) return true;
|
|
257
|
+
for (let j = 0; j < row.childNodes.length; j++) {
|
|
258
|
+
const cell2 = row.childNodes[j];
|
|
259
|
+
if ((cell2.nodeName === "TD" || cell2.nodeName === "TH") && nodeHasCustomFormatting(cell2)) {
|
|
260
|
+
return true;
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
return false;
|
|
265
|
+
};
|
|
266
|
+
var tableShouldBeHtml = (tableNode, options) => {
|
|
267
|
+
const possibleTags = [
|
|
268
|
+
"UL",
|
|
269
|
+
"OL",
|
|
270
|
+
"H1",
|
|
271
|
+
"H2",
|
|
272
|
+
"H3",
|
|
273
|
+
"H4",
|
|
274
|
+
"H5",
|
|
275
|
+
"H6",
|
|
276
|
+
"HR",
|
|
277
|
+
"BLOCKQUOTE"
|
|
278
|
+
];
|
|
279
|
+
if (options.preserveNestedTables) possibleTags.push("TABLE");
|
|
280
|
+
return nodeContains(tableNode, "code") || nodeContains(tableNode, possibleTags) || options.preserveTableStyles && tableHasCustomStyles(tableNode);
|
|
281
|
+
};
|
|
282
|
+
function tableShouldBeSkipped(tableNode) {
|
|
283
|
+
const cached = tableShouldBeSkippedCache_.get(tableNode);
|
|
284
|
+
if (cached !== void 0) return cached;
|
|
285
|
+
const result = tableShouldBeSkipped_(tableNode);
|
|
286
|
+
tableShouldBeSkippedCache_.set(tableNode, result);
|
|
287
|
+
return result;
|
|
288
|
+
}
|
|
289
|
+
function tableShouldBeSkipped_(tableNode) {
|
|
290
|
+
if (!tableNode) return true;
|
|
291
|
+
if (!tableNode.rows) return true;
|
|
292
|
+
if (tableNode.rows.length === 1 && tableNode.rows[0].childNodes.length <= 1) return true;
|
|
293
|
+
if (nodeContainsTable(tableNode)) return true;
|
|
294
|
+
return false;
|
|
295
|
+
}
|
|
296
|
+
function nodeParentDiv(node) {
|
|
297
|
+
let parent = node.parentNode;
|
|
298
|
+
while (parent.nodeName !== "DIV") {
|
|
299
|
+
parent = parent.parentNode;
|
|
300
|
+
if (!parent) return null;
|
|
301
|
+
}
|
|
302
|
+
return parent;
|
|
303
|
+
}
|
|
304
|
+
function nodeParentTable(node) {
|
|
305
|
+
let parent = node.parentNode;
|
|
306
|
+
while (parent.nodeName !== "TABLE") {
|
|
307
|
+
parent = parent.parentNode;
|
|
308
|
+
if (!parent) return null;
|
|
309
|
+
}
|
|
310
|
+
return parent;
|
|
311
|
+
}
|
|
312
|
+
function handleColSpan(content, node, emptyChar) {
|
|
313
|
+
const colspan = node.getAttribute("colspan") || 1;
|
|
314
|
+
for (let i = 1; i < colspan; i++) {
|
|
315
|
+
content += " | " + emptyChar.repeat(3);
|
|
316
|
+
}
|
|
317
|
+
return content;
|
|
318
|
+
}
|
|
319
|
+
function tableColCount(node) {
|
|
320
|
+
let maxColCount = 0;
|
|
321
|
+
for (let i = 0; i < node.rows.length; i++) {
|
|
322
|
+
const row = node.rows[i];
|
|
323
|
+
const colCount = row.childNodes.length;
|
|
324
|
+
if (colCount > maxColCount) maxColCount = colCount;
|
|
325
|
+
}
|
|
326
|
+
return maxColCount;
|
|
327
|
+
}
|
|
328
|
+
function tables2(turndownService) {
|
|
329
|
+
isCodeBlock_ = turndownService.isCodeBlock;
|
|
330
|
+
options_ = turndownService.options;
|
|
331
|
+
turndownService.keep(function(node) {
|
|
332
|
+
if (node.nodeName === "TABLE" && tableShouldBeHtml(node, turndownService.options)) return true;
|
|
333
|
+
return false;
|
|
334
|
+
});
|
|
335
|
+
for (var key in rules) turndownService.addRule(key, rules[key]);
|
|
336
|
+
}
|
|
337
|
+
function taskListItems(turndownService) {
|
|
338
|
+
turndownService.addRule("taskListItems", {
|
|
339
|
+
filter: function(node) {
|
|
340
|
+
const parent = node.parentNode;
|
|
341
|
+
const grandparent = parent.parentNode;
|
|
342
|
+
const grandparentIsListItem = !!grandparent && grandparent.nodeName === "LI";
|
|
343
|
+
return (node.type === "checkbox" || node.getAttribute("role") === "checkbox") && (parent.nodeName === "LI" || parent.nodeName === "LABEL" && grandparentIsListItem || parent.nodeName === "SPAN" && grandparentIsListItem);
|
|
344
|
+
},
|
|
345
|
+
replacement: function(content, node) {
|
|
346
|
+
const checked = node.nodeName === "INPUT" ? node.checked : node.getAttribute("aria-checked") === "true";
|
|
347
|
+
return (checked ? "[x]" : "[ ]") + " ";
|
|
348
|
+
}
|
|
349
|
+
});
|
|
350
|
+
}
|
|
351
|
+
function gfm(turndownService) {
|
|
352
|
+
turndownService.use([
|
|
353
|
+
highlightedCodeBlock,
|
|
354
|
+
strikethrough,
|
|
355
|
+
tables2,
|
|
356
|
+
taskListItems
|
|
357
|
+
]);
|
|
358
|
+
}
|
|
359
|
+
exports.gfm = gfm;
|
|
360
|
+
exports.highlightedCodeBlock = highlightedCodeBlock;
|
|
361
|
+
exports.strikethrough = strikethrough;
|
|
362
|
+
exports.tables = tables2;
|
|
363
|
+
exports.taskListItems = taskListItems;
|
|
364
|
+
}
|
|
365
|
+
});
|
|
6
366
|
|
|
7
367
|
// src/admin.ts
|
|
8
368
|
import { parseArgs } from "node:util";
|
|
9
|
-
import { readFile as readFile3, writeFile as writeFile4, readdir, rm
|
|
369
|
+
import { readFile as readFile3, writeFile as writeFile4, readdir, rm } from "node:fs/promises";
|
|
10
370
|
import { existsSync } from "node:fs";
|
|
11
371
|
import { join as join4, resolve } from "node:path";
|
|
12
372
|
import { createInterface } from "node:readline";
|
|
@@ -107,10 +467,14 @@ async function loadConfig(path) {
|
|
|
107
467
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
108
468
|
import { join } from "node:path";
|
|
109
469
|
import { chromium } from "playwright";
|
|
470
|
+
|
|
471
|
+
// src/slug.ts
|
|
110
472
|
function slugifyUrl(url) {
|
|
111
473
|
const parsed = new URL(url);
|
|
112
474
|
return parsed.pathname.replace(/^\//, "").replace(/\/$/, "").replace(/\//g, "-").replace(/[^a-zA-Z0-9-]/g, "");
|
|
113
475
|
}
|
|
476
|
+
|
|
477
|
+
// src/scraper.ts
|
|
114
478
|
function filterUrls(urls, includePatterns, excludePatterns) {
|
|
115
479
|
let filtered = urls.filter(
|
|
116
480
|
(url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#")
|
|
@@ -267,6 +631,7 @@ async function createBrowser() {
|
|
|
267
631
|
}
|
|
268
632
|
|
|
269
633
|
// src/converter.ts
|
|
634
|
+
var import_turndown_plugin_gfm = __toESM(require_turndown_plugin_gfm_cjs(), 1);
|
|
270
635
|
import { readFile as readFile2, writeFile as writeFile2, mkdir as mkdir2 } from "node:fs/promises";
|
|
271
636
|
import { join as join2 } from "node:path";
|
|
272
637
|
import { JSDOM } from "jsdom";
|
|
@@ -276,6 +641,7 @@ var turndown = new TurndownService({
|
|
|
276
641
|
codeBlockStyle: "fenced",
|
|
277
642
|
bulletListMarker: "-"
|
|
278
643
|
});
|
|
644
|
+
turndown.use(import_turndown_plugin_gfm.tables);
|
|
279
645
|
var GENERIC_REMOVE = [
|
|
280
646
|
"style",
|
|
281
647
|
"script",
|
|
@@ -293,26 +659,57 @@ function cleanMarkdown(md, textPatterns) {
|
|
|
293
659
|
}
|
|
294
660
|
return cleaned.trim();
|
|
295
661
|
}
|
|
662
|
+
function flattenTableCellMarkup(contentEl) {
|
|
663
|
+
for (const table of contentEl.querySelectorAll("table")) {
|
|
664
|
+
if (!table.isConnected) continue;
|
|
665
|
+
for (const inner of table.querySelectorAll("table")) {
|
|
666
|
+
if (!inner.isConnected) continue;
|
|
667
|
+
const rows = [...inner.rows].map(
|
|
668
|
+
(row) => [...row.cells].map((c) => c.textContent?.replace(/\s+/g, " ").trim() ?? "").filter(Boolean).join(" ")
|
|
669
|
+
).filter(Boolean);
|
|
670
|
+
inner.replaceWith(rows.join("; "));
|
|
671
|
+
}
|
|
672
|
+
for (const code of table.querySelectorAll("code")) {
|
|
673
|
+
code.replaceWith(code.textContent ?? "");
|
|
674
|
+
}
|
|
675
|
+
for (const list of table.querySelectorAll("ul, ol")) {
|
|
676
|
+
if (!list.isConnected) continue;
|
|
677
|
+
const items = [...list.querySelectorAll("li")].map((li) => li.textContent?.replace(/\s+/g, " ").trim() ?? "").filter(Boolean);
|
|
678
|
+
list.replaceWith(items.join("; "));
|
|
679
|
+
}
|
|
680
|
+
for (const block of table.querySelectorAll("aside, blockquote, h1, h2, h3, h4, h5, h6")) {
|
|
681
|
+
if (!block.isConnected) continue;
|
|
682
|
+
block.replaceWith(block.textContent?.replace(/\s+/g, " ").trim() ?? "");
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
function cleanTableRowBreaks(md) {
|
|
687
|
+
return md.split("\n").map(
|
|
688
|
+
(line) => line.startsWith("|") ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line
|
|
689
|
+
).join("\n");
|
|
690
|
+
}
|
|
296
691
|
function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
|
|
297
692
|
const dom = new JSDOM(html);
|
|
298
693
|
const doc = dom.window.document;
|
|
299
|
-
const contentEl = doc.querySelector(contentSelector);
|
|
300
|
-
if (!contentEl) {
|
|
301
|
-
return cleanMarkdown(turndown.turndown(doc.body.innerHTML), removeTextPatterns);
|
|
302
|
-
}
|
|
694
|
+
const contentEl = doc.querySelector(contentSelector) ?? doc.body;
|
|
303
695
|
const allSelectors = [...GENERIC_REMOVE, ...removeSelectors ?? []];
|
|
304
696
|
for (const selector of allSelectors) {
|
|
305
697
|
for (const el of contentEl.querySelectorAll(selector)) {
|
|
306
698
|
el.remove();
|
|
307
699
|
}
|
|
308
700
|
}
|
|
309
|
-
|
|
701
|
+
flattenTableCellMarkup(contentEl);
|
|
702
|
+
return cleanMarkdown(
|
|
703
|
+
cleanTableRowBreaks(turndown.turndown(contentEl.innerHTML)),
|
|
704
|
+
removeTextPatterns
|
|
705
|
+
);
|
|
310
706
|
}
|
|
311
707
|
function extractTitle(html) {
|
|
312
708
|
const dom = new JSDOM(html);
|
|
313
|
-
const
|
|
314
|
-
if (!
|
|
315
|
-
|
|
709
|
+
const raw = dom.window.document.querySelector("title")?.textContent?.replace(/\s+/g, " ").trim();
|
|
710
|
+
if (!raw) return "Untitled";
|
|
711
|
+
const stripped = raw.replace(/\s*\|[^|]*$/, "").replace(/\s*[\u2013\u2014][^\u2013\u2014]*$/, "").replace(/\s+-\s+(?!.*\s-\s)[^|]*$/, "").trim();
|
|
712
|
+
return stripped || raw;
|
|
316
713
|
}
|
|
317
714
|
function buildFrontmatter(source, url, title) {
|
|
318
715
|
return [
|
|
@@ -379,6 +776,13 @@ function buildChunkId(source, url, headingSlug, index) {
|
|
|
379
776
|
const base = `${prefix}${truncatedSlug}`;
|
|
380
777
|
return index !== void 0 ? `${base}-${index}` : base;
|
|
381
778
|
}
|
|
779
|
+
function buildEmbedText(chunk) {
|
|
780
|
+
const path = chunk.heading_path[0] === chunk.title ? chunk.heading_path.slice(1) : chunk.heading_path;
|
|
781
|
+
const context = [chunk.title, ...path].filter(Boolean).join(" > ");
|
|
782
|
+
return context ? `${context}
|
|
783
|
+
|
|
784
|
+
${chunk.content}` : chunk.content;
|
|
785
|
+
}
|
|
382
786
|
function parseHeadingSections(markdown) {
|
|
383
787
|
const lines = markdown.split("\n");
|
|
384
788
|
const sections = [];
|
|
@@ -431,26 +835,135 @@ function parseHeadingSections(markdown) {
|
|
|
431
835
|
}
|
|
432
836
|
return sections;
|
|
433
837
|
}
|
|
434
|
-
function
|
|
435
|
-
|
|
436
|
-
|
|
838
|
+
function isTableLine(line) {
|
|
839
|
+
return line.trimStart().startsWith("|");
|
|
840
|
+
}
|
|
841
|
+
function isTableSeparator(line) {
|
|
842
|
+
return /^\s*\|?[\s:|-]+\|?\s*$/.test(line) && line.includes("-");
|
|
843
|
+
}
|
|
844
|
+
function parseBlocks(lines) {
|
|
845
|
+
const blocks = [];
|
|
437
846
|
let current = [];
|
|
438
|
-
let
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
current
|
|
444
|
-
|
|
847
|
+
let kind = "paragraph";
|
|
848
|
+
let inFence = false;
|
|
849
|
+
let fenceMarker = "";
|
|
850
|
+
function flush() {
|
|
851
|
+
while (current.length > 0 && current[current.length - 1].trim() === "") {
|
|
852
|
+
current.pop();
|
|
853
|
+
}
|
|
854
|
+
if (current.length > 0) {
|
|
855
|
+
blocks.push({ kind, lines: current });
|
|
856
|
+
}
|
|
857
|
+
current = [];
|
|
858
|
+
kind = "paragraph";
|
|
859
|
+
}
|
|
860
|
+
for (const line of lines) {
|
|
861
|
+
if (inFence) {
|
|
862
|
+
current.push(line);
|
|
863
|
+
const fenceMatch2 = line.match(/^\s*(```+|~~~+)/);
|
|
864
|
+
if (fenceMatch2 && fenceMatch2[1][0] === fenceMarker) {
|
|
865
|
+
inFence = false;
|
|
866
|
+
flush();
|
|
867
|
+
}
|
|
868
|
+
continue;
|
|
869
|
+
}
|
|
870
|
+
const fenceMatch = line.match(/^\s*(```+|~~~+)/);
|
|
871
|
+
if (fenceMatch) {
|
|
872
|
+
flush();
|
|
873
|
+
kind = "fence";
|
|
874
|
+
inFence = true;
|
|
875
|
+
fenceMarker = fenceMatch[1][0];
|
|
876
|
+
current.push(line);
|
|
877
|
+
continue;
|
|
878
|
+
}
|
|
879
|
+
if (isTableLine(line)) {
|
|
880
|
+
if (kind !== "table") {
|
|
881
|
+
flush();
|
|
882
|
+
kind = "table";
|
|
883
|
+
}
|
|
884
|
+
current.push(line);
|
|
885
|
+
continue;
|
|
886
|
+
}
|
|
887
|
+
if (kind === "table") {
|
|
888
|
+
flush();
|
|
889
|
+
}
|
|
890
|
+
if (line.trim() === "") {
|
|
891
|
+
flush();
|
|
445
892
|
} else {
|
|
446
|
-
current.push(
|
|
447
|
-
currentTokens += paraTokens;
|
|
893
|
+
current.push(line);
|
|
448
894
|
}
|
|
449
895
|
}
|
|
450
|
-
|
|
451
|
-
|
|
896
|
+
flush();
|
|
897
|
+
return blocks;
|
|
898
|
+
}
|
|
899
|
+
function groupLines(lines, budget) {
|
|
900
|
+
const groups = [];
|
|
901
|
+
let current = [];
|
|
902
|
+
let tokens = 0;
|
|
903
|
+
for (const line of lines) {
|
|
904
|
+
const lineTokens = estimateTokens(line) + 1;
|
|
905
|
+
if (tokens + lineTokens > budget && current.length > 0) {
|
|
906
|
+
groups.push(current);
|
|
907
|
+
current = [];
|
|
908
|
+
tokens = 0;
|
|
909
|
+
}
|
|
910
|
+
current.push(line);
|
|
911
|
+
tokens += lineTokens;
|
|
912
|
+
}
|
|
913
|
+
if (current.length > 0) groups.push(current);
|
|
914
|
+
return groups;
|
|
915
|
+
}
|
|
916
|
+
function splitBlock(block, budget) {
|
|
917
|
+
if (block.kind === "table") {
|
|
918
|
+
const hasHeader = block.lines.length >= 2 && isTableSeparator(block.lines[1]);
|
|
919
|
+
const header = hasHeader ? block.lines.slice(0, 2) : [];
|
|
920
|
+
const rows = hasHeader ? block.lines.slice(2) : block.lines;
|
|
921
|
+
const headerTokens = estimateTokens(header.join("\n"));
|
|
922
|
+
return groupLines(rows, Math.max(budget - headerTokens, 50)).map(
|
|
923
|
+
(group) => [...header, ...group].join("\n")
|
|
924
|
+
);
|
|
925
|
+
}
|
|
926
|
+
if (block.kind === "fence") {
|
|
927
|
+
const opening = block.lines[0];
|
|
928
|
+
const closing = block.lines[block.lines.length - 1].match(/^\s*(```+|~~~+)\s*$/) ? block.lines[block.lines.length - 1] : opening.match(/^\s*(```+|~~~+)/)[1];
|
|
929
|
+
const body = block.lines.slice(1, block.lines[block.lines.length - 1] === closing ? -1 : void 0);
|
|
930
|
+
const frameTokens = estimateTokens(opening) + estimateTokens(closing) + 2;
|
|
931
|
+
return groupLines(body, Math.max(budget - frameTokens, 50)).map(
|
|
932
|
+
(group) => [opening, ...group, closing].join("\n")
|
|
933
|
+
);
|
|
934
|
+
}
|
|
935
|
+
return groupLines(block.lines, budget).map((group) => group.join("\n"));
|
|
936
|
+
}
|
|
937
|
+
function splitSectionIntoParts(blocks, headingLine, maxTokens) {
|
|
938
|
+
const budget = Math.max(maxTokens - estimateTokens(headingLine), 100);
|
|
939
|
+
const parts = [];
|
|
940
|
+
let current = [];
|
|
941
|
+
let tokens = 0;
|
|
942
|
+
function flush() {
|
|
943
|
+
if (current.length > 0) {
|
|
944
|
+
parts.push(current);
|
|
945
|
+
current = [];
|
|
946
|
+
tokens = 0;
|
|
947
|
+
}
|
|
948
|
+
}
|
|
949
|
+
for (const block of blocks) {
|
|
950
|
+
const text = block.lines.join("\n");
|
|
951
|
+
const blockTokens = estimateTokens(text) + 2;
|
|
952
|
+
if (blockTokens > budget) {
|
|
953
|
+
flush();
|
|
954
|
+
for (const piece of splitBlock(block, budget)) {
|
|
955
|
+
parts.push([piece]);
|
|
956
|
+
}
|
|
957
|
+
continue;
|
|
958
|
+
}
|
|
959
|
+
if (tokens + blockTokens > budget) {
|
|
960
|
+
flush();
|
|
961
|
+
}
|
|
962
|
+
current.push(text);
|
|
963
|
+
tokens += blockTokens;
|
|
452
964
|
}
|
|
453
|
-
|
|
965
|
+
flush();
|
|
966
|
+
return parts.map((blockTexts) => headingLine + blockTexts.join("\n\n"));
|
|
454
967
|
}
|
|
455
968
|
function stripFrontmatter(markdown) {
|
|
456
969
|
if (markdown.startsWith("---")) {
|
|
@@ -484,11 +997,11 @@ function chunkMarkdown(markdown, source, url, title) {
|
|
|
484
997
|
const headingLine = section.heading ? `${"#".repeat(section.level)} ${section.heading}
|
|
485
998
|
|
|
486
999
|
` : "";
|
|
487
|
-
const
|
|
1000
|
+
const body = section.lines.join("\n").trim();
|
|
1001
|
+
const content = headingLine + body;
|
|
488
1002
|
if (!content.trim()) continue;
|
|
489
1003
|
const headingSlug = section.heading ? slugifyHeading(section.heading) : "intro";
|
|
490
|
-
|
|
491
|
-
if (tokens <= MAX_TOKENS) {
|
|
1004
|
+
if (estimateTokens(content) <= MAX_TOKENS) {
|
|
492
1005
|
chunks.push({
|
|
493
1006
|
id: uniqueId(headingSlug),
|
|
494
1007
|
source,
|
|
@@ -496,29 +1009,56 @@ function chunkMarkdown(markdown, source, url, title) {
|
|
|
496
1009
|
title,
|
|
497
1010
|
heading_path: section.headingPath,
|
|
498
1011
|
content,
|
|
499
|
-
token_count:
|
|
1012
|
+
token_count: estimateTokens(content)
|
|
1013
|
+
});
|
|
1014
|
+
continue;
|
|
1015
|
+
}
|
|
1016
|
+
const blocks = parseBlocks(body.split("\n"));
|
|
1017
|
+
const parts = splitSectionIntoParts(blocks, headingLine, MAX_TOKENS);
|
|
1018
|
+
for (let i = 0; i < parts.length; i++) {
|
|
1019
|
+
const partContent = parts[i].trim();
|
|
1020
|
+
if (!partContent) continue;
|
|
1021
|
+
const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
|
|
1022
|
+
chunks.push({
|
|
1023
|
+
id: uniqueId(partSlug),
|
|
1024
|
+
source,
|
|
1025
|
+
url,
|
|
1026
|
+
title,
|
|
1027
|
+
heading_path: section.headingPath,
|
|
1028
|
+
content: partContent,
|
|
1029
|
+
token_count: estimateTokens(partContent)
|
|
500
1030
|
});
|
|
501
|
-
} else {
|
|
502
|
-
const parts = splitAtParagraphBoundaries(content, MAX_TOKENS);
|
|
503
|
-
for (let i = 0; i < parts.length; i++) {
|
|
504
|
-
const partContent = parts[i].trim();
|
|
505
|
-
if (!partContent) continue;
|
|
506
|
-
const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
|
|
507
|
-
chunks.push({
|
|
508
|
-
id: uniqueId(partSlug),
|
|
509
|
-
source,
|
|
510
|
-
url,
|
|
511
|
-
title,
|
|
512
|
-
heading_path: section.headingPath,
|
|
513
|
-
content: partContent,
|
|
514
|
-
token_count: estimateTokens(partContent)
|
|
515
|
-
});
|
|
516
|
-
}
|
|
517
1031
|
}
|
|
518
1032
|
}
|
|
519
1033
|
return chunks;
|
|
520
1034
|
}
|
|
521
1035
|
|
|
1036
|
+
// src/tokens.ts
|
|
1037
|
+
import { createHash } from "node:crypto";
|
|
1038
|
+
var IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
|
|
1039
|
+
var MIN_TOKEN_LENGTH = 4;
|
|
1040
|
+
var MAX_TOKEN_LENGTH = 80;
|
|
1041
|
+
var MAX_TOKENS_PER_CHUNK = 100;
|
|
1042
|
+
function normalizeForTokens(text) {
|
|
1043
|
+
return text.replace(/\]\([^)]*\)/g, "]").replace(/https?:\/\/\S+/g, " ").replace(/\\([_*[\]()#`~-])/g, "$1");
|
|
1044
|
+
}
|
|
1045
|
+
function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
|
|
1046
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1047
|
+
for (const match of normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN)) {
|
|
1048
|
+
const token = match[0].toLowerCase();
|
|
1049
|
+
if (token.length < MIN_TOKEN_LENGTH || token.length > MAX_TOKEN_LENGTH) continue;
|
|
1050
|
+
seen.add(token);
|
|
1051
|
+
if (seen.size >= limit) break;
|
|
1052
|
+
}
|
|
1053
|
+
return [...seen];
|
|
1054
|
+
}
|
|
1055
|
+
function extractQueryTokens(query) {
|
|
1056
|
+
return extractIdentifierTokens(query, 5);
|
|
1057
|
+
}
|
|
1058
|
+
function contentHash(text) {
|
|
1059
|
+
return createHash("sha256").update(text).digest("hex");
|
|
1060
|
+
}
|
|
1061
|
+
|
|
522
1062
|
// src/embedder.ts
|
|
523
1063
|
import { GoogleGenerativeAI } from "@google/generative-ai";
|
|
524
1064
|
var BATCH_SIZE = 50;
|
|
@@ -528,7 +1068,6 @@ var MAX_RETRIES = 5;
|
|
|
528
1068
|
var RATE_LIMIT_BASE_DELAY_MS = 6e4;
|
|
529
1069
|
var NETWORK_BASE_DELAY_MS = 1e4;
|
|
530
1070
|
var BATCH_DELAY_MS = 2500;
|
|
531
|
-
var DEFAULT_CHECKPOINT_EVERY_BATCHES = 20;
|
|
532
1071
|
var NETWORK_ERROR_PATTERNS = [
|
|
533
1072
|
"fetch failed",
|
|
534
1073
|
"ECONNRESET",
|
|
@@ -562,22 +1101,10 @@ function classifyError(message) {
|
|
|
562
1101
|
async function embedTexts(texts, options = {}) {
|
|
563
1102
|
const client = getClient();
|
|
564
1103
|
const model = client.getGenerativeModel({ model: MODEL });
|
|
565
|
-
const { onProgress
|
|
566
|
-
const checkpointEveryBatches = options.checkpointEveryBatches ?? DEFAULT_CHECKPOINT_EVERY_BATCHES;
|
|
1104
|
+
const { onProgress } = options;
|
|
567
1105
|
const maxRetries = options.maxRetries ?? MAX_RETRIES;
|
|
568
|
-
const embeddings =
|
|
569
|
-
|
|
570
|
-
if (embeddings.length > startIndex) {
|
|
571
|
-
embeddings.length = startIndex;
|
|
572
|
-
}
|
|
573
|
-
if (startIndex > 0) {
|
|
574
|
-
console.log(` Resuming from chunk ${startIndex} of ${texts.length} (${embeddings.length} cached).`);
|
|
575
|
-
}
|
|
576
|
-
if (startIndex >= texts.length) {
|
|
577
|
-
return embeddings.slice(0, texts.length);
|
|
578
|
-
}
|
|
579
|
-
let batchesSinceCheckpoint = 0;
|
|
580
|
-
for (let i = startIndex; i < texts.length; i += BATCH_SIZE) {
|
|
1106
|
+
const embeddings = [];
|
|
1107
|
+
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
|
|
581
1108
|
const batch = texts.slice(i, i + BATCH_SIZE);
|
|
582
1109
|
const batchNumber = i / BATCH_SIZE + 1;
|
|
583
1110
|
let result;
|
|
@@ -609,18 +1136,10 @@ async function embedTexts(texts, options = {}) {
|
|
|
609
1136
|
embeddings.push(embedding.values);
|
|
610
1137
|
}
|
|
611
1138
|
onProgress?.(Math.min(i + BATCH_SIZE, texts.length), texts.length);
|
|
612
|
-
batchesSinceCheckpoint++;
|
|
613
|
-
if (onCheckpoint && batchesSinceCheckpoint >= checkpointEveryBatches && i + BATCH_SIZE < texts.length) {
|
|
614
|
-
await onCheckpoint(embeddings);
|
|
615
|
-
batchesSinceCheckpoint = 0;
|
|
616
|
-
}
|
|
617
1139
|
if (i + BATCH_SIZE < texts.length) {
|
|
618
1140
|
await new Promise((resolve2) => setTimeout(resolve2, BATCH_DELAY_MS));
|
|
619
1141
|
}
|
|
620
1142
|
}
|
|
621
|
-
if (onCheckpoint) {
|
|
622
|
-
await onCheckpoint(embeddings);
|
|
623
|
-
}
|
|
624
1143
|
return embeddings;
|
|
625
1144
|
}
|
|
626
1145
|
async function embedText(text) {
|
|
@@ -701,6 +1220,8 @@ async function storeChunks(chunks, embeddings, onProgress) {
|
|
|
701
1220
|
heading_path: chunk.heading_path,
|
|
702
1221
|
content: chunk.content,
|
|
703
1222
|
token_count: chunk.token_count,
|
|
1223
|
+
tokens: extractIdentifierTokens(chunk.content),
|
|
1224
|
+
content_hash: contentHash(buildEmbedText(chunk)),
|
|
704
1225
|
embedded_at: (/* @__PURE__ */ new Date()).toISOString(),
|
|
705
1226
|
embedding: FieldValue.vector(embSlice[j])
|
|
706
1227
|
});
|
|
@@ -767,10 +1288,38 @@ async function deleteChunksByIds(ids, onProgress) {
|
|
|
767
1288
|
onProgress?.(Math.min(i + BATCH_SIZE2, ids.length), ids.length);
|
|
768
1289
|
}
|
|
769
1290
|
}
|
|
770
|
-
async function
|
|
1291
|
+
/**
 * Load the stored content hash of every chunk belonging to one source.
 *
 * Queries the collection returned by chunksCol() for documents whose
 * "source" field equals `sourceName`, selecting only "content_hash".
 *
 * @param {string} sourceName - Source whose chunks are looked up.
 * @returns {Promise<Map<string, string>>} Document id -> content hash; documents
 *   without a stored hash map to the empty string.
 */
async function getSourceChunkHashes(sourceName) {
  const query = chunksCol().where("source", "==", sourceName).select("content_hash");
  const snapshot = await query.get();
  const hashesById = new Map();
  for (const doc of snapshot.docs) {
    hashesById.set(doc.id, doc.data().content_hash ?? "");
  }
  return hashesById;
}
|
|
1298
|
+
var TOKEN_QUERY_LIMIT = 100;
|
|
1299
|
+
/**
 * Lexical lookup: find chunks whose "tokens" array contains any of the given tokens.
 *
 * One query per distinct token is issued in parallel against the collection
 * returned by chunksCol(), each capped at TOKEN_QUERY_LIMIT documents and
 * optionally restricted to a single source. Results are merged by document id.
 *
 * @param {string[]} tokens - Tokens to look up. Repeated tokens are queried once,
 *   so a repeated token cannot inflate a document's match count.
 * @param {string} [source] - When truthy, only chunks of this source are matched.
 * @returns {Promise<Array<{id: string, data: object, matchedTokens: number}>>}
 *   Hits sorted by the number of distinct tokens matched, highest first.
 */
async function tokenSearch(tokens, source) {
  // De-duplicate so each token costs one query and counts once per document.
  const uniqueTokens = [...new Set(tokens)];
  if (uniqueTokens.length === 0) return [];
  const col = chunksCol();
  const snapshots = await Promise.all(
    uniqueTokens.map((token) => {
      let query = col.where("tokens", "array-contains", token);
      if (source) {
        query = query.where("source", "==", source);
      }
      return query.limit(TOKEN_QUERY_LIMIT).get();
    })
  );
  const hits = new Map();
  for (const snapshot of snapshots) {
    for (const doc of snapshot.docs) {
      const existing = hits.get(doc.id);
      if (existing) {
        existing.matchedTokens++;
      } else {
        hits.set(doc.id, { id: doc.id, data: doc.data(), matchedTokens: 1 });
      }
    }
  }
  return [...hits.values()].sort((a, b) => b.matchedTokens - a.matchedTokens);
}
|
|
775
1324
|
async function vectorSearch(queryEmbedding, limit, source) {
|
|
776
1325
|
const col = chunksCol();
|
|
@@ -816,47 +1365,92 @@ async function rerank(query, documents, topN = 5) {
|
|
|
816
1365
|
}
|
|
817
1366
|
|
|
818
1367
|
// src/search.ts
|
|
1368
|
+
var DEFAULT_CANDIDATES = 50;
|
|
1369
|
+
var RRF_K = 60;
|
|
819
1370
|
/**
 * Report whether a reranker endpoint is configured.
 *
 * @returns {boolean} True when the RERANKER_URL environment variable is set
 *   to a non-empty value.
 */
function hasReranker() {
  return Boolean(process.env.RERANKER_URL);
}
|
|
1373
|
+
/**
 * Duck-type check for a vector value: a non-null object exposing a
 * `toArray` method.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True when `value` is an object with a callable `toArray`.
 */
function isVectorLike(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return typeof value.toArray === "function";
}
|
|
1376
|
+
/**
 * Cosine similarity between two numeric vectors.
 *
 * @param {number[]} a - First vector; its length drives the iteration.
 * @param {number[]} b - Second vector, expected to have the same length as `a`.
 * @returns {number} A value in [-1, 1], or 0 when the similarity is undefined
 *   (either vector has zero magnitude, the vectors are empty, or `b` is
 *   shorter than `a`).
 */
function cosineSimilarity(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // A zero denominator or a missing component produces NaN. Callers feed this
  // straight into a relevance score (and `?? 0` does not catch NaN), so report
  // "no similarity" instead of letting NaN propagate.
  return Number.isFinite(similarity) ? similarity : 0;
}
|
|
1387
|
+
/**
 * Build the text handed to the reranker for a stored chunk by passing its
 * title, heading path and content through buildEmbedText.
 *
 * @param {{title: *, heading_path: *, content: *}} data - Stored chunk fields.
 * @returns {*} Whatever buildEmbedText returns for those three fields.
 */
function contextualText(data) {
  const { title, heading_path, content } = data;
  return buildEmbedText({ title, heading_path, content });
}
|
|
1394
|
+
/**
 * Shape a pooled candidate into the public search-result record.
 *
 * @param {{id: string, data: object}} candidate - Candidate carrying the stored chunk fields.
 * @param {number} relevance - Score to expose as `relevance_score`.
 * @returns {{id: string, source: *, url: *, title: *, heading_path: *, content: *, relevance_score: number}}
 */
function toSearchResult(candidate, relevance) {
  const {
    id,
    data: { source, url, title, heading_path, content },
  } = candidate;
  return { id, source, url, title, heading_path, content, relevance_score: relevance };
}
|
|
822
1406
|
/**
 * Hybrid search: fuse vector-similarity results with lexical token hits.
 *
 * The query is embedded, then a vector search and a token search run in
 * parallel. Each list contributes a reciprocal-rank score of
 * 1 / (RRF_K + rank + 1) per document; a document found by both lists gets
 * the sum. When a reranker is configured the top `candidates` fused documents
 * are reranked and the reranker's scores are returned; otherwise the top
 * `topN` fused documents are returned with a score derived from their
 * similarity to the query.
 *
 * @param {string} query - Search query.
 * @param {{source?: string, candidates?: number, topN?: number}} [options]
 *   `source` restricts both lookups; `candidates` (default DEFAULT_CANDIDATES)
 *   sizes the vector lookup and the rerank pool; `topN` (default 5) caps the output.
 * @returns {Promise<object[]>} Search results as built by toSearchResult.
 */
async function search(query, options = {}) {
  const { source, candidates = DEFAULT_CANDIDATES, topN = 5 } = options;
  const reciprocalRank = (rank) => 1 / (RRF_K + rank + 1);

  const queryEmbedding = await embedText(query);
  const vectorLookup = vectorSearch(queryEmbedding, candidates, source);
  const lexicalLookup = tokenSearch(extractQueryTokens(query), source);
  const [vectorResults, lexicalHits] = await Promise.all([vectorLookup, lexicalLookup]);

  // Seed the pool with the vector results, keyed by document id.
  const pool = new Map();
  for (const [rank, result] of vectorResults.entries()) {
    pool.set(result.id, {
      id: result.id,
      data: result.data,
      fusedScore: reciprocalRank(rank),
      similarity: 1 - result.distance,
    });
  }

  // Fold in the lexical hits: boost documents already pooled, add the rest.
  for (const [rank, hit] of lexicalHits.entries()) {
    const lexicalScore = reciprocalRank(rank);
    const pooled = pool.get(hit.id);
    if (pooled) {
      pooled.fusedScore += lexicalScore;
      continue;
    }
    // Lexical-only hits still carry their stored embedding: use it for a
    // similarity value, then strip it from the data that travels onward.
    const { embedding } = hit.data;
    delete hit.data.embedding;
    const similarity = isVectorLike(embedding)
      ? cosineSimilarity(queryEmbedding, embedding.toArray())
      : null;
    pool.set(hit.id, { id: hit.id, data: hit.data, fusedScore: lexicalScore, similarity });
  }

  const fused = Array.from(pool.values()).sort((left, right) => right.fusedScore - left.fusedScore);
  if (fused.length === 0) return [];

  if (!hasReranker()) {
    return fused.slice(0, topN).map((candidate) => {
      const similarity = candidate.similarity ?? 0;
      // Map similarity from [-1, 1] onto [0, 1].
      return toSearchResult(candidate, Math.max(0, (1 + similarity) / 2));
    });
  }

  const rerankPool = fused.slice(0, candidates);
  const documents = rerankPool.map((entry) => contextualText(entry.data));
  const reranked = await rerank(query, documents, topN);
  return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
}
|
|
857
1451
|
|
|
858
1452
|
// src/apikey.ts
|
|
859
|
-
import { randomBytes, createHash } from "node:crypto";
|
|
1453
|
+
import { randomBytes, createHash as createHash2 } from "node:crypto";
|
|
860
1454
|
import {
|
|
861
1455
|
getFirestore as getFirestore2
|
|
862
1456
|
} from "firebase-admin/firestore";
|
|
@@ -872,7 +1466,7 @@ function getDb2() {
|
|
|
872
1466
|
return db2;
|
|
873
1467
|
}
|
|
874
1468
|
/**
 * Hash an API key with SHA-256.
 *
 * @param {string} key - The key to hash.
 * @returns {string} Hex-encoded SHA-256 digest of the key.
 */
function hashKey(key) {
  const hasher = createHash2("sha256");
  hasher.update(key);
  return hasher.digest("hex");
}
|
|
877
1471
|
function apiKeysCol() {
|
|
878
1472
|
return getDb2().collection("grimoire_api_keys");
|
|
@@ -1166,89 +1760,85 @@ Source "${name}" added to config/sources.yaml`);
|
|
|
1166
1760
|
await browser.close();
|
|
1167
1761
|
}
|
|
1168
1762
|
}
|
|
1169
|
-
var
|
|
1170
|
-
function
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
const
|
|
1175
|
-
const
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
if (
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
const arr = JSON.parse(data);
|
|
1186
|
-
for (const row of arr) all.push(row);
|
|
1187
|
-
}
|
|
1188
|
-
return all;
|
|
1763
|
+
var EMBED_WINDOW = 1e3;
|
|
1764
|
+
/**
 * Bring the stored chunks of one source in line with a freshly built chunk list.
 *
 * Stored hashes are compared with the hash of each chunk's embed text:
 *   - stored ids that no longer appear in `allChunks` are deleted;
 *   - chunks that are new or whose hash differs are embedded and stored,
 *     EMBED_WINDOW chunks at a time;
 *   - chunks whose hash matches are left untouched.
 * The source metadata is updated once everything has been stored.
 *
 * @param {string} sourceName - Source being synchronised.
 * @param {Array<{id: string}>} allChunks - Complete current chunk list for the source.
 * @param {number} urlCount - Page count forwarded to updateSourceMeta.
 * @param {*} version - Source version forwarded to updateSourceMeta.
 * @returns {Promise<void>}
 */
async function syncChunks(sourceName, allChunks, urlCount, version) {
  console.log(" Comparing with Firestore...");
  const storedHashes = await getSourceChunkHashes(sourceName);

  const liveIds = new Set();
  for (const chunk of allChunks) {
    liveIds.add(chunk.id);
  }
  const staleIds = [];
  for (const id of storedHashes.keys()) {
    if (!liveIds.has(id)) staleIds.push(id);
  }
  const pending = [];
  for (const chunk of allChunks) {
    if (storedHashes.get(chunk.id) !== contentHash(buildEmbedText(chunk))) pending.push(chunk);
  }

  console.log(
    ` Sync: ${pending.length} to embed, ${allChunks.length - pending.length} unchanged, ${staleIds.length} to delete.`
  );

  if (staleIds.length > 0) {
    const reportDeleted = (cur, total) => {
      console.log(` [${cur}/${total}] deleted`);
    };
    await deleteChunksByIds(staleIds, reportDeleted);
  }

  // Embed and store window by window so progress is persisted incrementally.
  let offset = 0;
  while (offset < pending.length) {
    const base = offset;
    const batch = pending.slice(base, base + EMBED_WINDOW);
    const texts = batch.map((chunk) => buildEmbedText(chunk));
    const embeddings = await embedTexts(texts, {
      onProgress: (done) => {
        console.log(` [${base + done}/${pending.length}] embedded`);
      },
    });
    await storeChunks(batch, embeddings, (cur) => {
      console.log(` [${base + cur}/${pending.length}] stored`);
    });
    offset += EMBED_WINDOW;
  }

  await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
  console.log(` Done. ${allChunks.length} chunks live for "${sourceName}".`);
}
|
|
1197
|
-
async function
|
|
1198
|
-
await
|
|
1199
|
-
|
|
1200
|
-
const
|
|
1201
|
-
|
|
1202
|
-
|
|
1794
|
+
/**
 * Load previously cached markdown pages from a directory.
 *
 * Every "*.md" file is read and its `url: "..."` and `title: "..."` lines are
 * picked out. Files without a url line are skipped with a warning; a missing
 * title falls back to "Untitled". If the directory cannot be listed the
 * result is an empty list.
 *
 * @param {string} mdDir - Directory holding the cached markdown files.
 * @returns {Promise<Array<{markdown: string, url: string, title: string}>>}
 */
async function readCachedMarkdownPages(mdDir) {
  const entries = await readdir(mdDir).catch(() => []);
  const pages = [];
  for (const name of entries) {
    if (!name.endsWith(".md")) continue;
    const markdown = await readFile3(join4(mdDir, name), "utf-8");
    const urlMatch = markdown.match(/^url: "(.+)"$/m);
    const titleMatch = markdown.match(/^title: "(.+)"$/m);
    if (urlMatch) {
      const title = titleMatch ? titleMatch[1] : "Untitled";
      pages.push({ markdown, url: urlMatch[1], title });
    } else {
      console.warn(` WARNING: ${name} has no url in frontmatter, skipping page.`);
    }
  }
  return pages;
}
|
|
1225
|
-
async function
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
const
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1813
|
+
/**
 * Work out which URLs the cached HTML pages in a directory came from.
 *
 * If "urls.json" in the directory can be read and parsed, its parsed content
 * is returned as-is. Otherwise each "*.html" file is inspected: the canonical
 * link is tried first, then the og:url meta tag, and a URL is accepted only
 * when slugifyUrl maps it back to the file's own name. Files for which
 * neither matches are skipped with a warning.
 *
 * @param {string} rawDir - Directory holding the cached HTML files.
 * @returns {Promise<*>} Parsed urls.json content, or the list of recovered URLs.
 */
async function recoverUrlsFromHtml(rawDir) {
  try {
    return JSON.parse(await readFile3(join4(rawDir, "urls.json"), "utf-8"));
  } catch {
    // No usable urls.json: fall through and recover URLs from the HTML itself.
  }

  // Tried in order; the first pattern whose URL slug matches the file name wins.
  const urlPatterns = [
    /<link[^>]+rel="canonical"[^>]+href="([^"]+)"/,
    /<meta[^>]+property="og:url"[^>]+content="([^"]+)"/,
  ];
  const recovered = [];
  let skipped = 0;
  for (const name of await readdir(rawDir)) {
    if (!name.endsWith(".html")) continue;
    const fileSlug = name.replace(/\.html$/, "");
    const html = await readFile3(join4(rawDir, name), "utf-8");
    let url = null;
    for (const pattern of urlPatterns) {
      const found = html.match(pattern);
      if (found && slugifyUrl(found[1]) === fileSlug) {
        url = found[1];
        break;
      }
    }
    if (url === null) {
      console.warn(` WARNING: cannot recover URL for ${name}, skipping page.`);
      skipped += 1;
    } else {
      recovered.push(url);
    }
  }
  if (skipped > 0) {
    console.warn(` Skipped ${skipped} pages with unrecoverable URLs. Provide urls.json to include them.`);
  }
  return recovered;
}
|
|
1253
1843
|
async function cmdRefresh() {
|
|
1254
1844
|
const args = parseArgs({
|
|
@@ -1256,12 +1846,10 @@ async function cmdRefresh() {
|
|
|
1256
1846
|
options: {
|
|
1257
1847
|
full: { type: "boolean", default: false },
|
|
1258
1848
|
all: { type: "boolean", default: false },
|
|
1259
|
-
diff: { type: "boolean", default: false },
|
|
1260
1849
|
concurrency: { type: "string" },
|
|
1261
1850
|
limit: { type: "string" },
|
|
1262
1851
|
"from-html": { type: "boolean", default: false },
|
|
1263
1852
|
"from-markdown": { type: "boolean", default: false },
|
|
1264
|
-
"from-embeddings": { type: "boolean", default: false },
|
|
1265
1853
|
"skip-store": { type: "boolean", default: false }
|
|
1266
1854
|
},
|
|
1267
1855
|
allowPositionals: true
|
|
@@ -1269,7 +1857,7 @@ async function cmdRefresh() {
|
|
|
1269
1857
|
const config = await loadConfig(CONFIG_PATH);
|
|
1270
1858
|
const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
|
|
1271
1859
|
if (!args.values.all && !sourcesToRefresh[0]) {
|
|
1272
|
-
console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--
|
|
1860
|
+
console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
|
|
1273
1861
|
process.exit(1);
|
|
1274
1862
|
}
|
|
1275
1863
|
const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
|
|
@@ -1285,7 +1873,6 @@ async function cmdRefresh() {
|
|
|
1285
1873
|
}
|
|
1286
1874
|
const rawDir = join4(DATA_DIR, "raw", sourceName);
|
|
1287
1875
|
const mdDir = join4(DATA_DIR, "markdown", sourceName);
|
|
1288
|
-
const embeddingsCachePath = join4(rawDir, "embeddings.json");
|
|
1289
1876
|
console.log(`
|
|
1290
1877
|
Refreshing "${sourceName}"...`);
|
|
1291
1878
|
if (args.values.full) {
|
|
@@ -1295,75 +1882,16 @@ Refreshing "${sourceName}"...`);
|
|
|
1295
1882
|
await rm(rawDir, { recursive: true, force: true });
|
|
1296
1883
|
await rm(mdDir, { recursive: true, force: true });
|
|
1297
1884
|
}
|
|
1298
|
-
let
|
|
1299
|
-
if (args.values["from-embeddings"]) {
|
|
1300
|
-
console.log(" Loading cached embeddings...");
|
|
1301
|
-
const cached = await loadEmbeddingsCache(embeddingsCachePath);
|
|
1302
|
-
if (!cached) {
|
|
1303
|
-
console.error(" No cached embeddings found. Run without --from-embeddings first.");
|
|
1304
|
-
process.exit(1);
|
|
1305
|
-
}
|
|
1306
|
-
const mdFiles = await readdir(mdDir);
|
|
1307
|
-
const allPages = [];
|
|
1308
|
-
for (const f of mdFiles.filter((f2) => f2.endsWith(".md"))) {
|
|
1309
|
-
const content = await readFile3(join4(mdDir, f), "utf-8");
|
|
1310
|
-
const urlMatch = content.match(/^url: "(.+)"$/m);
|
|
1311
|
-
const titleMatch = content.match(/^title: "(.+)"$/m);
|
|
1312
|
-
allPages.push({
|
|
1313
|
-
markdown: content,
|
|
1314
|
-
url: urlMatch?.[1] ?? "",
|
|
1315
|
-
title: titleMatch?.[1] ?? "Untitled"
|
|
1316
|
-
});
|
|
1317
|
-
}
|
|
1318
|
-
console.log(" Chunking...");
|
|
1319
|
-
const allChunks2 = allPages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
|
|
1320
|
-
console.log(` Created ${allChunks2.length} chunks.`);
|
|
1321
|
-
if (cached.length !== allChunks2.length) {
|
|
1322
|
-
console.error(` Embeddings cache (${cached.length}) doesn't match chunk count (${allChunks2.length}). Re-embed with --from-html.`);
|
|
1323
|
-
process.exit(1);
|
|
1324
|
-
}
|
|
1325
|
-
if (args.values["skip-store"]) {
|
|
1326
|
-
console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
|
|
1327
|
-
continue;
|
|
1328
|
-
}
|
|
1329
|
-
await storeWithStrategy(sourceName, allChunks2, cached, allPages.length, source.version, args.values.diff);
|
|
1330
|
-
continue;
|
|
1331
|
-
}
|
|
1885
|
+
let pages;
|
|
1332
1886
|
if (args.values["from-markdown"]) {
|
|
1333
1887
|
console.log(" Reading cached markdown...");
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
if (markdownFiles.length === 0) {
|
|
1888
|
+
pages = await readCachedMarkdownPages(mdDir);
|
|
1889
|
+
if (pages.length === 0) {
|
|
1337
1890
|
console.error(" No cached markdown found. Run with --from-html first.");
|
|
1338
1891
|
process.exit(1);
|
|
1339
1892
|
}
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
const content = await readFile3(join4(mdDir, f), "utf-8");
|
|
1343
|
-
const urlMatch = content.match(/^url: "(.+)"$/m);
|
|
1344
|
-
const titleMatch = content.match(/^title: "(.+)"$/m);
|
|
1345
|
-
pages2.push({
|
|
1346
|
-
markdown: content,
|
|
1347
|
-
url: urlMatch?.[1] ?? "",
|
|
1348
|
-
title: titleMatch?.[1] ?? "Untitled"
|
|
1349
|
-
});
|
|
1350
|
-
}
|
|
1351
|
-
console.log(` Found ${pages2.length} cached pages.`);
|
|
1352
|
-
console.log(" Chunking...");
|
|
1353
|
-
const allChunks2 = pages2.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
|
|
1354
|
-
console.log(` Created ${allChunks2.length} chunks.`);
|
|
1355
|
-
console.log(" Embedding chunks...");
|
|
1356
|
-
const texts2 = allChunks2.map((c) => c.content);
|
|
1357
|
-
const embeddings2 = await embedWithCheckpoint(texts2, rawDir, embeddingsCachePath);
|
|
1358
|
-
if (args.values["skip-store"]) {
|
|
1359
|
-
console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
|
|
1360
|
-
continue;
|
|
1361
|
-
}
|
|
1362
|
-
await storeWithStrategy(sourceName, allChunks2, embeddings2, pages2.length, source.version, args.values.diff);
|
|
1363
|
-
continue;
|
|
1364
|
-
}
|
|
1365
|
-
let pages;
|
|
1366
|
-
if (source.llms_full_url && !args.values["from-html"]) {
|
|
1893
|
+
console.log(` Found ${pages.length} cached pages.`);
|
|
1894
|
+
} else if (source.llms_full_url && !args.values["from-html"]) {
|
|
1367
1895
|
console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
|
|
1368
1896
|
pages = await ingestLlmsFull(
|
|
1369
1897
|
source.llms_full_url,
|
|
@@ -1376,32 +1904,10 @@ Refreshing "${sourceName}"...`);
|
|
|
1376
1904
|
);
|
|
1377
1905
|
console.log(` Extracted ${pages.length} pages.`);
|
|
1378
1906
|
} else {
|
|
1907
|
+
let urls;
|
|
1379
1908
|
if (args.values["from-html"]) {
|
|
1380
1909
|
console.log(" Reading URLs from cached HTML...");
|
|
1381
|
-
|
|
1382
|
-
try {
|
|
1383
|
-
urls = JSON.parse(await readFile3(urlsJsonPath, "utf-8"));
|
|
1384
|
-
} catch {
|
|
1385
|
-
const rawFiles = await readdir(rawDir);
|
|
1386
|
-
const htmlFiles = rawFiles.filter((f) => f.endsWith(".html"));
|
|
1387
|
-
urls = [];
|
|
1388
|
-
for (const f of htmlFiles) {
|
|
1389
|
-
const fileSlug = f.replace(/\.html$/, "");
|
|
1390
|
-
const htmlPath = join4(rawDir, f);
|
|
1391
|
-
const html = await readFile3(htmlPath, "utf-8");
|
|
1392
|
-
const match = html.match(/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/);
|
|
1393
|
-
if (match && slugifyUrl(match[1]) === fileSlug) {
|
|
1394
|
-
urls.push(match[1]);
|
|
1395
|
-
continue;
|
|
1396
|
-
}
|
|
1397
|
-
const ogMatch = html.match(/<meta[^>]+property="og:url"[^>]+content="([^"]+)"/);
|
|
1398
|
-
if (ogMatch && slugifyUrl(ogMatch[1]) === fileSlug) {
|
|
1399
|
-
urls.push(ogMatch[1]);
|
|
1400
|
-
continue;
|
|
1401
|
-
}
|
|
1402
|
-
urls.push(`https://recovered/${fileSlug}`);
|
|
1403
|
-
}
|
|
1404
|
-
}
|
|
1910
|
+
urls = await recoverUrlsFromHtml(rawDir);
|
|
1405
1911
|
console.log(` Found ${urls.length} cached pages.`);
|
|
1406
1912
|
} else {
|
|
1407
1913
|
console.log(" Scraping URLs...");
|
|
@@ -1431,14 +1937,11 @@ Refreshing "${sourceName}"...`);
|
|
|
1431
1937
|
console.log(" Chunking...");
|
|
1432
1938
|
const allChunks = pages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
|
|
1433
1939
|
console.log(` Created ${allChunks.length} chunks.`);
|
|
1434
|
-
console.log(" Embedding chunks...");
|
|
1435
|
-
const texts = allChunks.map((c) => c.content);
|
|
1436
|
-
const embeddings = await embedWithCheckpoint(texts, rawDir, embeddingsCachePath);
|
|
1437
1940
|
if (args.values["skip-store"]) {
|
|
1438
|
-
console.log(` Done. ${allChunks.length} chunks ready (
|
|
1941
|
+
console.log(` Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
|
|
1439
1942
|
continue;
|
|
1440
1943
|
}
|
|
1441
|
-
await
|
|
1944
|
+
await syncChunks(sourceName, allChunks, pages.length, source.version);
|
|
1442
1945
|
}
|
|
1443
1946
|
}
|
|
1444
1947
|
async function cmdSearch() {
|
|
@@ -1447,17 +1950,19 @@ async function cmdSearch() {
|
|
|
1447
1950
|
options: {
|
|
1448
1951
|
source: { type: "string" },
|
|
1449
1952
|
top: { type: "string" },
|
|
1953
|
+
candidates: { type: "string" },
|
|
1450
1954
|
compact: { type: "boolean", default: false }
|
|
1451
1955
|
},
|
|
1452
1956
|
allowPositionals: true
|
|
1453
1957
|
});
|
|
1454
1958
|
const query = args.positionals.join(" ");
|
|
1455
1959
|
if (!query) {
|
|
1456
|
-
console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--compact]');
|
|
1960
|
+
console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--candidates <n>] [--compact]');
|
|
1457
1961
|
process.exit(1);
|
|
1458
1962
|
}
|
|
1459
1963
|
const topN = args.values.top ? parseInt(args.values.top, 10) : void 0;
|
|
1460
|
-
const
|
|
1964
|
+
const candidates = args.values.candidates ? parseInt(args.values.candidates, 10) : void 0;
|
|
1965
|
+
const results = await search(query, { source: args.values.source, topN, candidates });
|
|
1461
1966
|
if (results.length === 0) {
|
|
1462
1967
|
console.log("No results found.");
|
|
1463
1968
|
return;
|
|
@@ -1644,4 +2149,4 @@ var ADMIN_COMMANDS = {
|
|
|
1644
2149
|
export {
|
|
1645
2150
|
ADMIN_COMMANDS
|
|
1646
2151
|
};
|
|
1647
|
-
//# sourceMappingURL=admin-
|
|
2152
|
+
//# sourceMappingURL=admin-WUNBDKBC.js.map
|