@astrofoundry/grimoire 3.29.1 → 3.30.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{admin-E4L4RORC.js → admin-YF2OKHEQ.js} +806 -287
- package/dist/admin-YF2OKHEQ.js.map +7 -0
- package/dist/chunk-R46N6C3C.js +40 -0
- package/dist/{chunk-BRS6X3AE.js.map → chunk-R46N6C3C.js.map} +1 -1
- package/dist/cli.js +38 -22
- package/dist/cli.js.map +2 -2
- package/package.json +2 -1
- package/dist/admin-E4L4RORC.js.map +0 -7
- package/dist/chunk-BRS6X3AE.js +0 -12
|
@@ -1,12 +1,372 @@
|
|
|
1
1
|
import {
|
|
2
|
+
__commonJS,
|
|
3
|
+
__toESM,
|
|
2
4
|
bold,
|
|
3
5
|
cyan,
|
|
4
6
|
yellow
|
|
5
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-R46N6C3C.js";
|
|
8
|
+
|
|
9
|
+
// node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
|
|
10
|
+
var require_turndown_plugin_gfm_cjs = __commonJS({
|
|
11
|
+
"node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js"(exports) {
|
|
12
|
+
"use strict";
|
|
13
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
14
|
+
var highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/;
|
|
15
|
+
/**
 * Turndown plugin: registers the "highlightedCodeBlock" rule.
 *
 * The rule matches a DIV whose class matches `highlightRegExp` and whose first
 * child is a PRE, and emits that PRE's text as a fenced code block tagged with
 * the language captured from the class name.
 *
 * @param {object} turndownService - object exposing `addRule(name, rule)`.
 */
function highlightedCodeBlock(turndownService) {
  const isHighlightWrapper = (node) => {
    const pre = node.firstChild;
    return node.nodeName === "DIV" && highlightRegExp.test(node.className) && pre && pre.nodeName === "PRE";
  };
  const toFencedBlock = (content, node, options) => {
    const found = (node.className || "").match(highlightRegExp);
    // No match means no language tag after the opening fence.
    const language = found ? found[1] : "";
    const fence = options.fence;
    return `\n\n${fence}${language}\n${node.firstChild.textContent}\n${fence}\n\n`;
  };
  turndownService.addRule("highlightedCodeBlock", {
    filter: isHighlightWrapper,
    replacement: toFencedBlock
  });
}
|
|
28
|
+
/**
 * Turndown plugin: registers the "strikethrough" rule, which wraps the content
 * of `del`, `s` and `strike` elements in `~~`.
 *
 * @param {object} turndownService - object exposing `addRule(name, rule)`.
 */
function strikethrough(turndownService) {
  const wrapInTildes = (content) => `~~${content}~~`;
  turndownService.addRule("strikethrough", {
    filter: ["del", "s", "strike"],
    replacement: wrapInTildes
  });
}
|
|
36
|
+
var indexOf = Array.prototype.indexOf;
|
|
37
|
+
var every = Array.prototype.every;
|
|
38
|
+
var rules = {};
|
|
39
|
+
var alignMap = { left: ":---", right: "---:", center: ":---:" };
|
|
40
|
+
var isCodeBlock_ = null;
|
|
41
|
+
var options_ = null;
|
|
42
|
+
var tableShouldBeSkippedCache_ = /* @__PURE__ */ new WeakMap();
|
|
43
|
+
/**
 * Reads a cell's horizontal alignment.
 *
 * Prefers the `align` attribute and falls back to the inline `text-align`
 * style. Nodes without `getAttribute` or `style` (for example a text node
 * picked out of `row.childNodes`) yield "" instead of throwing.
 *
 * @param {object|null} node - table cell node, or null/undefined.
 * @returns {string} lower-cased alignment keyword, or "" when none is set.
 */
function getAlignment(node) {
  if (!node) return "";
  const fromAttribute = typeof node.getAttribute === "function" ? node.getAttribute("align") : null;
  const fromStyle = node.style ? node.style.textAlign : "";
  return (fromAttribute || fromStyle || "").toLowerCase();
}
|
|
46
|
+
/**
 * Maps an alignment keyword to the Markdown divider cell for that column.
 *
 * Keywords that are not own keys of `alignMap` (for example "justify") fall
 * back to the neutral "---" instead of producing `undefined`.
 *
 * @param {string} alignment - alignment keyword, or "" for none.
 * @returns {string} divider text such as ":---", "---:", ":---:" or "---".
 */
function getBorder(alignment) {
  if (alignment && Object.prototype.hasOwnProperty.call(alignMap, alignment)) {
    return alignMap[alignment];
  }
  return "---";
}
|
|
49
|
+
/**
 * Picks the alignment for one table column by majority vote over its cells.
 *
 * Each row that has a child at `columnIndex` votes with that child's
 * alignment. The winner changes only when a candidate's tally becomes strictly
 * greater than the current winner's, so the earlier leader wins ties.
 *
 * @param {object} table - node exposing `rows`, each row exposing `childNodes`.
 * @param {number} columnIndex - zero-based column position.
 * @returns {string} the winning alignment keyword, or "" when none wins.
 */
function getColumnAlignment(table, columnIndex) {
  const tally = { left: 0, right: 0, center: 0, "": 0 };
  let winner = "";
  for (let r = 0; r < table.rows.length; r += 1) {
    const cells = table.rows[r].childNodes;
    if (columnIndex >= cells.length) continue;
    const alignment = getAlignment(cells[columnIndex]);
    ++tally[alignment];
    if (tally[alignment] > tally[winner]) winner = alignment;
  }
  return winner;
}
|
|
69
|
+
// Rule for <th>/<td>: render a Markdown cell, unless the owning table is one
// that is skipped, in which case the cell content passes through unchanged.
rules.tableCell = {
  filter: ["th", "td"],
  replacement(content, node) {
    const owningTable = nodeParentTable(node);
    return tableShouldBeSkipped(owningTable) ? content : cell(content, node);
  }
};
|
|
76
|
+
// Rule for <tr>: emit the row on its own line. A heading row is followed by a
// divider line with one border cell per table column, aligned by column vote.
rules.tableRow = {
  filter: "tr",
  replacement(content, node) {
    const table = nodeParentTable(node);
    if (tableShouldBeSkipped(table)) return content;
    const rowLine = "\n" + content;
    if (!isHeadingRow(node)) return rowLine;
    const totalColumns = tableColCount(table);
    let divider = "";
    for (let col = 0; col < totalColumns; col++) {
      // Columns beyond this row's own cells get a border with no source node.
      const headerCell = col < node.childNodes.length ? node.childNodes[col] : null;
      const border = getBorder(getColumnAlignment(table, col));
      divider += cell(border, headerCell, col);
    }
    return divider ? rowLine + "\n" + divider : rowLine;
  }
};
|
|
93
|
+
// Rule for <table>.
// - Tables that must stay HTML (see tableShouldBeHtml) are emitted as their
//   outerHTML, wrapped in a "joplin-table-wrapper" div unless the nearest
//   ancestor DIV already has that class.
// - Skipped tables pass their content through unchanged.
// - Otherwise the rows are emitted as a Markdown table. If the second line is
//   not a divider, an empty header row plus a divider row is put in front.
rules.table = {
  filter: function(node) {
    return node.nodeName === "TABLE";
  },
  replacement: function(content, node) {
    if (tableShouldBeHtml(node, options_)) {
      const html = node.outerHTML;
      const divParent = nodeParentDiv(node);
      if (divParent === null || !divParent.classList.contains("joplin-table-wrapper")) {
        return `\n\n<div class="joplin-table-wrapper">${html}</div>\n\n`;
      }
      return html;
    }
    if (tableShouldBeSkipped(node)) return content;
    const collapsed = content.replace(/\n+/g, "\n");
    const lines = collapsed.trim().split("\n");
    // Only a real second line can be the header divider. The previous code
    // tested the whole line array when there was a single line, so a lone row
    // that looked like a divider suppressed the generated header.
    const secondLine = lines.length >= 2 ? lines[1] : "";
    const secondLineIsDivider = /\| :?---/.test(secondLine);
    const columnCount = tableColCount(node);
    let emptyHeader = "";
    if (columnCount && !secondLineIsDivider) {
      emptyHeader = "|" + " |".repeat(columnCount) + "\n|";
      for (let columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
        emptyHeader += " " + getBorder(getColumnAlignment(node, columnIndex)) + " |";
      }
    }
    const captionNode = node.querySelector ? node.querySelector("caption") : node.caption;
    const captionContent = captionNode ? captionNode.textContent || "" : "";
    const caption = captionContent ? `${captionContent}\n\n` : "";
    const tableContent = `${emptyHeader}${collapsed}`.trimStart();
    return `\n\n${caption}${tableContent}\n\n`;
  }
};
|
|
138
|
+
// <caption> produces no output of its own; the table rule reads the caption
// text directly from the table node.
rules.tableCaption = {
  filter: ["caption"],
  replacement() {
    return "";
  }
};
// <colgroup>/<col> produce no output.
rules.tableColgroup = {
  filter: ["colgroup", "col"],
  replacement() {
    return "";
  }
};
// Table sections are transparent: their rows' output passes through as-is.
rules.tableSection = {
  filter: ["thead", "tbody", "tfoot"],
  replacement: (content) => content
};
|
|
152
|
+
/**
 * Decides whether a row is the table's heading row.
 *
 * A row qualifies when its parent is a THEAD, or when it is the first child of
 * the TABLE (or of the first TBODY) and every one of its child nodes is a TH.
 *
 * @param {object} tr - row node with `parentNode` and `childNodes`.
 * @returns {boolean}
 */
function isHeadingRow(tr) {
  const section = tr.parentNode;
  if (section.nodeName === "THEAD") return true;
  if (section.firstChild !== tr) return false;
  if (section.nodeName !== "TABLE" && !isFirstTbody(section)) return false;
  return every.call(tr.childNodes, (child) => child.nodeName === "TH");
}
|
|
158
|
+
/**
 * True when `element` is a TBODY that either has no previous sibling or is
 * preceded by a THEAD whose text content is empty or whitespace only.
 *
 * @param {object} element - node with `nodeName` and `previousSibling`.
 * @returns {boolean}
 */
function isFirstTbody(element) {
  if (element.nodeName !== "TBODY") return false;
  const before = element.previousSibling;
  if (!before) return true;
  return before.nodeName === "THEAD" && /^\s*$/i.test(before.textContent);
}
|
|
162
|
+
/**
 * Renders one Markdown table cell.
 *
 * The content is trimmed, line breaks become `<br>`, runs of `|` are replaced
 * by a single escaped `\|`, and the text is padded to at least three
 * characters. When a node is given, `handleColSpan` adds filler for colspan.
 *
 * Line breaks: CRLF, LFCR and bare LF each become exactly one `<br>`. The
 * previous pattern only recognised the LFCR order, so CRLF input left a stray
 * carriage return in the cell.
 *
 * @param {string} content - cell text.
 * @param {object|null} [node=null] - source cell node; required when `index` is null.
 * @param {number|null} [index=null] - column index; derived from the node's
 *   position among its siblings when null.
 * @returns {string} the cell text, prefixed with "| " for the first column.
 */
function cell(content, node = null, index = null) {
  const position = index === null ? indexOf.call(node.parentNode.childNodes, node) : index;
  const prefix = position === 0 ? "| " : " ";
  let text = content.trim()
    .replace(/\r\n|\n\r|\n/g, "<br>")
    .replace(/\|+/g, "\\|")
    .padEnd(3, " ");
  if (node) text = handleColSpan(text, node, " ");
  return prefix + text + " |";
}
|
|
172
|
+
/**
 * Depth-first search for a TABLE among the descendants of `node`
 * (the node itself is not tested).
 *
 * @param {object} node - node that may expose `childNodes`.
 * @returns {boolean} true when any descendant has nodeName "TABLE".
 */
function nodeContainsTable(node) {
  const children = node.childNodes;
  if (!children) return false;
  return Array.prototype.some.call(
    children,
    (child) => child.nodeName === "TABLE" || nodeContainsTable(child)
  );
}
|
|
181
|
+
/**
 * Depth-first search of the descendants of `node`.
 *
 * With `types === "code"`, a descendant matches when the registered
 * `isCodeBlock_` predicate accepts it. In every case a descendant also matches
 * when `types.includes(child.nodeName)` is true.
 *
 * @param {object} node - node that may expose `childNodes`.
 * @param {string|string[]} types - "code" or a list of upper-case node names.
 * @returns {boolean}
 */
var nodeContains = (node, types) => {
  const children = node.childNodes;
  if (!children) return false;
  const wantsCode = types === "code";
  for (let idx = 0; idx < children.length; idx += 1) {
    const child = children[idx];
    const isCode = wantsCode && isCodeBlock_ && isCodeBlock_(child);
    if (isCode || types.includes(child.nodeName) || nodeContains(child, types)) {
      return true;
    }
  }
  return false;
};
|
|
191
|
+
var customStyleProperties = [
|
|
192
|
+
"background-color",
|
|
193
|
+
"background",
|
|
194
|
+
"border-color",
|
|
195
|
+
"border",
|
|
196
|
+
"border-top",
|
|
197
|
+
"border-right",
|
|
198
|
+
"border-bottom",
|
|
199
|
+
"border-left",
|
|
200
|
+
"border-style",
|
|
201
|
+
"border-width",
|
|
202
|
+
"padding",
|
|
203
|
+
"padding-top",
|
|
204
|
+
"padding-right",
|
|
205
|
+
"padding-bottom",
|
|
206
|
+
"padding-left",
|
|
207
|
+
"float",
|
|
208
|
+
"margin-left",
|
|
209
|
+
"margin-right"
|
|
210
|
+
];
|
|
211
|
+
var customAttributeNames = [
|
|
212
|
+
"bgcolor",
|
|
213
|
+
"bordercolor",
|
|
214
|
+
"background"
|
|
215
|
+
];
|
|
216
|
+
/**
 * True when the node's inline `style` attribute declares at least one property
 * listed in `customStyleProperties`. Property names are compared trimmed and
 * lower-cased.
 *
 * @param {object|null} node - node that may expose `getAttribute`.
 * @returns {boolean}
 */
var nodeHasCustomStyle = (node) => {
  if (!node || !node.getAttribute) return false;
  const inlineStyle = node.getAttribute("style");
  if (!inlineStyle) return false;
  return inlineStyle
    .split(";")
    .map((declaration) => declaration.split(":")[0].trim().toLowerCase())
    .some((property) => property.length > 0 && customStyleProperties.includes(property));
};
|
|
226
|
+
/**
 * True when attribute `name` is present on the node with a value that is not
 * blank, "0" or "0px" (compared trimmed and lower-cased).
 *
 * @param {object|null} node - node that may expose `getAttribute`.
 * @param {string} name - attribute name, e.g. "cellpadding".
 * @returns {boolean}
 */
var hasNonDefaultSpacingAttribute = (node, name) => {
  if (!node || !node.getAttribute) return false;
  const raw = node.getAttribute(name);
  if (raw === null) return false;
  const normalised = String(raw).trim().toLowerCase();
  return normalised !== "" && normalised !== "0" && normalised !== "0px";
};
|
|
235
|
+
/**
 * True when the node carries a non-blank attribute listed in
 * `customAttributeNames`, or, for TABLE nodes only, a non-default
 * `cellpadding` or `cellspacing`.
 *
 * @param {object|null} node - node that may expose `getAttribute`.
 * @returns {boolean}
 */
var nodeHasCustomAttributes = (node) => {
  if (!node || !node.getAttribute) return false;
  const hasListedAttribute = customAttributeNames.some((attributeName) => {
    const value = node.getAttribute(attributeName);
    return value !== null && String(value).trim() !== "";
  });
  if (hasListedAttribute) return true;
  if (node.nodeName !== "TABLE") return false;
  if (hasNonDefaultSpacingAttribute(node, "cellpadding")) return true;
  return hasNonDefaultSpacingAttribute(node, "cellspacing") ? true : false;
};
|
|
247
|
+
/**
 * True when the node has custom inline styles or custom presentational
 * attributes. The attribute check runs only if the style check fails.
 *
 * @param {object|null} node
 * @returns {boolean}
 */
var nodeHasCustomFormatting = (node) => {
  if (nodeHasCustomStyle(node)) return true;
  return nodeHasCustomAttributes(node);
};
|
|
250
|
+
/**
 * True when the table itself, any of its rows, or any TD/TH child of a row has
 * custom formatting. Row children that are not TD or TH are ignored.
 *
 * @param {object} tableNode - node that may expose `rows`.
 * @returns {boolean}
 */
var tableHasCustomStyles = (tableNode) => {
  if (nodeHasCustomFormatting(tableNode)) return true;
  const rows = tableNode.rows;
  if (!rows) return false;
  const isCell = (candidate) => candidate.nodeName === "TD" || candidate.nodeName === "TH";
  for (let r = 0; r < rows.length; r += 1) {
    const row = rows[r];
    if (nodeHasCustomFormatting(row)) return true;
    const styledCell = Array.prototype.some.call(
      row.childNodes,
      (candidate) => isCell(candidate) && nodeHasCustomFormatting(candidate)
    );
    if (styledCell) return true;
  }
  return false;
};
|
|
266
|
+
/**
 * Decides whether a table is kept as raw HTML rather than converted.
 *
 * That is the case when the table contains a code block, contains one of the
 * block-level tags below (plus TABLE when `options.preserveNestedTables` is
 * set), or when `options.preserveTableStyles` is set and the table has custom
 * styles.
 *
 * @param {object} tableNode - table node.
 * @param {object} options - reads `preserveNestedTables` and `preserveTableStyles`.
 * @returns {*} truthy when the table should stay HTML (the value of the
 *   `||`/`&&` chain, so not always a strict boolean).
 */
var tableShouldBeHtml = (tableNode, options) => {
  const headingTags = Array.from({ length: 6 }, (_, level) => `H${level + 1}`);
  const blockTags = ["UL", "OL", ...headingTags, "HR", "BLOCKQUOTE"];
  if (options.preserveNestedTables) {
    blockTags.push("TABLE");
  }
  return nodeContains(tableNode, "code") || nodeContains(tableNode, blockTags) || options.preserveTableStyles && tableHasCustomStyles(tableNode);
};
|
|
282
|
+
/**
 * Memoised front-end for `tableShouldBeSkipped_`.
 *
 * Results are cached per table node in a WeakMap. WeakMap keys must be
 * objects, so a missing table (null/undefined, as returned by
 * `nodeParentTable` when there is no TABLE ancestor) is evaluated directly.
 * The previous code tried to cache it and threw a TypeError from
 * `WeakMap.prototype.set`.
 *
 * @param {object|null} tableNode - table node, or null when none was found.
 * @returns {boolean} true when the table should not be rendered as Markdown.
 */
function tableShouldBeSkipped(tableNode) {
  const cacheable = Object(tableNode) === tableNode;
  if (!cacheable) return tableShouldBeSkipped_(tableNode);
  const cached = tableShouldBeSkippedCache_.get(tableNode);
  if (cached !== void 0) return cached;
  const result = tableShouldBeSkipped_(tableNode);
  tableShouldBeSkippedCache_.set(tableNode, result);
  return result;
}
|
|
289
|
+
/**
 * Uncached check for tables that are not rendered as Markdown tables: a
 * missing table, one without `rows`, a single row with at most one child, or
 * a table that contains another table.
 *
 * @param {object|null} tableNode
 * @returns {boolean}
 */
function tableShouldBeSkipped_(tableNode) {
  if (!tableNode || !tableNode.rows) return true;
  const { rows } = tableNode;
  const isSingleCell = rows.length === 1 && rows[0].childNodes.length <= 1;
  if (isSingleCell) return true;
  return nodeContainsTable(tableNode) ? true : false;
}
|
|
296
|
+
/**
 * Finds the nearest ancestor DIV of a node.
 *
 * Returns null when the ancestor chain ends without a DIV, including when the
 * node has no parent at all (the previous code dereferenced a null parent in
 * that case).
 *
 * @param {object} node - node with a `parentNode` chain.
 * @returns {object|null} the closest ancestor whose nodeName is "DIV", or null.
 */
function nodeParentDiv(node) {
  let ancestor = node.parentNode;
  while (ancestor && ancestor.nodeName !== "DIV") {
    ancestor = ancestor.parentNode;
  }
  return ancestor || null;
}
|
|
304
|
+
/**
 * Finds the nearest ancestor TABLE of a node.
 *
 * Returns null when the ancestor chain ends without a TABLE, including when
 * the node has no parent at all (the previous code dereferenced a null parent
 * in that case).
 *
 * @param {object} node - node with a `parentNode` chain.
 * @returns {object|null} the closest ancestor whose nodeName is "TABLE", or null.
 */
function nodeParentTable(node) {
  let ancestor = node.parentNode;
  while (ancestor && ancestor.nodeName !== "TABLE") {
    ancestor = ancestor.parentNode;
  }
  return ancestor || null;
}
|
|
312
|
+
/**
 * Appends one filler cell (" | " followed by three `emptyChar`s) for every
 * column beyond the first that the node's `colspan` attribute covers.
 *
 * @param {string} content - cell text built so far.
 * @param {object} node - cell node exposing `getAttribute`.
 * @param {string} emptyChar - character used to fill the added cells.
 * @returns {string} content followed by the filler cells.
 */
function handleColSpan(content, node, emptyChar) {
  const span = node.getAttribute("colspan") || 1;
  let padded = content;
  // Counts down from the span; numeric comparison coerces a string attribute.
  for (let remaining = span; remaining > 1; remaining--) {
    padded += " | " + emptyChar.repeat(3);
  }
  return padded;
}
|
|
319
|
+
/**
 * Returns the largest number of child nodes found in any row of the table.
 *
 * @param {object} node - table node exposing `rows`, each with `childNodes`.
 * @returns {number} the widest row's child count, or 0 for a table with no rows.
 */
function tableColCount(node) {
  let widest = 0;
  for (let r = 0; r < node.rows.length; r += 1) {
    widest = Math.max(widest, node.rows[r].childNodes.length);
  }
  return widest;
}
|
|
328
|
+
/**
 * Turndown plugin: installs the table rules.
 *
 * Stores the service's `isCodeBlock` predicate and `options` in the shared
 * `isCodeBlock_` / `options_` variables, registers a keep-filter for tables
 * that must stay HTML, then adds every entry of `rules`.
 *
 * @param {object} turndownService - object exposing `isCodeBlock`, `options`,
 *   `keep(fn)` and `addRule(name, rule)`.
 */
function tables2(turndownService) {
  isCodeBlock_ = turndownService.isCodeBlock;
  options_ = turndownService.options;
  const keepAsHtml = (node) => {
    const isTable = node.nodeName === "TABLE";
    return isTable && tableShouldBeHtml(node, turndownService.options) ? true : false;
  };
  turndownService.keep(keepAsHtml);
  for (const ruleName in rules) {
    turndownService.addRule(ruleName, rules[ruleName]);
  }
}
|
|
337
|
+
/**
 * Turndown plugin: registers the "taskListItems" rule.
 *
 * The rule matches a checkbox (`type === "checkbox"` or `role="checkbox"`)
 * whose parent is an LI, or whose parent is a LABEL or SPAN inside an LI, and
 * renders it as "[x] " or "[ ] ".
 *
 * @param {object} turndownService - object exposing `addRule(name, rule)`.
 */
function taskListItems(turndownService) {
  turndownService.addRule("taskListItems", {
    filter(node) {
      const parent = node.parentNode;
      const grandparent = parent.parentNode;
      const looksLikeCheckbox = node.type === "checkbox" || node.getAttribute("role") === "checkbox";
      if (!looksLikeCheckbox) return false;
      if (parent.nodeName === "LI") return true;
      const isWrapper = parent.nodeName === "LABEL" || parent.nodeName === "SPAN";
      return isWrapper && !!grandparent && grandparent.nodeName === "LI";
    },
    replacement(content, node) {
      // Native inputs expose `checked`; ARIA checkboxes use `aria-checked`.
      const isChecked = node.nodeName === "INPUT" ? node.checked : node.getAttribute("aria-checked") === "true";
      return isChecked ? "[x] " : "[ ] ";
    }
  });
}
|
|
351
|
+
/**
 * Turndown plugin bundle: installs the highlighted-code-block, strikethrough,
 * tables and task-list plugins, in that order, through a single `use` call.
 *
 * @param {object} turndownService - object exposing `use(plugins)`.
 */
function gfm(turndownService) {
  const plugins = [highlightedCodeBlock, strikethrough, tables2, taskListItems];
  turndownService.use(plugins);
}
|
|
359
|
+
exports.gfm = gfm;
|
|
360
|
+
exports.highlightedCodeBlock = highlightedCodeBlock;
|
|
361
|
+
exports.strikethrough = strikethrough;
|
|
362
|
+
exports.tables = tables2;
|
|
363
|
+
exports.taskListItems = taskListItems;
|
|
364
|
+
}
|
|
365
|
+
});
|
|
6
366
|
|
|
7
367
|
// src/admin.ts
|
|
8
368
|
import { parseArgs } from "node:util";
|
|
9
|
-
import { readFile as readFile3, writeFile as writeFile4, readdir, rm
|
|
369
|
+
import { readFile as readFile3, writeFile as writeFile4, readdir, rm } from "node:fs/promises";
|
|
10
370
|
import { existsSync } from "node:fs";
|
|
11
371
|
import { join as join4, resolve } from "node:path";
|
|
12
372
|
import { createInterface } from "node:readline";
|
|
@@ -267,6 +627,7 @@ async function createBrowser() {
|
|
|
267
627
|
}
|
|
268
628
|
|
|
269
629
|
// src/converter.ts
|
|
630
|
+
var import_turndown_plugin_gfm = __toESM(require_turndown_plugin_gfm_cjs(), 1);
|
|
270
631
|
import { readFile as readFile2, writeFile as writeFile2, mkdir as mkdir2 } from "node:fs/promises";
|
|
271
632
|
import { join as join2 } from "node:path";
|
|
272
633
|
import { JSDOM } from "jsdom";
|
|
@@ -276,6 +637,7 @@ var turndown = new TurndownService({
|
|
|
276
637
|
codeBlockStyle: "fenced",
|
|
277
638
|
bulletListMarker: "-"
|
|
278
639
|
});
|
|
640
|
+
turndown.use(import_turndown_plugin_gfm.tables);
|
|
279
641
|
var GENERIC_REMOVE = [
|
|
280
642
|
"style",
|
|
281
643
|
"script",
|
|
@@ -293,26 +655,57 @@ function cleanMarkdown(md, textPatterns) {
|
|
|
293
655
|
}
|
|
294
656
|
return cleaned.trim();
|
|
295
657
|
}
|
|
658
|
+
/**
 * Flattens markup inside every connected table under `contentEl`, in place:
 *  - nested tables become text: cells joined by " ", rows joined by "; ";
 *  - <code> elements become their plain text;
 *  - <ul>/<ol> become their item texts joined by "; ";
 *  - aside, blockquote and h1-h6 become their whitespace-collapsed text.
 * Elements already detached by an earlier replacement are skipped.
 *
 * @param {Element} contentEl - root element whose tables are rewritten.
 */
function flattenTableCellMarkup(contentEl) {
  const squash = (el) => el.textContent?.replace(/\s+/g, " ").trim() ?? "";
  for (const table of contentEl.querySelectorAll("table")) {
    if (!table.isConnected) continue;
    for (const nested of table.querySelectorAll("table")) {
      if (!nested.isConnected) continue;
      const rowTexts = [];
      for (const row of [...nested.rows]) {
        const cellTexts = [...row.cells].map((c) => squash(c)).filter(Boolean);
        const rowText = cellTexts.join(" ");
        if (rowText) rowTexts.push(rowText);
      }
      nested.replaceWith(rowTexts.join("; "));
    }
    for (const code of table.querySelectorAll("code")) {
      code.replaceWith(code.textContent ?? "");
    }
    for (const list of table.querySelectorAll("ul, ol")) {
      if (!list.isConnected) continue;
      const itemTexts = [...list.querySelectorAll("li")].map((li) => squash(li)).filter(Boolean);
      list.replaceWith(itemTexts.join("; "));
    }
    for (const block of table.querySelectorAll("aside, blockquote, h1, h2, h3, h4, h5, h6")) {
      if (!block.isConnected) continue;
      block.replaceWith(squash(block));
    }
  }
}
|
|
682
|
+
/**
 * Replaces `<br>` tags (with any surrounding whitespace) by "; " on every line
 * that starts with "|". All other lines are left untouched.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Markdown with table-row breaks rewritten.
 */
function cleanTableRowBreaks(md) {
  const rewritten = [];
  for (const line of md.split("\n")) {
    const isRow = line.startsWith("|");
    rewritten.push(isRow ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line);
  }
  return rewritten.join("\n");
}
|
|
296
687
|
function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
|
|
297
688
|
const dom = new JSDOM(html);
|
|
298
689
|
const doc = dom.window.document;
|
|
299
|
-
const contentEl = doc.querySelector(contentSelector);
|
|
300
|
-
if (!contentEl) {
|
|
301
|
-
return cleanMarkdown(turndown.turndown(doc.body.innerHTML), removeTextPatterns);
|
|
302
|
-
}
|
|
690
|
+
const contentEl = doc.querySelector(contentSelector) ?? doc.body;
|
|
303
691
|
const allSelectors = [...GENERIC_REMOVE, ...removeSelectors ?? []];
|
|
304
692
|
for (const selector of allSelectors) {
|
|
305
693
|
for (const el of contentEl.querySelectorAll(selector)) {
|
|
306
694
|
el.remove();
|
|
307
695
|
}
|
|
308
696
|
}
|
|
309
|
-
|
|
697
|
+
flattenTableCellMarkup(contentEl);
|
|
698
|
+
return cleanMarkdown(
|
|
699
|
+
cleanTableRowBreaks(turndown.turndown(contentEl.innerHTML)),
|
|
700
|
+
removeTextPatterns
|
|
701
|
+
);
|
|
310
702
|
}
|
|
311
703
|
function extractTitle(html) {
|
|
312
704
|
const dom = new JSDOM(html);
|
|
313
|
-
const
|
|
314
|
-
if (!
|
|
315
|
-
|
|
705
|
+
const raw = dom.window.document.querySelector("title")?.textContent?.replace(/\s+/g, " ").trim();
|
|
706
|
+
if (!raw) return "Untitled";
|
|
707
|
+
const stripped = raw.replace(/\s*\|[^|]*$/, "").replace(/\s*[\u2013\u2014][^\u2013\u2014]*$/, "").replace(/\s+-\s+(?!.*\s-\s)[^|]*$/, "").trim();
|
|
708
|
+
return stripped || raw;
|
|
316
709
|
}
|
|
317
710
|
function buildFrontmatter(source, url, title) {
|
|
318
711
|
return [
|
|
@@ -379,6 +772,13 @@ function buildChunkId(source, url, headingSlug, index) {
|
|
|
379
772
|
const base = `${prefix}${truncatedSlug}`;
|
|
380
773
|
return index !== void 0 ? `${base}-${index}` : base;
|
|
381
774
|
}
|
|
775
|
+
/**
 * Builds the text used for embedding a chunk: a "title > heading > …"
 * breadcrumb, a blank line, then the chunk content. A heading path that starts
 * with the title does not repeat it. With no breadcrumb the content is
 * returned unchanged.
 *
 * @param {{title: string, heading_path: string[], content: string}} chunk
 * @returns {string}
 */
function buildEmbedText(chunk) {
  const { title, heading_path: headingPath, content } = chunk;
  const trail = headingPath[0] === title ? headingPath.slice(1) : headingPath;
  const breadcrumb = [title, ...trail].filter(Boolean).join(" > ");
  if (!breadcrumb) return content;
  return `${breadcrumb}\n\n${content}`;
}
|
|
382
782
|
function parseHeadingSections(markdown) {
|
|
383
783
|
const lines = markdown.split("\n");
|
|
384
784
|
const sections = [];
|
|
@@ -431,26 +831,135 @@ function parseHeadingSections(markdown) {
|
|
|
431
831
|
}
|
|
432
832
|
return sections;
|
|
433
833
|
}
|
|
434
|
-
function
|
|
435
|
-
|
|
436
|
-
|
|
834
|
+
/**
 * True when the first non-whitespace character of the line is "|".
 *
 * @param {string} line
 * @returns {boolean}
 */
function isTableLine(line) {
  const unindented = line.trimStart();
  return unindented.charAt(0) === "|";
}
|
|
837
|
+
/**
 * True when the line consists only of whitespace, ":", "|" and "-" characters
 * and contains at least one "-".
 *
 * @param {string} line
 * @returns {boolean}
 */
function isTableSeparator(line) {
  if (!line.includes("-")) return false;
  return /^\s*\|?[\s:|-]+\|?\s*$/.test(line);
}
|
|
840
|
+
function parseBlocks(lines) {
|
|
841
|
+
const blocks = [];
|
|
437
842
|
let current = [];
|
|
438
|
-
let
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
current
|
|
444
|
-
|
|
843
|
+
let kind = "paragraph";
|
|
844
|
+
let inFence = false;
|
|
845
|
+
let fenceMarker = "";
|
|
846
|
+
function flush() {
|
|
847
|
+
while (current.length > 0 && current[current.length - 1].trim() === "") {
|
|
848
|
+
current.pop();
|
|
849
|
+
}
|
|
850
|
+
if (current.length > 0) {
|
|
851
|
+
blocks.push({ kind, lines: current });
|
|
852
|
+
}
|
|
853
|
+
current = [];
|
|
854
|
+
kind = "paragraph";
|
|
855
|
+
}
|
|
856
|
+
for (const line of lines) {
|
|
857
|
+
if (inFence) {
|
|
858
|
+
current.push(line);
|
|
859
|
+
const fenceMatch2 = line.match(/^\s*(```+|~~~+)/);
|
|
860
|
+
if (fenceMatch2 && fenceMatch2[1][0] === fenceMarker) {
|
|
861
|
+
inFence = false;
|
|
862
|
+
flush();
|
|
863
|
+
}
|
|
864
|
+
continue;
|
|
865
|
+
}
|
|
866
|
+
const fenceMatch = line.match(/^\s*(```+|~~~+)/);
|
|
867
|
+
if (fenceMatch) {
|
|
868
|
+
flush();
|
|
869
|
+
kind = "fence";
|
|
870
|
+
inFence = true;
|
|
871
|
+
fenceMarker = fenceMatch[1][0];
|
|
872
|
+
current.push(line);
|
|
873
|
+
continue;
|
|
874
|
+
}
|
|
875
|
+
if (isTableLine(line)) {
|
|
876
|
+
if (kind !== "table") {
|
|
877
|
+
flush();
|
|
878
|
+
kind = "table";
|
|
879
|
+
}
|
|
880
|
+
current.push(line);
|
|
881
|
+
continue;
|
|
882
|
+
}
|
|
883
|
+
if (kind === "table") {
|
|
884
|
+
flush();
|
|
885
|
+
}
|
|
886
|
+
if (line.trim() === "") {
|
|
887
|
+
flush();
|
|
445
888
|
} else {
|
|
446
|
-
current.push(
|
|
447
|
-
currentTokens += paraTokens;
|
|
889
|
+
current.push(line);
|
|
448
890
|
}
|
|
449
891
|
}
|
|
450
|
-
|
|
451
|
-
|
|
892
|
+
flush();
|
|
893
|
+
return blocks;
|
|
894
|
+
}
|
|
895
|
+
/**
 * Packs lines greedily into groups whose estimated token cost stays within
 * `budget`. Each line costs `estimateTokens(line) + 1`. A line that would
 * overflow a non-empty group starts a new one; a single line larger than the
 * budget still gets a group of its own.
 *
 * @param {string[]} lines - lines to pack, in order.
 * @param {number} budget - token budget per group.
 * @returns {string[][]} groups of lines in original order.
 */
function groupLines(lines, budget) {
  const groups = [];
  let bucket = [];
  let used = 0;
  for (const line of lines) {
    const cost = estimateTokens(line) + 1;
    const overflows = used + cost > budget;
    if (overflows && bucket.length > 0) {
      groups.push(bucket);
      bucket = [];
      used = 0;
    }
    bucket.push(line);
    used += cost;
  }
  if (bucket.length > 0) {
    groups.push(bucket);
  }
  return groups;
}
|
|
912
|
+
function splitBlock(block, budget) {
|
|
913
|
+
if (block.kind === "table") {
|
|
914
|
+
const hasHeader = block.lines.length >= 2 && isTableSeparator(block.lines[1]);
|
|
915
|
+
const header = hasHeader ? block.lines.slice(0, 2) : [];
|
|
916
|
+
const rows = hasHeader ? block.lines.slice(2) : block.lines;
|
|
917
|
+
const headerTokens = estimateTokens(header.join("\n"));
|
|
918
|
+
return groupLines(rows, Math.max(budget - headerTokens, 50)).map(
|
|
919
|
+
(group) => [...header, ...group].join("\n")
|
|
920
|
+
);
|
|
452
921
|
}
|
|
453
|
-
|
|
922
|
+
if (block.kind === "fence") {
|
|
923
|
+
const opening = block.lines[0];
|
|
924
|
+
const closing = block.lines[block.lines.length - 1].match(/^\s*(```+|~~~+)\s*$/) ? block.lines[block.lines.length - 1] : opening.match(/^\s*(```+|~~~+)/)[1];
|
|
925
|
+
const body = block.lines.slice(1, block.lines[block.lines.length - 1] === closing ? -1 : void 0);
|
|
926
|
+
const frameTokens = estimateTokens(opening) + estimateTokens(closing) + 2;
|
|
927
|
+
return groupLines(body, Math.max(budget - frameTokens, 50)).map(
|
|
928
|
+
(group) => [opening, ...group, closing].join("\n")
|
|
929
|
+
);
|
|
930
|
+
}
|
|
931
|
+
return groupLines(block.lines, budget).map((group) => group.join("\n"));
|
|
932
|
+
}
|
|
933
|
+
/**
 * Splits a section's blocks into parts that fit the token budget, each part
 * prefixed with the section's heading line.
 *
 * The budget is `maxTokens` minus the heading's estimated tokens, with a floor
 * of 100. Blocks are accumulated, joined by blank lines, until the next one
 * would overflow. A block that is larger than the budget on its own is split
 * by `splitBlock`, and each piece becomes a separate part.
 *
 * @param {{kind: string, lines: string[]}[]} blocks - parsed blocks in order.
 * @param {string} headingLine - heading text prepended to every part (may be "").
 * @param {number} maxTokens - token limit per part.
 * @returns {string[]} the section parts.
 */
function splitSectionIntoParts(blocks, headingLine, maxTokens) {
  const budget = Math.max(maxTokens - estimateTokens(headingLine), 100);
  const parts = [];
  let pending = [];
  let pendingTokens = 0;
  const emitPending = () => {
    if (pending.length === 0) return;
    parts.push(pending);
    pending = [];
    pendingTokens = 0;
  };
  for (const block of blocks) {
    const text = block.lines.join("\n");
    // +2 accounts for the blank-line separator between joined blocks.
    const cost = estimateTokens(text) + 2;
    if (cost > budget) {
      emitPending();
      for (const piece of splitBlock(block, budget)) {
        parts.push([piece]);
      }
      continue;
    }
    if (pendingTokens + cost > budget) {
      emitPending();
    }
    pending.push(text);
    pendingTokens += cost;
  }
  emitPending();
  return parts.map((texts) => headingLine + texts.join("\n\n"));
}
|
|
455
964
|
function stripFrontmatter(markdown) {
|
|
456
965
|
if (markdown.startsWith("---")) {
|
|
@@ -484,11 +993,11 @@ function chunkMarkdown(markdown, source, url, title) {
|
|
|
484
993
|
const headingLine = section.heading ? `${"#".repeat(section.level)} ${section.heading}
|
|
485
994
|
|
|
486
995
|
` : "";
|
|
487
|
-
const
|
|
996
|
+
const body = section.lines.join("\n").trim();
|
|
997
|
+
const content = headingLine + body;
|
|
488
998
|
if (!content.trim()) continue;
|
|
489
999
|
const headingSlug = section.heading ? slugifyHeading(section.heading) : "intro";
|
|
490
|
-
|
|
491
|
-
if (tokens <= MAX_TOKENS) {
|
|
1000
|
+
if (estimateTokens(content) <= MAX_TOKENS) {
|
|
492
1001
|
chunks.push({
|
|
493
1002
|
id: uniqueId(headingSlug),
|
|
494
1003
|
source,
|
|
@@ -496,29 +1005,56 @@ function chunkMarkdown(markdown, source, url, title) {
|
|
|
496
1005
|
title,
|
|
497
1006
|
heading_path: section.headingPath,
|
|
498
1007
|
content,
|
|
499
|
-
token_count:
|
|
1008
|
+
token_count: estimateTokens(content)
|
|
1009
|
+
});
|
|
1010
|
+
continue;
|
|
1011
|
+
}
|
|
1012
|
+
const blocks = parseBlocks(body.split("\n"));
|
|
1013
|
+
const parts = splitSectionIntoParts(blocks, headingLine, MAX_TOKENS);
|
|
1014
|
+
for (let i = 0; i < parts.length; i++) {
|
|
1015
|
+
const partContent = parts[i].trim();
|
|
1016
|
+
if (!partContent) continue;
|
|
1017
|
+
const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
|
|
1018
|
+
chunks.push({
|
|
1019
|
+
id: uniqueId(partSlug),
|
|
1020
|
+
source,
|
|
1021
|
+
url,
|
|
1022
|
+
title,
|
|
1023
|
+
heading_path: section.headingPath,
|
|
1024
|
+
content: partContent,
|
|
1025
|
+
token_count: estimateTokens(partContent)
|
|
500
1026
|
});
|
|
501
|
-
} else {
|
|
502
|
-
const parts = splitAtParagraphBoundaries(content, MAX_TOKENS);
|
|
503
|
-
for (let i = 0; i < parts.length; i++) {
|
|
504
|
-
const partContent = parts[i].trim();
|
|
505
|
-
if (!partContent) continue;
|
|
506
|
-
const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
|
|
507
|
-
chunks.push({
|
|
508
|
-
id: uniqueId(partSlug),
|
|
509
|
-
source,
|
|
510
|
-
url,
|
|
511
|
-
title,
|
|
512
|
-
heading_path: section.headingPath,
|
|
513
|
-
content: partContent,
|
|
514
|
-
token_count: estimateTokens(partContent)
|
|
515
|
-
});
|
|
516
|
-
}
|
|
517
1027
|
}
|
|
518
1028
|
}
|
|
519
1029
|
return chunks;
|
|
520
1030
|
}
|
|
521
1031
|
|
|
1032
|
+
// src/tokens.ts
|
|
1033
|
+
import { createHash } from "node:crypto";
|
|
1034
|
+
var IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
|
|
1035
|
+
var MIN_TOKEN_LENGTH = 4;
|
|
1036
|
+
var MAX_TOKEN_LENGTH = 80;
|
|
1037
|
+
var MAX_TOKENS_PER_CHUNK = 100;
|
|
1038
|
+
/**
 * Prepares Markdown text for identifier extraction:
 *  1. drops link targets, so "](…)" becomes "]";
 *  2. replaces bare http/https URLs with a space;
 *  3. removes the backslash from escaped Markdown punctuation.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeForTokens(text) {
  const withoutLinkTargets = text.replace(/\]\([^)]*\)/g, "]");
  const withoutUrls = withoutLinkTargets.replace(/https?:\/\/\S+/g, " ");
  return withoutUrls.replace(/\\([_*[\]()#`~-])/g, "$1");
}
|
|
1041
|
+
/**
 * Extracts distinct, lower-cased identifier-like tokens from text, in order of
 * first appearance.
 *
 * Matches of `IDENTIFIER_PATTERN` in the normalised text are kept when their
 * length is within [MIN_TOKEN_LENGTH, MAX_TOKEN_LENGTH]. Collection stops as
 * soon as `limit` distinct tokens have been gathered.
 *
 * @param {string} text - source text.
 * @param {number} [limit=MAX_TOKENS_PER_CHUNK] - maximum number of tokens.
 * @returns {string[]}
 */
function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
  const unique = /* @__PURE__ */ new Set();
  const matches = normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN);
  for (const [raw] of matches) {
    const lowered = raw.toLowerCase();
    const lengthOk = lowered.length >= MIN_TOKEN_LENGTH && lowered.length <= MAX_TOKEN_LENGTH;
    if (!lengthOk) continue;
    unique.add(lowered);
    if (unique.size >= limit) break;
  }
  return Array.from(unique);
}
|
|
1051
|
+
/**
 * Extracts identifier tokens from a search query, capped at five tokens.
 *
 * @param {string} query
 * @returns {string[]}
 */
function extractQueryTokens(query) {
  const queryTokenLimit = 5;
  return extractIdentifierTokens(query, queryTokenLimit);
}
|
|
1054
|
+
/**
 * Returns the SHA-256 digest of `text` as a lower-case hex string.
 *
 * @param {string} text
 * @returns {string} 64-character hex digest.
 */
function contentHash(text) {
  const hasher = createHash("sha256");
  hasher.update(text);
  return hasher.digest("hex");
}
|
|
1057
|
+
|
|
522
1058
|
// src/embedder.ts
|
|
523
1059
|
import { GoogleGenerativeAI } from "@google/generative-ai";
|
|
524
1060
|
var BATCH_SIZE = 50;
|
|
@@ -528,7 +1064,6 @@ var MAX_RETRIES = 5;
|
|
|
528
1064
|
var RATE_LIMIT_BASE_DELAY_MS = 6e4;
|
|
529
1065
|
var NETWORK_BASE_DELAY_MS = 1e4;
|
|
530
1066
|
var BATCH_DELAY_MS = 2500;
|
|
531
|
-
var DEFAULT_CHECKPOINT_EVERY_BATCHES = 20;
|
|
532
1067
|
var NETWORK_ERROR_PATTERNS = [
|
|
533
1068
|
"fetch failed",
|
|
534
1069
|
"ECONNRESET",
|
|
@@ -562,22 +1097,10 @@ function classifyError(message) {
|
|
|
562
1097
|
async function embedTexts(texts, options = {}) {
|
|
563
1098
|
const client = getClient();
|
|
564
1099
|
const model = client.getGenerativeModel({ model: MODEL });
|
|
565
|
-
const { onProgress
|
|
566
|
-
const checkpointEveryBatches = options.checkpointEveryBatches ?? DEFAULT_CHECKPOINT_EVERY_BATCHES;
|
|
1100
|
+
const { onProgress } = options;
|
|
567
1101
|
const maxRetries = options.maxRetries ?? MAX_RETRIES;
|
|
568
|
-
const embeddings =
|
|
569
|
-
|
|
570
|
-
if (embeddings.length > startIndex) {
|
|
571
|
-
embeddings.length = startIndex;
|
|
572
|
-
}
|
|
573
|
-
if (startIndex > 0) {
|
|
574
|
-
console.log(` Resuming from chunk ${startIndex} of ${texts.length} (${embeddings.length} cached).`);
|
|
575
|
-
}
|
|
576
|
-
if (startIndex >= texts.length) {
|
|
577
|
-
return embeddings.slice(0, texts.length);
|
|
578
|
-
}
|
|
579
|
-
let batchesSinceCheckpoint = 0;
|
|
580
|
-
for (let i = startIndex; i < texts.length; i += BATCH_SIZE) {
|
|
1102
|
+
const embeddings = [];
|
|
1103
|
+
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
|
|
581
1104
|
const batch = texts.slice(i, i + BATCH_SIZE);
|
|
582
1105
|
const batchNumber = i / BATCH_SIZE + 1;
|
|
583
1106
|
let result;
|
|
@@ -609,18 +1132,10 @@ async function embedTexts(texts, options = {}) {
|
|
|
609
1132
|
embeddings.push(embedding.values);
|
|
610
1133
|
}
|
|
611
1134
|
onProgress?.(Math.min(i + BATCH_SIZE, texts.length), texts.length);
|
|
612
|
-
batchesSinceCheckpoint++;
|
|
613
|
-
if (onCheckpoint && batchesSinceCheckpoint >= checkpointEveryBatches && i + BATCH_SIZE < texts.length) {
|
|
614
|
-
await onCheckpoint(embeddings);
|
|
615
|
-
batchesSinceCheckpoint = 0;
|
|
616
|
-
}
|
|
617
1135
|
if (i + BATCH_SIZE < texts.length) {
|
|
618
1136
|
await new Promise((resolve2) => setTimeout(resolve2, BATCH_DELAY_MS));
|
|
619
1137
|
}
|
|
620
1138
|
}
|
|
621
|
-
if (onCheckpoint) {
|
|
622
|
-
await onCheckpoint(embeddings);
|
|
623
|
-
}
|
|
624
1139
|
return embeddings;
|
|
625
1140
|
}
|
|
626
1141
|
async function embedText(text) {
|
|
@@ -634,19 +1149,34 @@ import {
|
|
|
634
1149
|
getFirestore,
|
|
635
1150
|
FieldValue
|
|
636
1151
|
} from "firebase-admin/firestore";
|
|
637
|
-
var BATCH_SIZE2 =
|
|
638
|
-
var
|
|
1152
|
+
var BATCH_SIZE2 = 200;
|
|
1153
|
+
var INTER_BATCH_DELAY_MS = 150;
|
|
1154
|
+
var MAX_RETRIES2 = 7;
|
|
639
1155
|
var BASE_DELAY_MS = 5e3;
|
|
640
|
-
|
|
1156
|
+
function sleep2(ms) {
|
|
1157
|
+
return new Promise((resolve2) => setTimeout(resolve2, ms));
|
|
1158
|
+
}
|
|
1159
|
+
var TRANSIENT_PATTERNS = [
|
|
1160
|
+
"RESOURCE_EXHAUSTED",
|
|
1161
|
+
"Quota exceeded",
|
|
1162
|
+
"DEADLINE_EXCEEDED",
|
|
1163
|
+
"UNAVAILABLE",
|
|
1164
|
+
"ABORTED",
|
|
1165
|
+
"INTERNAL",
|
|
1166
|
+
"503",
|
|
1167
|
+
"ECONNRESET",
|
|
1168
|
+
"ETIMEDOUT"
|
|
1169
|
+
];
|
|
1170
|
+
async function retryOnTransient(fn) {
|
|
641
1171
|
for (let attempt = 0; attempt < MAX_RETRIES2; attempt++) {
|
|
642
1172
|
try {
|
|
643
1173
|
return await fn();
|
|
644
1174
|
} catch (err) {
|
|
645
1175
|
const message = err instanceof Error ? err.message : String(err);
|
|
646
|
-
if (
|
|
1176
|
+
if (TRANSIENT_PATTERNS.some((p) => message.includes(p)) && attempt < MAX_RETRIES2 - 1) {
|
|
647
1177
|
const delay = BASE_DELAY_MS * Math.pow(2, attempt);
|
|
648
|
-
console.log(`
|
|
649
|
-
await
|
|
1178
|
+
console.log(` Transient Firestore error (${message.split(":")[0].slice(0, 40)}), retrying in ${delay / 1e3}s...`);
|
|
1179
|
+
await sleep2(delay);
|
|
650
1180
|
continue;
|
|
651
1181
|
}
|
|
652
1182
|
throw err;
|
|
@@ -686,12 +1216,17 @@ async function storeChunks(chunks, embeddings, onProgress) {
|
|
|
686
1216
|
heading_path: chunk.heading_path,
|
|
687
1217
|
content: chunk.content,
|
|
688
1218
|
token_count: chunk.token_count,
|
|
1219
|
+
tokens: extractIdentifierTokens(chunk.content),
|
|
1220
|
+
content_hash: contentHash(buildEmbedText(chunk)),
|
|
689
1221
|
embedded_at: (/* @__PURE__ */ new Date()).toISOString(),
|
|
690
1222
|
embedding: FieldValue.vector(embSlice[j])
|
|
691
1223
|
});
|
|
692
1224
|
}
|
|
693
|
-
await
|
|
1225
|
+
await retryOnTransient(() => batch.commit());
|
|
694
1226
|
onProgress?.(Math.min(i + BATCH_SIZE2, chunks.length), chunks.length);
|
|
1227
|
+
if (i + BATCH_SIZE2 < chunks.length) {
|
|
1228
|
+
await sleep2(INTER_BATCH_DELAY_MS);
|
|
1229
|
+
}
|
|
695
1230
|
}
|
|
696
1231
|
}
|
|
697
1232
|
async function purgeSource(sourceName) {
|
|
@@ -705,12 +1240,12 @@ async function purgeSource(sourceName) {
|
|
|
705
1240
|
batch.delete(doc.ref);
|
|
706
1241
|
count++;
|
|
707
1242
|
if (count % BATCH_SIZE2 === 0) {
|
|
708
|
-
await
|
|
1243
|
+
await retryOnTransient(() => batch.commit());
|
|
709
1244
|
batch = database.batch();
|
|
710
1245
|
}
|
|
711
1246
|
}
|
|
712
1247
|
if (count % BATCH_SIZE2 !== 0) {
|
|
713
|
-
await
|
|
1248
|
+
await retryOnTransient(() => batch.commit());
|
|
714
1249
|
}
|
|
715
1250
|
return count;
|
|
716
1251
|
}
|
|
@@ -745,14 +1280,42 @@ async function deleteChunksByIds(ids, onProgress) {
|
|
|
745
1280
|
for (const id of slice) {
|
|
746
1281
|
batch.delete(col.doc(id));
|
|
747
1282
|
}
|
|
748
|
-
await
|
|
1283
|
+
await retryOnTransient(() => batch.commit());
|
|
749
1284
|
onProgress?.(Math.min(i + BATCH_SIZE2, ids.length), ids.length);
|
|
750
1285
|
}
|
|
751
1286
|
}
|
|
752
|
-
async function
|
|
1287
|
+
async function getSourceChunkHashes(sourceName) {
|
|
1288
|
+
const col = chunksCol();
|
|
1289
|
+
const snapshot = await col.where("source", "==", sourceName).select("content_hash").get();
|
|
1290
|
+
return new Map(
|
|
1291
|
+
snapshot.docs.map((doc) => [doc.id, doc.data().content_hash ?? ""])
|
|
1292
|
+
);
|
|
1293
|
+
}
|
|
1294
|
+
var TOKEN_QUERY_LIMIT = 100;
|
|
1295
|
+
async function tokenSearch(tokens, source) {
|
|
1296
|
+
if (tokens.length === 0) return [];
|
|
753
1297
|
const col = chunksCol();
|
|
754
|
-
const
|
|
755
|
-
|
|
1298
|
+
const hits = /* @__PURE__ */ new Map();
|
|
1299
|
+
const snapshots = await Promise.all(
|
|
1300
|
+
tokens.map((token) => {
|
|
1301
|
+
let query = col.where("tokens", "array-contains", token);
|
|
1302
|
+
if (source) {
|
|
1303
|
+
query = query.where("source", "==", source);
|
|
1304
|
+
}
|
|
1305
|
+
return query.limit(TOKEN_QUERY_LIMIT).get();
|
|
1306
|
+
})
|
|
1307
|
+
);
|
|
1308
|
+
for (const snapshot of snapshots) {
|
|
1309
|
+
for (const doc of snapshot.docs) {
|
|
1310
|
+
const existing = hits.get(doc.id);
|
|
1311
|
+
if (existing) {
|
|
1312
|
+
existing.matchedTokens++;
|
|
1313
|
+
} else {
|
|
1314
|
+
hits.set(doc.id, { id: doc.id, data: doc.data(), matchedTokens: 1 });
|
|
1315
|
+
}
|
|
1316
|
+
}
|
|
1317
|
+
}
|
|
1318
|
+
return [...hits.values()].sort((a, b) => b.matchedTokens - a.matchedTokens);
|
|
756
1319
|
}
|
|
757
1320
|
async function vectorSearch(queryEmbedding, limit, source) {
|
|
758
1321
|
const col = chunksCol();
|
|
@@ -798,47 +1361,92 @@ async function rerank(query, documents, topN = 5) {
|
|
|
798
1361
|
}
|
|
799
1362
|
|
|
800
1363
|
// src/search.ts
|
|
1364
|
+
var DEFAULT_CANDIDATES = 50;
|
|
1365
|
+
var RRF_K = 60;
|
|
801
1366
|
function hasReranker() {
|
|
802
1367
|
return !!process.env.RERANKER_URL;
|
|
803
1368
|
}
|
|
1369
|
+
function isVectorLike(value) {
|
|
1370
|
+
return typeof value === "object" && value !== null && typeof value.toArray === "function";
|
|
1371
|
+
}
|
|
1372
|
+
function cosineSimilarity(a, b) {
|
|
1373
|
+
let dot = 0;
|
|
1374
|
+
let normA = 0;
|
|
1375
|
+
let normB = 0;
|
|
1376
|
+
for (let i = 0; i < a.length; i++) {
|
|
1377
|
+
dot += a[i] * b[i];
|
|
1378
|
+
normA += a[i] * a[i];
|
|
1379
|
+
normB += b[i] * b[i];
|
|
1380
|
+
}
|
|
1381
|
+
return dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
1382
|
+
}
|
|
1383
|
+
function contextualText(data) {
|
|
1384
|
+
return buildEmbedText({
|
|
1385
|
+
title: data.title,
|
|
1386
|
+
heading_path: data.heading_path,
|
|
1387
|
+
content: data.content
|
|
1388
|
+
});
|
|
1389
|
+
}
|
|
1390
|
+
function toSearchResult(candidate, relevance) {
|
|
1391
|
+
const data = candidate.data;
|
|
1392
|
+
return {
|
|
1393
|
+
id: candidate.id,
|
|
1394
|
+
source: data.source,
|
|
1395
|
+
url: data.url,
|
|
1396
|
+
title: data.title,
|
|
1397
|
+
heading_path: data.heading_path,
|
|
1398
|
+
content: data.content,
|
|
1399
|
+
relevance_score: relevance
|
|
1400
|
+
};
|
|
1401
|
+
}
|
|
804
1402
|
async function search(query, options = {}) {
|
|
805
|
-
const { source, candidates =
|
|
1403
|
+
const { source, candidates = DEFAULT_CANDIDATES, topN = 5 } = options;
|
|
806
1404
|
const queryEmbedding = await embedText(query);
|
|
807
|
-
const
|
|
808
|
-
|
|
1405
|
+
const [vectorResults, lexicalHits] = await Promise.all([
|
|
1406
|
+
vectorSearch(queryEmbedding, candidates, source),
|
|
1407
|
+
tokenSearch(extractQueryTokens(query), source)
|
|
1408
|
+
]);
|
|
1409
|
+
const pool = /* @__PURE__ */ new Map();
|
|
1410
|
+
vectorResults.forEach((result, rank) => {
|
|
1411
|
+
pool.set(result.id, {
|
|
1412
|
+
id: result.id,
|
|
1413
|
+
data: result.data,
|
|
1414
|
+
fusedScore: 1 / (RRF_K + rank + 1),
|
|
1415
|
+
similarity: 1 - result.distance
|
|
1416
|
+
});
|
|
1417
|
+
});
|
|
1418
|
+
lexicalHits.forEach((hit, rank) => {
|
|
1419
|
+
const lexicalScore = 1 / (RRF_K + rank + 1);
|
|
1420
|
+
const existing = pool.get(hit.id);
|
|
1421
|
+
if (existing) {
|
|
1422
|
+
existing.fusedScore += lexicalScore;
|
|
1423
|
+
return;
|
|
1424
|
+
}
|
|
1425
|
+
const embedding = hit.data.embedding;
|
|
1426
|
+
delete hit.data.embedding;
|
|
1427
|
+
pool.set(hit.id, {
|
|
1428
|
+
id: hit.id,
|
|
1429
|
+
data: hit.data,
|
|
1430
|
+
fusedScore: lexicalScore,
|
|
1431
|
+
similarity: isVectorLike(embedding) ? cosineSimilarity(queryEmbedding, embedding.toArray()) : null
|
|
1432
|
+
});
|
|
1433
|
+
});
|
|
1434
|
+
const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
|
|
1435
|
+
if (fused.length === 0) return [];
|
|
809
1436
|
if (hasReranker()) {
|
|
810
|
-
const
|
|
1437
|
+
const rerankPool = fused.slice(0, candidates);
|
|
1438
|
+
const documents = rerankPool.map((c) => contextualText(c.data));
|
|
811
1439
|
const reranked = await rerank(query, documents, topN);
|
|
812
|
-
return reranked.map((r) =>
|
|
813
|
-
const original = rawResults[r.index];
|
|
814
|
-
const data = original.data;
|
|
815
|
-
return {
|
|
816
|
-
id: original.id,
|
|
817
|
-
source: data.source,
|
|
818
|
-
url: data.url,
|
|
819
|
-
title: data.title,
|
|
820
|
-
heading_path: data.heading_path,
|
|
821
|
-
content: data.content,
|
|
822
|
-
relevance_score: r.relevance_score
|
|
823
|
-
};
|
|
824
|
-
});
|
|
1440
|
+
return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
|
|
825
1441
|
}
|
|
826
|
-
return
|
|
827
|
-
const
|
|
828
|
-
return
|
|
829
|
-
id: r.id,
|
|
830
|
-
source: data.source,
|
|
831
|
-
url: data.url,
|
|
832
|
-
title: data.title,
|
|
833
|
-
heading_path: data.heading_path,
|
|
834
|
-
content: data.content,
|
|
835
|
-
relevance_score: Math.max(0, 1 - r.distance / 2)
|
|
836
|
-
};
|
|
1442
|
+
return fused.slice(0, topN).map((candidate) => {
|
|
1443
|
+
const similarity = candidate.similarity ?? 0;
|
|
1444
|
+
return toSearchResult(candidate, Math.max(0, (1 + similarity) / 2));
|
|
837
1445
|
});
|
|
838
1446
|
}
|
|
839
1447
|
|
|
840
1448
|
// src/apikey.ts
|
|
841
|
-
import { randomBytes, createHash } from "node:crypto";
|
|
1449
|
+
import { randomBytes, createHash as createHash2 } from "node:crypto";
|
|
842
1450
|
import {
|
|
843
1451
|
getFirestore as getFirestore2
|
|
844
1452
|
} from "firebase-admin/firestore";
|
|
@@ -854,7 +1462,7 @@ function getDb2() {
|
|
|
854
1462
|
return db2;
|
|
855
1463
|
}
|
|
856
1464
|
function hashKey(key) {
|
|
857
|
-
return
|
|
1465
|
+
return createHash2("sha256").update(key).digest("hex");
|
|
858
1466
|
}
|
|
859
1467
|
function apiKeysCol() {
|
|
860
1468
|
return getDb2().collection("grimoire_api_keys");
|
|
@@ -1148,89 +1756,85 @@ Source "${name}" added to config/sources.yaml`);
|
|
|
1148
1756
|
await browser.close();
|
|
1149
1757
|
}
|
|
1150
1758
|
}
|
|
1151
|
-
var
|
|
1152
|
-
function
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
const
|
|
1157
|
-
const
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
if (
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
const arr = JSON.parse(data);
|
|
1168
|
-
for (const row of arr) all.push(row);
|
|
1169
|
-
}
|
|
1170
|
-
return all;
|
|
1759
|
+
var EMBED_WINDOW = 1e3;
|
|
1760
|
+
async function syncChunks(sourceName, allChunks, urlCount, version) {
|
|
1761
|
+
console.log(" Comparing with Firestore...");
|
|
1762
|
+
const existing = await getSourceChunkHashes(sourceName);
|
|
1763
|
+
const currentIds = new Set(allChunks.map((c) => c.id));
|
|
1764
|
+
const toDelete = [...existing.keys()].filter((id) => !currentIds.has(id));
|
|
1765
|
+
const toEmbed = allChunks.filter(
|
|
1766
|
+
(chunk) => existing.get(chunk.id) !== contentHash(buildEmbedText(chunk))
|
|
1767
|
+
);
|
|
1768
|
+
console.log(
|
|
1769
|
+
` Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
|
|
1770
|
+
);
|
|
1771
|
+
if (toDelete.length > 0) {
|
|
1772
|
+
await deleteChunksByIds(toDelete, (cur, total) => {
|
|
1773
|
+
console.log(` [${cur}/${total}] deleted`);
|
|
1774
|
+
});
|
|
1171
1775
|
}
|
|
1172
|
-
|
|
1173
|
-
const
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1776
|
+
for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
|
|
1777
|
+
const window = toEmbed.slice(i, i + EMBED_WINDOW);
|
|
1778
|
+
const embeddings = await embedTexts(window.map((c) => buildEmbedText(c)), {
|
|
1779
|
+
onProgress: (done) => {
|
|
1780
|
+
console.log(` [${i + done}/${toEmbed.length}] embedded`);
|
|
1781
|
+
}
|
|
1782
|
+
});
|
|
1783
|
+
await storeChunks(window, embeddings, (cur) => {
|
|
1784
|
+
console.log(` [${i + cur}/${toEmbed.length}] stored`);
|
|
1785
|
+
});
|
|
1177
1786
|
}
|
|
1787
|
+
await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
|
|
1788
|
+
console.log(` Done. ${allChunks.length} chunks live for "${sourceName}".`);
|
|
1178
1789
|
}
|
|
1179
|
-
async function
|
|
1180
|
-
await
|
|
1181
|
-
|
|
1182
|
-
const
|
|
1183
|
-
|
|
1184
|
-
|
|
1790
|
+
async function readCachedMarkdownPages(mdDir) {
|
|
1791
|
+
const mdFiles = await readdir(mdDir).catch(() => []);
|
|
1792
|
+
const pages = [];
|
|
1793
|
+
for (const f of mdFiles.filter((f2) => f2.endsWith(".md"))) {
|
|
1794
|
+
const content = await readFile3(join4(mdDir, f), "utf-8");
|
|
1795
|
+
const urlMatch = content.match(/^url: "(.+)"$/m);
|
|
1796
|
+
const titleMatch = content.match(/^title: "(.+)"$/m);
|
|
1797
|
+
if (!urlMatch) {
|
|
1798
|
+
console.warn(` WARNING: ${f} has no url in frontmatter, skipping page.`);
|
|
1799
|
+
continue;
|
|
1800
|
+
}
|
|
1801
|
+
pages.push({
|
|
1802
|
+
markdown: content,
|
|
1803
|
+
url: urlMatch[1],
|
|
1804
|
+
title: titleMatch?.[1] ?? "Untitled"
|
|
1185
1805
|
});
|
|
1186
1806
|
}
|
|
1187
|
-
|
|
1188
|
-
const part = embeddings.slice(i, i + EMBEDDINGS_PART_SIZE);
|
|
1189
|
-
const index = Math.floor(i / EMBEDDINGS_PART_SIZE);
|
|
1190
|
-
await writeFile4(partPath(cachePath, index), JSON.stringify(part), "utf-8");
|
|
1191
|
-
}
|
|
1192
|
-
}
|
|
1193
|
-
async function embedWithCheckpoint(texts, rawDir, embeddingsCachePath) {
|
|
1194
|
-
await mkdir4(rawDir, { recursive: true });
|
|
1195
|
-
const partialCache = await loadEmbeddingsCache(embeddingsCachePath);
|
|
1196
|
-
const resumeFrom = partialCache && partialCache.length > 0 && partialCache.length < texts.length ? partialCache : void 0;
|
|
1197
|
-
return embedTexts(texts, {
|
|
1198
|
-
onProgress: (done, total) => {
|
|
1199
|
-
console.log(` [${done}/${total}] embedded`);
|
|
1200
|
-
},
|
|
1201
|
-
onCheckpoint: async (current) => {
|
|
1202
|
-
await saveEmbeddingsCache(embeddingsCachePath, current);
|
|
1203
|
-
},
|
|
1204
|
-
resumeFrom
|
|
1205
|
-
});
|
|
1807
|
+
return pages;
|
|
1206
1808
|
}
|
|
1207
|
-
async function
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
const
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1809
|
+
async function recoverUrlsFromHtml(rawDir) {
|
|
1810
|
+
const urlsJsonPath = join4(rawDir, "urls.json");
|
|
1811
|
+
try {
|
|
1812
|
+
return JSON.parse(await readFile3(urlsJsonPath, "utf-8"));
|
|
1813
|
+
} catch {
|
|
1814
|
+
const rawFiles = await readdir(rawDir);
|
|
1815
|
+
const urls = [];
|
|
1816
|
+
let skipped = 0;
|
|
1817
|
+
for (const f of rawFiles.filter((f2) => f2.endsWith(".html"))) {
|
|
1818
|
+
const fileSlug = f.replace(/\.html$/, "");
|
|
1819
|
+
const html = await readFile3(join4(rawDir, f), "utf-8");
|
|
1820
|
+
const match = html.match(/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/);
|
|
1821
|
+
if (match && slugifyUrl(match[1]) === fileSlug) {
|
|
1822
|
+
urls.push(match[1]);
|
|
1823
|
+
continue;
|
|
1824
|
+
}
|
|
1825
|
+
const ogMatch = html.match(/<meta[^>]+property="og:url"[^>]+content="([^"]+)"/);
|
|
1826
|
+
if (ogMatch && slugifyUrl(ogMatch[1]) === fileSlug) {
|
|
1827
|
+
urls.push(ogMatch[1]);
|
|
1828
|
+
continue;
|
|
1829
|
+
}
|
|
1830
|
+
console.warn(` WARNING: cannot recover URL for ${f}, skipping page.`);
|
|
1831
|
+
skipped++;
|
|
1219
1832
|
}
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
} else {
|
|
1225
|
-
console.log(" Purging old chunks...");
|
|
1226
|
-
await purgeSource(sourceName);
|
|
1227
|
-
console.log(" Storing in Firestore...");
|
|
1228
|
-
await storeChunks(allChunks, embeddings, (cur, total) => {
|
|
1229
|
-
console.log(` [${cur}/${total}] stored`);
|
|
1230
|
-
});
|
|
1833
|
+
if (skipped > 0) {
|
|
1834
|
+
console.warn(` Skipped ${skipped} pages with unrecoverable URLs. Provide urls.json to include them.`);
|
|
1835
|
+
}
|
|
1836
|
+
return urls;
|
|
1231
1837
|
}
|
|
1232
|
-
await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
|
|
1233
|
-
console.log(` Done. ${allChunks.length} chunks stored for "${sourceName}".`);
|
|
1234
1838
|
}
|
|
1235
1839
|
async function cmdRefresh() {
|
|
1236
1840
|
const args = parseArgs({
|
|
@@ -1238,12 +1842,10 @@ async function cmdRefresh() {
|
|
|
1238
1842
|
options: {
|
|
1239
1843
|
full: { type: "boolean", default: false },
|
|
1240
1844
|
all: { type: "boolean", default: false },
|
|
1241
|
-
diff: { type: "boolean", default: false },
|
|
1242
1845
|
concurrency: { type: "string" },
|
|
1243
1846
|
limit: { type: "string" },
|
|
1244
1847
|
"from-html": { type: "boolean", default: false },
|
|
1245
1848
|
"from-markdown": { type: "boolean", default: false },
|
|
1246
|
-
"from-embeddings": { type: "boolean", default: false },
|
|
1247
1849
|
"skip-store": { type: "boolean", default: false }
|
|
1248
1850
|
},
|
|
1249
1851
|
allowPositionals: true
|
|
@@ -1251,7 +1853,7 @@ async function cmdRefresh() {
|
|
|
1251
1853
|
const config = await loadConfig(CONFIG_PATH);
|
|
1252
1854
|
const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
|
|
1253
1855
|
if (!args.values.all && !sourcesToRefresh[0]) {
|
|
1254
|
-
console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--
|
|
1856
|
+
console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
|
|
1255
1857
|
process.exit(1);
|
|
1256
1858
|
}
|
|
1257
1859
|
const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
|
|
@@ -1267,7 +1869,6 @@ async function cmdRefresh() {
|
|
|
1267
1869
|
}
|
|
1268
1870
|
const rawDir = join4(DATA_DIR, "raw", sourceName);
|
|
1269
1871
|
const mdDir = join4(DATA_DIR, "markdown", sourceName);
|
|
1270
|
-
const embeddingsCachePath = join4(rawDir, "embeddings.json");
|
|
1271
1872
|
console.log(`
|
|
1272
1873
|
Refreshing "${sourceName}"...`);
|
|
1273
1874
|
if (args.values.full) {
|
|
@@ -1277,75 +1878,16 @@ Refreshing "${sourceName}"...`);
|
|
|
1277
1878
|
await rm(rawDir, { recursive: true, force: true });
|
|
1278
1879
|
await rm(mdDir, { recursive: true, force: true });
|
|
1279
1880
|
}
|
|
1280
|
-
let
|
|
1281
|
-
if (args.values["from-embeddings"]) {
|
|
1282
|
-
console.log(" Loading cached embeddings...");
|
|
1283
|
-
const cached = await loadEmbeddingsCache(embeddingsCachePath);
|
|
1284
|
-
if (!cached) {
|
|
1285
|
-
console.error(" No cached embeddings found. Run without --from-embeddings first.");
|
|
1286
|
-
process.exit(1);
|
|
1287
|
-
}
|
|
1288
|
-
const mdFiles = await readdir(mdDir);
|
|
1289
|
-
const allPages = [];
|
|
1290
|
-
for (const f of mdFiles.filter((f2) => f2.endsWith(".md"))) {
|
|
1291
|
-
const content = await readFile3(join4(mdDir, f), "utf-8");
|
|
1292
|
-
const urlMatch = content.match(/^url: "(.+)"$/m);
|
|
1293
|
-
const titleMatch = content.match(/^title: "(.+)"$/m);
|
|
1294
|
-
allPages.push({
|
|
1295
|
-
markdown: content,
|
|
1296
|
-
url: urlMatch?.[1] ?? "",
|
|
1297
|
-
title: titleMatch?.[1] ?? "Untitled"
|
|
1298
|
-
});
|
|
1299
|
-
}
|
|
1300
|
-
console.log(" Chunking...");
|
|
1301
|
-
const allChunks2 = allPages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
|
|
1302
|
-
console.log(` Created ${allChunks2.length} chunks.`);
|
|
1303
|
-
if (cached.length !== allChunks2.length) {
|
|
1304
|
-
console.error(` Embeddings cache (${cached.length}) doesn't match chunk count (${allChunks2.length}). Re-embed with --from-html.`);
|
|
1305
|
-
process.exit(1);
|
|
1306
|
-
}
|
|
1307
|
-
if (args.values["skip-store"]) {
|
|
1308
|
-
console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
|
|
1309
|
-
continue;
|
|
1310
|
-
}
|
|
1311
|
-
await storeWithStrategy(sourceName, allChunks2, cached, allPages.length, source.version, args.values.diff);
|
|
1312
|
-
continue;
|
|
1313
|
-
}
|
|
1881
|
+
let pages;
|
|
1314
1882
|
if (args.values["from-markdown"]) {
|
|
1315
1883
|
console.log(" Reading cached markdown...");
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
if (markdownFiles.length === 0) {
|
|
1884
|
+
pages = await readCachedMarkdownPages(mdDir);
|
|
1885
|
+
if (pages.length === 0) {
|
|
1319
1886
|
console.error(" No cached markdown found. Run with --from-html first.");
|
|
1320
1887
|
process.exit(1);
|
|
1321
1888
|
}
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
const content = await readFile3(join4(mdDir, f), "utf-8");
|
|
1325
|
-
const urlMatch = content.match(/^url: "(.+)"$/m);
|
|
1326
|
-
const titleMatch = content.match(/^title: "(.+)"$/m);
|
|
1327
|
-
pages2.push({
|
|
1328
|
-
markdown: content,
|
|
1329
|
-
url: urlMatch?.[1] ?? "",
|
|
1330
|
-
title: titleMatch?.[1] ?? "Untitled"
|
|
1331
|
-
});
|
|
1332
|
-
}
|
|
1333
|
-
console.log(` Found ${pages2.length} cached pages.`);
|
|
1334
|
-
console.log(" Chunking...");
|
|
1335
|
-
const allChunks2 = pages2.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
|
|
1336
|
-
console.log(` Created ${allChunks2.length} chunks.`);
|
|
1337
|
-
console.log(" Embedding chunks...");
|
|
1338
|
-
const texts2 = allChunks2.map((c) => c.content);
|
|
1339
|
-
const embeddings2 = await embedWithCheckpoint(texts2, rawDir, embeddingsCachePath);
|
|
1340
|
-
if (args.values["skip-store"]) {
|
|
1341
|
-
console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
|
|
1342
|
-
continue;
|
|
1343
|
-
}
|
|
1344
|
-
await storeWithStrategy(sourceName, allChunks2, embeddings2, pages2.length, source.version, args.values.diff);
|
|
1345
|
-
continue;
|
|
1346
|
-
}
|
|
1347
|
-
let pages;
|
|
1348
|
-
if (source.llms_full_url && !args.values["from-html"]) {
|
|
1889
|
+
console.log(` Found ${pages.length} cached pages.`);
|
|
1890
|
+
} else if (source.llms_full_url && !args.values["from-html"]) {
|
|
1349
1891
|
console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
|
|
1350
1892
|
pages = await ingestLlmsFull(
|
|
1351
1893
|
source.llms_full_url,
|
|
@@ -1358,32 +1900,10 @@ Refreshing "${sourceName}"...`);
|
|
|
1358
1900
|
);
|
|
1359
1901
|
console.log(` Extracted ${pages.length} pages.`);
|
|
1360
1902
|
} else {
|
|
1903
|
+
let urls;
|
|
1361
1904
|
if (args.values["from-html"]) {
|
|
1362
1905
|
console.log(" Reading URLs from cached HTML...");
|
|
1363
|
-
|
|
1364
|
-
try {
|
|
1365
|
-
urls = JSON.parse(await readFile3(urlsJsonPath, "utf-8"));
|
|
1366
|
-
} catch {
|
|
1367
|
-
const rawFiles = await readdir(rawDir);
|
|
1368
|
-
const htmlFiles = rawFiles.filter((f) => f.endsWith(".html"));
|
|
1369
|
-
urls = [];
|
|
1370
|
-
for (const f of htmlFiles) {
|
|
1371
|
-
const fileSlug = f.replace(/\.html$/, "");
|
|
1372
|
-
const htmlPath = join4(rawDir, f);
|
|
1373
|
-
const html = await readFile3(htmlPath, "utf-8");
|
|
1374
|
-
const match = html.match(/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/);
|
|
1375
|
-
if (match && slugifyUrl(match[1]) === fileSlug) {
|
|
1376
|
-
urls.push(match[1]);
|
|
1377
|
-
continue;
|
|
1378
|
-
}
|
|
1379
|
-
const ogMatch = html.match(/<meta[^>]+property="og:url"[^>]+content="([^"]+)"/);
|
|
1380
|
-
if (ogMatch && slugifyUrl(ogMatch[1]) === fileSlug) {
|
|
1381
|
-
urls.push(ogMatch[1]);
|
|
1382
|
-
continue;
|
|
1383
|
-
}
|
|
1384
|
-
urls.push(`https://recovered/${fileSlug}`);
|
|
1385
|
-
}
|
|
1386
|
-
}
|
|
1906
|
+
urls = await recoverUrlsFromHtml(rawDir);
|
|
1387
1907
|
console.log(` Found ${urls.length} cached pages.`);
|
|
1388
1908
|
} else {
|
|
1389
1909
|
console.log(" Scraping URLs...");
|
|
@@ -1413,14 +1933,11 @@ Refreshing "${sourceName}"...`);
|
|
|
1413
1933
|
console.log(" Chunking...");
|
|
1414
1934
|
const allChunks = pages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
|
|
1415
1935
|
console.log(` Created ${allChunks.length} chunks.`);
|
|
1416
|
-
console.log(" Embedding chunks...");
|
|
1417
|
-
const texts = allChunks.map((c) => c.content);
|
|
1418
|
-
const embeddings = await embedWithCheckpoint(texts, rawDir, embeddingsCachePath);
|
|
1419
1936
|
if (args.values["skip-store"]) {
|
|
1420
|
-
console.log(` Done. ${allChunks.length} chunks ready (
|
|
1937
|
+
console.log(` Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
|
|
1421
1938
|
continue;
|
|
1422
1939
|
}
|
|
1423
|
-
await
|
|
1940
|
+
await syncChunks(sourceName, allChunks, pages.length, source.version);
|
|
1424
1941
|
}
|
|
1425
1942
|
}
|
|
1426
1943
|
async function cmdSearch() {
|
|
@@ -1429,17 +1946,19 @@ async function cmdSearch() {
|
|
|
1429
1946
|
options: {
|
|
1430
1947
|
source: { type: "string" },
|
|
1431
1948
|
top: { type: "string" },
|
|
1949
|
+
candidates: { type: "string" },
|
|
1432
1950
|
compact: { type: "boolean", default: false }
|
|
1433
1951
|
},
|
|
1434
1952
|
allowPositionals: true
|
|
1435
1953
|
});
|
|
1436
1954
|
const query = args.positionals.join(" ");
|
|
1437
1955
|
if (!query) {
|
|
1438
|
-
console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--compact]');
|
|
1956
|
+
console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--candidates <n>] [--compact]');
|
|
1439
1957
|
process.exit(1);
|
|
1440
1958
|
}
|
|
1441
1959
|
const topN = args.values.top ? parseInt(args.values.top, 10) : void 0;
|
|
1442
|
-
const
|
|
1960
|
+
const candidates = args.values.candidates ? parseInt(args.values.candidates, 10) : void 0;
|
|
1961
|
+
const results = await search(query, { source: args.values.source, topN, candidates });
|
|
1443
1962
|
if (results.length === 0) {
|
|
1444
1963
|
console.log("No results found.");
|
|
1445
1964
|
return;
|
|
@@ -1626,4 +2145,4 @@ var ADMIN_COMMANDS = {
|
|
|
1626
2145
|
export {
|
|
1627
2146
|
ADMIN_COMMANDS
|
|
1628
2147
|
};
|
|
1629
|
-
//# sourceMappingURL=admin-
|
|
2148
|
+
//# sourceMappingURL=admin-YF2OKHEQ.js.map
|