@astrofoundry/grimoire 3.30.0 → 3.30.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,372 @@
1
1
  import {
2
+ __commonJS,
3
+ __toESM,
2
4
  bold,
3
5
  cyan,
4
6
  yellow
5
- } from "./chunk-BRS6X3AE.js";
7
+ } from "./chunk-R46N6C3C.js";
8
+
9
+ // node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
10
+ var require_turndown_plugin_gfm_cjs = __commonJS({
11
+ "node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js"(exports) {
12
+ "use strict";
13
+ Object.defineProperty(exports, "__esModule", { value: true });
14
+ var highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/;
15
/**
 * Registers the "highlightedCodeBlock" rule: a DIV whose class matches
 * `highlightRegExp` and whose first child is a PRE becomes a fenced block
 * tagged with the language captured from the class name.
 */
function highlightedCodeBlock(turndownService) {
  const matchesWrapper = (node) => {
    if (node.nodeName !== "DIV" || !highlightRegExp.test(node.className)) return false;
    const pre = node.firstChild;
    // Keep `&&` semantics: a missing first child is returned as-is (falsy).
    return pre && pre.nodeName === "PRE";
  };
  const toFence = (content, node, options) => {
    const found = (node.className || "").match(highlightRegExp);
    const language = found ? found[1] : "";
    return `\n\n${options.fence}${language}\n${node.firstChild.textContent}\n${options.fence}\n\n`;
  };
  turndownService.addRule("highlightedCodeBlock", {
    filter: matchesWrapper,
    replacement: toFence
  });
}
28
/** Registers the "strikethrough" rule: del/s/strike content is wrapped in "~~". */
function strikethrough(turndownService) {
  const wrap = (content) => `~~${content}~~`;
  turndownService.addRule("strikethrough", {
    filter: ["del", "s", "strike"],
    replacement: wrap
  });
}
36
// Array helpers borrowed so they can be applied to node lists via .call().
const { indexOf, every } = Array.prototype;
// Table-related rules, registered on a service by tables2().
const rules = {};
// Markdown divider cell for each supported column alignment.
const alignMap = { left: ":---", right: "---:", center: ":---:" };
// Both are assigned by tables2() from the service being configured.
let isCodeBlock_ = null;
let options_ = null;
// Memoised tableShouldBeSkipped() results, keyed by table node.
const tableShouldBeSkippedCache_ = /* @__PURE__ */ new WeakMap();
43
/**
 * Returns the lower-cased alignment of a cell, read from its `align`
 * attribute first and its inline `text-align` style second; "" when the
 * node is missing or neither is set.
 */
function getAlignment(node) {
  if (!node) return "";
  const declared = node.getAttribute("align") || node.style.textAlign || "";
  return declared.toLowerCase();
}
46
/**
 * Maps an alignment keyword to its Markdown divider cell.
 *
 * @param {string} alignment - "left", "right", "center" or "".
 * @returns {string} The divider from `alignMap`, or the neutral "---" when
 *   the alignment is empty or not one of alignMap's own keys (e.g. "justify"),
 *   so callers never receive `undefined` or an inherited property.
 */
function getBorder(alignment) {
  if (!alignment) return "---";
  const known = Object.prototype.hasOwnProperty.call(alignMap, alignment);
  return known ? alignMap[alignment] : "---";
}
49
/**
 * Picks the alignment for one column by majority vote over every row that
 * has a child at `columnIndex`. The first alignment to take a strict lead
 * wins ties; the default is "" (no alignment).
 */
function getColumnAlignment(table, columnIndex) {
  const tally = { left: 0, right: 0, center: 0, "": 0 };
  let leader = "";
  for (let r = 0; r < table.rows.length; r++) {
    const cells = table.rows[r].childNodes;
    if (columnIndex >= cells.length) continue;
    const vote = getAlignment(cells[columnIndex]);
    tally[vote]++;
    if (tally[vote] > tally[leader]) leader = vote;
  }
  return leader;
}
69
// th/td: pass content through untouched for skipped tables, otherwise
// format it as a Markdown table cell.
rules.tableCell = {
  filter: ["th", "td"],
  replacement(content, node) {
    const owner = nodeParentTable(node);
    return tableShouldBeSkipped(owner) ? content : cell(content, node);
  }
};
76
// tr: emits the row on its own line; a heading row is followed by a divider
// line with one alignment cell per table column.
rules.tableRow = {
  filter: "tr",
  replacement(content, node) {
    const table = nodeParentTable(node);
    if (tableShouldBeSkipped(table)) return content;
    let divider = "";
    if (isHeadingRow(node)) {
      const width = tableColCount(table);
      for (let col = 0; col < width; col++) {
        // Rows narrower than the table still get a divider cell per column.
        const headerCell = col < node.childNodes.length ? node.childNodes[col] : null;
        const border = getBorder(getColumnAlignment(table, col));
        divider += cell(border, headerCell, col);
      }
    }
    return divider ? `\n${content}\n${divider}` : `\n${content}`;
  }
};
93
// table: emits raw HTML when tableShouldBeHtml() says so, otherwise a
// Markdown table. When the converted rows do not already start with a header
// row followed by a divider line, an empty header plus divider is prepended.
rules.table = {
  filter: function(node) {
    return node.nodeName === "TABLE";
  },
  replacement: function(content, node) {
    if (tableShouldBeHtml(node, options_)) {
      const html = node.outerHTML;
      const divParent = nodeParentDiv(node);
      // Wrap only when the nearest enclosing DIV is not already the wrapper.
      if (divParent === null || !divParent.classList.contains("joplin-table-wrapper")) {
        return `\n\n<div class="joplin-table-wrapper">${html}</div>\n\n`;
      }
      return html;
    }
    if (tableShouldBeSkipped(node)) return content;
    content = content.replace(/\n+/g, "\n");
    // A one-row table has no second line. Treat that as "no divider" instead
    // of testing the regex against the stringified line array, which wrongly
    // matched single rows whose cells begin with "---".
    const lines = content.trim().split("\n");
    const secondLine = lines.length >= 2 ? lines[1] : "";
    const secondLineIsDivider = /\| :?---/.test(secondLine);
    const columnCount = tableColCount(node);
    let emptyHeader = "";
    if (columnCount && !secondLineIsDivider) {
      emptyHeader = "|" + " |".repeat(columnCount) + "\n|";
      for (let columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
        emptyHeader += " " + getBorder(getColumnAlignment(node, columnIndex)) + " |";
      }
    }
    // The caption rule renders captions as "", so the text is re-read here
    // and placed above the table.
    const captionNode = node.querySelector ? node.querySelector("caption") : node.caption;
    const captionContent = captionNode ? captionNode.textContent || "" : "";
    const caption = captionContent ? `${captionContent}\n\n` : "";
    const tableContent = `${emptyHeader}${content}`.trimStart();
    return `\n\n${caption}${tableContent}\n\n`;
  }
};
138
// Captions, colgroups and cols render as nothing at their own position.
const renderNothing = () => "";
rules.tableCaption = { filter: ["caption"], replacement: renderNothing };
rules.tableColgroup = { filter: ["colgroup", "col"], replacement: renderNothing };
// thead/tbody/tfoot are transparent: their converted children pass through.
rules.tableSection = {
  filter: ["thead", "tbody", "tfoot"],
  replacement: (content) => content
};
152
/**
 * A row is a heading row when it sits in a THEAD, or when it is the first
 * child of the TABLE / first TBODY and every one of its children is a TH.
 */
function isHeadingRow(tr) {
  const container = tr.parentNode;
  if (container.nodeName === "THEAD") return true;
  if (container.firstChild !== tr) return false;
  if (container.nodeName !== "TABLE" && !isFirstTbody(container)) return false;
  return every.call(tr.childNodes, (child) => child.nodeName === "TH");
}
158
/**
 * True for a TBODY that has no previous sibling, or whose previous sibling
 * is a THEAD containing only whitespace.
 */
function isFirstTbody(element) {
  if (element.nodeName !== "TBODY") return false;
  const before = element.previousSibling;
  if (!before) return true;
  return before.nodeName === "THEAD" && /^\s*$/i.test(before.textContent);
}
162
/**
 * Formats converted content as one Markdown table cell.
 *
 * @param {string} content - Already-converted cell content.
 * @param {?Node} node - Source cell; used for colspan expansion and, when
 *   `index` is null, to look up the cell's position in its parent.
 * @param {?number} index - Column index; the first column gets the leading "|".
 * @returns {string} The cell text followed by " |".
 */
function cell(content, node = null, index = null) {
  if (index === null) index = indexOf.call(node.parentNode.childNodes, node);
  const prefix = index === 0 ? "| " : " ";
  // Every line-break flavour (CRLF, LFCR, bare CR or LF) becomes one <br>;
  // the previous /\n\r/ pattern left a stray "\r" behind for CRLF input.
  let filteredContent = content.trim().replace(/\r\n|\n\r|\r|\n/g, "<br>");
  // A run of pipes is replaced by a single escaped pipe so it cannot end the cell.
  filteredContent = filteredContent.replace(/\|+/g, "\\|");
  // Pad to at least three characters.
  filteredContent = filteredContent.padEnd(3, " ");
  if (node) filteredContent = handleColSpan(filteredContent, node, " ");
  return prefix + filteredContent + " |";
}
172
/** Depth-first check for a TABLE element anywhere below `node`. */
function nodeContainsTable(node) {
  const children = node.childNodes;
  if (!children) return false;
  return Array.prototype.some.call(
    children,
    (child) => child.nodeName === "TABLE" || nodeContainsTable(child)
  );
}
181
/**
 * Depth-first search below `node` for a descendant whose nodeName is listed
 * in `types`. The special value "code" additionally matches any descendant
 * accepted by the isCodeBlock_ hook, when one is installed.
 */
const nodeContains = (node, types) => {
  const children = node.childNodes;
  if (!children) return false;
  return Array.prototype.some.call(children, (child) => {
    if (types === "code" && isCodeBlock_ && isCodeBlock_(child)) return true;
    return types.includes(child.nodeName) || nodeContains(child, types);
  });
};
191
// Inline style properties that count as custom table formatting.
const customStyleProperties = [
  "background-color", "background",
  "border-color", "border",
  "border-top", "border-right", "border-bottom", "border-left",
  "border-style", "border-width",
  "padding", "padding-top", "padding-right", "padding-bottom", "padding-left",
  "float",
  "margin-left", "margin-right"
];
// Presentational attributes that count as custom formatting when non-empty.
const customAttributeNames = ["bgcolor", "bordercolor", "background"];
216
/**
 * True when the node's inline `style` attribute declares at least one
 * property listed in customStyleProperties (names compared lower-cased).
 */
const nodeHasCustomStyle = (node) => {
  if (!node || !node.getAttribute) return false;
  const inlineStyle = node.getAttribute("style");
  if (!inlineStyle) return false;
  return inlineStyle.split(";").some((declaration) => {
    const property = declaration.split(":")[0].trim().toLowerCase();
    return property.length > 0 && customStyleProperties.includes(property);
  });
};
226
/**
 * True when attribute `name` is present on the node with a value other than
 * blank, "0" or "0px" (compared trimmed and lower-cased).
 */
const hasNonDefaultSpacingAttribute = (node, name) => {
  if (!node || !node.getAttribute) return false;
  const raw = node.getAttribute(name);
  if (raw === null) return false;
  const spacing = `${raw}`.trim().toLowerCase();
  return spacing !== "" && spacing !== "0" && spacing !== "0px";
};
235
/**
 * True when the node carries a non-blank attribute from customAttributeNames,
 * or is a TABLE with a non-default cellpadding/cellspacing.
 */
const nodeHasCustomAttributes = (node) => {
  if (!node || !node.getAttribute) return false;
  const hasValue = (attribute) => {
    const value = node.getAttribute(attribute);
    return value !== null && `${value}`.trim() !== "";
  };
  if (customAttributeNames.some(hasValue)) return true;
  if (node.nodeName !== "TABLE") return false;
  return hasNonDefaultSpacingAttribute(node, "cellpadding")
    || hasNonDefaultSpacingAttribute(node, "cellspacing");
};
247
/** True when the node has custom inline styles or custom attributes. */
const nodeHasCustomFormatting = (node) => {
  if (nodeHasCustomStyle(node)) return true;
  return nodeHasCustomAttributes(node);
};
250
/**
 * True when the table itself, any of its rows, or any TD/TH directly inside
 * a row carries custom formatting.
 */
const tableHasCustomStyles = (tableNode) => {
  if (nodeHasCustomFormatting(tableNode)) return true;
  if (!tableNode.rows) return false;
  const isFormattedCell = (child) =>
    (child.nodeName === "TD" || child.nodeName === "TH") && nodeHasCustomFormatting(child);
  return Array.prototype.some.call(
    tableNode.rows,
    (row) => nodeHasCustomFormatting(row) || Array.prototype.some.call(row.childNodes, isFormattedCell)
  );
};
266
/**
 * Decides whether a table must be kept as raw HTML: it contains a code block,
 * one of the listed block-level tags (plus nested tables when
 * options.preserveNestedTables is set), or custom styling when
 * options.preserveTableStyles is set. The final clause is returned as-is and
 * may therefore be a falsy non-boolean.
 */
const tableShouldBeHtml = (tableNode, options) => {
  const blockTags = ["UL", "OL", "H1", "H2", "H3", "H4", "H5", "H6", "HR", "BLOCKQUOTE"];
  if (options.preserveNestedTables) blockTags.push("TABLE");
  if (nodeContains(tableNode, "code")) return true;
  if (nodeContains(tableNode, blockTags)) return true;
  return options.preserveTableStyles && tableHasCustomStyles(tableNode);
};
282
/**
 * Memoised wrapper around tableShouldBeSkipped_().
 *
 * @param {?Node} tableNode - Table to test; may be null/undefined when a
 *   cell or row has no enclosing table.
 * @returns {boolean} True when the table should not be rendered as Markdown.
 */
function tableShouldBeSkipped(tableNode) {
  // A missing table is always skipped. Answer before touching the cache:
  // WeakMap keys must be objects, so caching under null would throw.
  if (!tableNode) return true;
  const cached = tableShouldBeSkippedCache_.get(tableNode);
  if (cached !== void 0) return cached;
  const result = tableShouldBeSkipped_(tableNode);
  tableShouldBeSkippedCache_.set(tableNode, result);
  return result;
}
289
/**
 * Uncached skip test: a table is skipped when it is missing, exposes no
 * rows, consists of one row with at most one child, or contains another table.
 */
function tableShouldBeSkipped_(tableNode) {
  const rows = tableNode ? tableNode.rows : null;
  if (!rows) return true;
  const isSingleCell = rows.length === 1 && rows[0].childNodes.length <= 1;
  return isSingleCell || nodeContainsTable(tableNode);
}
296
/**
 * Finds the nearest DIV ancestor of a node.
 *
 * @param {Node} node - Starting node (not itself considered).
 * @returns {?Node} The closest ancestor whose nodeName is "DIV", or null
 *   when there is none, including when the node has no parent at all
 *   (previously that case threw on `parent.nodeName`).
 */
function nodeParentDiv(node) {
  let ancestor = node.parentNode;
  while (ancestor && ancestor.nodeName !== "DIV") {
    ancestor = ancestor.parentNode;
  }
  return ancestor || null;
}
304
/**
 * Finds the nearest TABLE ancestor of a node.
 *
 * @param {Node} node - Starting node (not itself considered).
 * @returns {?Node} The closest ancestor whose nodeName is "TABLE", or null
 *   when there is none, including when the node has no parent at all
 *   (previously that case threw on `parent.nodeName`).
 */
function nodeParentTable(node) {
  let ancestor = node.parentNode;
  while (ancestor && ancestor.nodeName !== "TABLE") {
    ancestor = ancestor.parentNode;
  }
  return ancestor || null;
}
312
/**
 * Appends one empty filler cell per extra column spanned by `node`.
 *
 * @param {string} content - Cell content built so far.
 * @param {Element} node - Source cell whose `colspan` attribute is read.
 * @param {string} emptyChar - Character repeated three times per filler cell.
 * @returns {string} Content followed by `colspan - 1` filler cells.
 */
function handleColSpan(content, node, emptyChar) {
  // Parse the attribute as an integer and clamp it to 1..1000, the range
  // HTML allows for colspan. The raw string was previously used as a loop
  // bound, so "2.5" produced an extra column and a huge value from
  // untrusted markup built an enormous row.
  const parsed = Number.parseInt(node.getAttribute("colspan"), 10);
  const span = Number.isNaN(parsed) ? 1 : Math.min(Math.max(parsed, 1), 1000);
  const filler = " | " + emptyChar.repeat(3);
  return content + filler.repeat(span - 1);
}
319
/** Returns the largest child-node count found among the table's rows (0 if none). */
function tableColCount(node) {
  let widest = 0;
  Array.prototype.forEach.call(node.rows, (row) => {
    const width = row.childNodes.length;
    if (width > widest) widest = width;
  });
  return widest;
}
328
/**
 * Installs table support on a turndown service: captures its isCodeBlock
 * hook and options in module state, keeps tables that must stay HTML, and
 * registers every rule collected in `rules`.
 */
function tables2(turndownService) {
  isCodeBlock_ = turndownService.isCodeBlock;
  options_ = turndownService.options;
  const mustStayHtml = (node) =>
    node.nodeName === "TABLE" && Boolean(tableShouldBeHtml(node, turndownService.options));
  turndownService.keep(mustStayHtml);
  for (const name in rules) {
    turndownService.addRule(name, rules[name]);
  }
}
337
/**
 * Registers the "taskListItems" rule: a checkbox (input type or ARIA role)
 * that sits directly in an LI, or in a LABEL/SPAN whose parent is an LI,
 * renders as "[x] " or "[ ] ".
 */
function taskListItems(turndownService) {
  const isTaskCheckbox = (node) => {
    const holder = node.parentNode;
    const outer = holder.parentNode;
    const isCheckbox = node.type === "checkbox" || node.getAttribute("role") === "checkbox";
    if (!isCheckbox) return false;
    if (holder.nodeName === "LI") return true;
    const insideListItem = !!outer && outer.nodeName === "LI";
    return (holder.nodeName === "LABEL" || holder.nodeName === "SPAN") && insideListItem;
  };
  const renderBox = (content, node) => {
    // Native inputs expose `checked`; other nodes are read via aria-checked.
    const ticked = node.nodeName === "INPUT"
      ? node.checked
      : node.getAttribute("aria-checked") === "true";
    return ticked ? "[x] " : "[ ] ";
  };
  turndownService.addRule("taskListItems", { filter: isTaskCheckbox, replacement: renderBox });
}
351
/** Installs all four plugins on the service in one `use` call. */
function gfm(turndownService) {
  const plugins = [highlightedCodeBlock, strikethrough, tables2, taskListItems];
  turndownService.use(plugins);
}
359
+ exports.gfm = gfm;
360
+ exports.highlightedCodeBlock = highlightedCodeBlock;
361
+ exports.strikethrough = strikethrough;
362
+ exports.tables = tables2;
363
+ exports.taskListItems = taskListItems;
364
+ }
365
+ });
6
366
 
7
367
  // src/admin.ts
8
368
  import { parseArgs } from "node:util";
9
- import { readFile as readFile3, writeFile as writeFile4, readdir, rm, mkdir as mkdir4, unlink } from "node:fs/promises";
369
+ import { readFile as readFile3, writeFile as writeFile4, readdir, rm } from "node:fs/promises";
10
370
  import { existsSync } from "node:fs";
11
371
  import { join as join4, resolve } from "node:path";
12
372
  import { createInterface } from "node:readline";
@@ -267,6 +627,7 @@ async function createBrowser() {
267
627
  }
268
628
 
269
629
  // src/converter.ts
630
+ var import_turndown_plugin_gfm = __toESM(require_turndown_plugin_gfm_cjs(), 1);
270
631
  import { readFile as readFile2, writeFile as writeFile2, mkdir as mkdir2 } from "node:fs/promises";
271
632
  import { join as join2 } from "node:path";
272
633
  import { JSDOM } from "jsdom";
@@ -276,6 +637,7 @@ var turndown = new TurndownService({
276
637
  codeBlockStyle: "fenced",
277
638
  bulletListMarker: "-"
278
639
  });
640
+ turndown.use(import_turndown_plugin_gfm.tables);
279
641
  var GENERIC_REMOVE = [
280
642
  "style",
281
643
  "script",
@@ -293,26 +655,57 @@ function cleanMarkdown(md, textPatterns) {
293
655
  }
294
656
  return cleaned.trim();
295
657
  }
658
/**
 * Replaces markup inside tables with plain text, in place: nested tables,
 * code elements, lists, and aside/blockquote/heading blocks. Nodes already
 * detached by an earlier replacement are skipped.
 */
function flattenTableCellMarkup(contentEl) {
  const squash = (node) => node.textContent?.replace(/\s+/g, " ").trim() ?? "";
  const joinNonEmpty = (nodes, separator) =>
    Array.from(nodes, (node) => squash(node)).filter(Boolean).join(separator);

  for (const table of contentEl.querySelectorAll("table")) {
    if (!table.isConnected) continue;

    // Nested tables: cells joined by spaces, rows joined by "; ".
    table.querySelectorAll("table").forEach((inner) => {
      if (!inner.isConnected) return;
      const rowTexts = Array.from(inner.rows, (row) => joinNonEmpty(row.cells, " "));
      inner.replaceWith(rowTexts.filter(Boolean).join("; "));
    });

    // Code keeps its text verbatim (no whitespace squashing).
    table.querySelectorAll("code").forEach((code) => {
      code.replaceWith(code.textContent ?? "");
    });

    // Lists: items joined by "; ".
    table.querySelectorAll("ul, ol").forEach((list) => {
      if (!list.isConnected) return;
      list.replaceWith(joinNonEmpty(list.querySelectorAll("li"), "; "));
    });

    table.querySelectorAll("aside, blockquote, h1, h2, h3, h4, h5, h6").forEach((block) => {
      if (!block.isConnected) return;
      block.replaceWith(squash(block));
    });
  }
}
682
/**
 * Rewrites `<br>` tags (with surrounding whitespace) to "; " on every line
 * that starts with "|"; all other lines are left untouched.
 */
function cleanTableRowBreaks(md) {
  const lineBreakTag = /\s*<br\s*\/?>\s*/g;
  const rewritten = [];
  for (const line of md.split("\n")) {
    rewritten.push(line.startsWith("|") ? line.replace(lineBreakTag, "; ") : line);
  }
  return rewritten.join("\n");
}
296
687
  function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
297
688
  const dom = new JSDOM(html);
298
689
  const doc = dom.window.document;
299
- const contentEl = doc.querySelector(contentSelector);
300
- if (!contentEl) {
301
- return cleanMarkdown(turndown.turndown(doc.body.innerHTML), removeTextPatterns);
302
- }
690
+ const contentEl = doc.querySelector(contentSelector) ?? doc.body;
303
691
  const allSelectors = [...GENERIC_REMOVE, ...removeSelectors ?? []];
304
692
  for (const selector of allSelectors) {
305
693
  for (const el of contentEl.querySelectorAll(selector)) {
306
694
  el.remove();
307
695
  }
308
696
  }
309
- return cleanMarkdown(turndown.turndown(contentEl.innerHTML), removeTextPatterns);
697
+ flattenTableCellMarkup(contentEl);
698
+ return cleanMarkdown(
699
+ cleanTableRowBreaks(turndown.turndown(contentEl.innerHTML)),
700
+ removeTextPatterns
701
+ );
310
702
  }
311
703
  function extractTitle(html) {
312
704
  const dom = new JSDOM(html);
313
- const titleEl = dom.window.document.querySelector("title");
314
- if (!titleEl) return "Untitled";
315
- return titleEl.textContent?.replace(/\s*[|–—-]\s*.+$/, "").trim() ?? "Untitled";
705
+ const raw = dom.window.document.querySelector("title")?.textContent?.replace(/\s+/g, " ").trim();
706
+ if (!raw) return "Untitled";
707
+ const stripped = raw.replace(/\s*\|[^|]*$/, "").replace(/\s*[\u2013\u2014][^\u2013\u2014]*$/, "").replace(/\s+-\s+(?!.*\s-\s)[^|]*$/, "").trim();
708
+ return stripped || raw;
316
709
  }
317
710
  function buildFrontmatter(source, url, title) {
318
711
  return [
@@ -379,6 +772,13 @@ function buildChunkId(source, url, headingSlug, index) {
379
772
  const base = `${prefix}${truncatedSlug}`;
380
773
  return index !== void 0 ? `${base}-${index}` : base;
381
774
  }
775
/**
 * Builds the text for a chunk: "title > heading > …", a blank line, then the
 * content. A leading heading equal to the title is not repeated; with no
 * context at all the bare content is returned.
 */
function buildEmbedText(chunk) {
  const { title, heading_path: headings, content } = chunk;
  const trail = headings[0] === title ? headings.slice(1) : headings;
  const context = [title, ...trail].filter(Boolean).join(" > ");
  if (!context) return content;
  return `${context}\n\n${content}`;
}
382
782
  function parseHeadingSections(markdown) {
383
783
  const lines = markdown.split("\n");
384
784
  const sections = [];
@@ -431,26 +831,135 @@ function parseHeadingSections(markdown) {
431
831
  }
432
832
  return sections;
433
833
  }
434
- function splitAtParagraphBoundaries(text, maxTokens) {
435
- const paragraphs = text.split(/\n\n+/);
436
- const parts = [];
834
/** True when the first non-whitespace character of the line is "|". */
function isTableLine(line) {
  return /^\s*\|/.test(line);
}
837
/**
 * True when the line consists only of whitespace, pipes, colons and dashes,
 * and contains at least one dash.
 */
function isTableSeparator(line) {
  if (!line.includes("-")) return false;
  return /^\s*\|?[\s:|-]+\|?\s*$/.test(line);
}
840
/**
 * Groups Markdown lines into blocks of kind "paragraph", "table" or "fence".
 *
 * Blank lines end paragraphs, consecutive table lines form one table block,
 * and a fenced code block is kept whole, including blank lines inside it.
 *
 * @param {string[]} lines - Lines of one section body.
 * @returns {{kind: string, lines: string[]}[]} Non-empty blocks in order.
 */
function parseBlocks(lines) {
  const blocks = [];
  let current = [];
  let kind = "paragraph";
  // Opening fence run (e.g. "```" or "~~~~") while inside a fence, else "".
  let openFence = "";

  function flush() {
    while (current.length > 0 && current[current.length - 1].trim() === "") {
      current.pop();
    }
    if (current.length > 0) {
      blocks.push({ kind, lines: current });
    }
    current = [];
    kind = "paragraph";
  }

  for (const line of lines) {
    if (openFence) {
      current.push(line);
      // A closing fence uses the same character, is at least as long as the
      // opening run, and carries no info string. Anything else (a shorter
      // fence, or "```js" inside a longer fence) is code content; closing on
      // those split nested examples into several blocks.
      const closeMatch = line.match(/^\s*(```+|~~~+)\s*$/);
      if (closeMatch && closeMatch[1][0] === openFence[0] && closeMatch[1].length >= openFence.length) {
        openFence = "";
        flush();
      }
      continue;
    }
    const openMatch = line.match(/^\s*(```+|~~~+)/);
    if (openMatch) {
      flush();
      kind = "fence";
      openFence = openMatch[1];
      current.push(line);
      continue;
    }
    if (isTableLine(line)) {
      if (kind !== "table") {
        flush();
        kind = "table";
      }
      current.push(line);
      continue;
    }
    // First non-table line after a table ends the table block.
    if (kind === "table") {
      flush();
    }
    if (line.trim() === "") {
      flush();
    } else {
      current.push(line);
    }
  }
  flush();
  return blocks;
}
895
/**
 * Packs lines greedily into groups whose estimated cost (tokens + 1 per
 * line) stays within `budget`. A single line over budget still gets its own
 * group.
 */
function groupLines(lines, budget) {
  const groups = [];
  let open = [];
  let used = 0;
  lines.forEach((line) => {
    const cost = estimateTokens(line) + 1;
    if (open.length > 0 && used + cost > budget) {
      groups.push(open);
      open = [];
      used = 0;
    }
    open.push(line);
    used += cost;
  });
  if (open.length > 0) groups.push(open);
  return groups;
}
912
/**
 * Splits one oversized block into pieces that each fit `budget`.
 *
 * Tables repeat their header and separator rows in every piece, fences
 * repeat their opening and closing lines, and paragraphs split on line
 * boundaries.
 *
 * @param {{kind: string, lines: string[]}} block - Block to split.
 * @param {number} budget - Token budget per piece.
 * @returns {string[]} Pieces in order. A table with only a header, or a
 *   fence with no body, is returned whole instead of being dropped.
 */
function splitBlock(block, budget) {
  const whole = block.lines.join("\n");
  if (block.kind === "table") {
    const hasHeader = block.lines.length >= 2 && isTableSeparator(block.lines[1]);
    const header = hasHeader ? block.lines.slice(0, 2) : [];
    const rows = hasHeader ? block.lines.slice(2) : block.lines;
    // No data rows: grouping an empty list would silently lose the header.
    if (rows.length === 0) return [whole];
    const headerTokens = estimateTokens(header.join("\n"));
    // Keep at least 50 tokens for rows even when the header is very wide.
    return groupLines(rows, Math.max(budget - headerTokens, 50)).map(
      (group) => [...header, ...group].join("\n")
    );
  }
  if (block.kind === "fence") {
    const opening = block.lines[0];
    const last = block.lines[block.lines.length - 1];
    // Unclosed fence: synthesise a closing line from the opening fence run.
    const closing = /^\s*(```+|~~~+)\s*$/.test(last) ? last : opening.match(/^\s*(```+|~~~+)/)[1];
    const body = block.lines.slice(1, last === closing ? -1 : void 0);
    // Nothing between the fences: keep the block as written.
    if (body.length === 0) return [whole];
    const frameTokens = estimateTokens(opening) + estimateTokens(closing) + 2;
    return groupLines(body, Math.max(budget - frameTokens, 50)).map(
      (group) => [opening, ...group, closing].join("\n")
    );
  }
  return groupLines(block.lines, budget).map((group) => group.join("\n"));
}
933
/**
 * Packs a section's blocks into parts that fit the token budget left after
 * the heading line (never less than 100). Blocks over budget are split with
 * splitBlock(), each piece becoming its own part. Every part is prefixed
 * with `headingLine` and its blocks are joined by blank lines.
 */
function splitSectionIntoParts(blocks, headingLine, maxTokens) {
  const budget = Math.max(maxTokens - estimateTokens(headingLine), 100);
  const parts = [];
  let pending = [];
  let pendingTokens = 0;

  const emitPending = () => {
    if (pending.length === 0) return;
    parts.push(pending);
    pending = [];
    pendingTokens = 0;
  };

  blocks.forEach((block) => {
    const text = block.lines.join("\n");
    // +2 accounts for the blank-line separator between blocks.
    const cost = estimateTokens(text) + 2;
    if (cost > budget) {
      emitPending();
      splitBlock(block, budget).forEach((piece) => parts.push([piece]));
      return;
    }
    if (pendingTokens + cost > budget) emitPending();
    pending.push(text);
    pendingTokens += cost;
  });
  emitPending();

  return parts.map((texts) => headingLine + texts.join("\n\n"));
}
455
964
  function stripFrontmatter(markdown) {
456
965
  if (markdown.startsWith("---")) {
@@ -484,11 +993,11 @@ function chunkMarkdown(markdown, source, url, title) {
484
993
  const headingLine = section.heading ? `${"#".repeat(section.level)} ${section.heading}
485
994
 
486
995
  ` : "";
487
- const content = headingLine + section.lines.join("\n").trim();
996
+ const body = section.lines.join("\n").trim();
997
+ const content = headingLine + body;
488
998
  if (!content.trim()) continue;
489
999
  const headingSlug = section.heading ? slugifyHeading(section.heading) : "intro";
490
- const tokens = estimateTokens(content);
491
- if (tokens <= MAX_TOKENS) {
1000
+ if (estimateTokens(content) <= MAX_TOKENS) {
492
1001
  chunks.push({
493
1002
  id: uniqueId(headingSlug),
494
1003
  source,
@@ -496,29 +1005,56 @@ function chunkMarkdown(markdown, source, url, title) {
496
1005
  title,
497
1006
  heading_path: section.headingPath,
498
1007
  content,
499
- token_count: tokens
1008
+ token_count: estimateTokens(content)
1009
+ });
1010
+ continue;
1011
+ }
1012
+ const blocks = parseBlocks(body.split("\n"));
1013
+ const parts = splitSectionIntoParts(blocks, headingLine, MAX_TOKENS);
1014
+ for (let i = 0; i < parts.length; i++) {
1015
+ const partContent = parts[i].trim();
1016
+ if (!partContent) continue;
1017
+ const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
1018
+ chunks.push({
1019
+ id: uniqueId(partSlug),
1020
+ source,
1021
+ url,
1022
+ title,
1023
+ heading_path: section.headingPath,
1024
+ content: partContent,
1025
+ token_count: estimateTokens(partContent)
500
1026
  });
501
- } else {
502
- const parts = splitAtParagraphBoundaries(content, MAX_TOKENS);
503
- for (let i = 0; i < parts.length; i++) {
504
- const partContent = parts[i].trim();
505
- if (!partContent) continue;
506
- const partSlug = parts.length > 1 ? `${headingSlug}-${i}` : headingSlug;
507
- chunks.push({
508
- id: uniqueId(partSlug),
509
- source,
510
- url,
511
- title,
512
- heading_path: section.headingPath,
513
- content: partContent,
514
- token_count: estimateTokens(partContent)
515
- });
516
- }
517
1027
  }
518
1028
  }
519
1029
  return chunks;
520
1030
  }
521
1031
 
1032
+ // src/tokens.ts
1033
+ import { createHash } from "node:crypto";
1034
// Matches code-like identifiers: names joined by "_", "." or "-",
// camelCase names, and PascalCase names with at least two humps.
const IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
// Identifiers outside this length range are ignored.
const MIN_TOKEN_LENGTH = 4;
const MAX_TOKEN_LENGTH = 80;
// Default cap on distinct identifiers extracted from one text.
const MAX_TOKENS_PER_CHUNK = 100;
1038
/**
 * Prepares text for identifier extraction: drops Markdown link targets,
 * blanks out bare http(s) URLs, and unescapes backslash-escaped Markdown
 * punctuation.
 */
function normalizeForTokens(text) {
  const withoutLinkTargets = text.replace(/\]\([^)]*\)/g, "]");
  const withoutUrls = withoutLinkTargets.replace(/https?:\/\/\S+/g, " ");
  return withoutUrls.replace(/\\([_*[\]()#`~-])/g, "$1");
}
1041
/**
 * Collects distinct lower-cased identifiers from the text, in order of first
 * appearance, skipping those outside the min/max token length. Collection
 * stops once `limit` distinct tokens have been gathered.
 */
function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
  const unique = /* @__PURE__ */ new Set();
  const matches = normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN);
  for (const [identifier] of matches) {
    const token = identifier.toLowerCase();
    const withinBounds = token.length >= MIN_TOKEN_LENGTH && token.length <= MAX_TOKEN_LENGTH;
    if (!withinBounds) continue;
    unique.add(token);
    if (unique.size >= limit) break;
  }
  return Array.from(unique);
}
1051
/** Extracts at most five identifier tokens from a search query. */
function extractQueryTokens(query) {
  const queryTokenLimit = 5;
  return extractIdentifierTokens(query, queryTokenLimit);
}
1054
/** Returns the SHA-256 digest of the text as a lowercase hex string. */
function contentHash(text) {
  const hasher = createHash("sha256");
  hasher.update(text);
  return hasher.digest("hex");
}
1057
+
522
1058
  // src/embedder.ts
523
1059
  import { GoogleGenerativeAI } from "@google/generative-ai";
524
1060
  var BATCH_SIZE = 50;
@@ -528,7 +1064,6 @@ var MAX_RETRIES = 5;
528
1064
  var RATE_LIMIT_BASE_DELAY_MS = 6e4;
529
1065
  var NETWORK_BASE_DELAY_MS = 1e4;
530
1066
  var BATCH_DELAY_MS = 2500;
531
- var DEFAULT_CHECKPOINT_EVERY_BATCHES = 20;
532
1067
  var NETWORK_ERROR_PATTERNS = [
533
1068
  "fetch failed",
534
1069
  "ECONNRESET",
@@ -562,22 +1097,10 @@ function classifyError(message) {
562
1097
  async function embedTexts(texts, options = {}) {
563
1098
  const client = getClient();
564
1099
  const model = client.getGenerativeModel({ model: MODEL });
565
- const { onProgress, onCheckpoint, resumeFrom } = options;
566
- const checkpointEveryBatches = options.checkpointEveryBatches ?? DEFAULT_CHECKPOINT_EVERY_BATCHES;
1100
+ const { onProgress } = options;
567
1101
  const maxRetries = options.maxRetries ?? MAX_RETRIES;
568
- const embeddings = resumeFrom ? [...resumeFrom] : [];
569
- const startIndex = Math.floor(embeddings.length / BATCH_SIZE) * BATCH_SIZE;
570
- if (embeddings.length > startIndex) {
571
- embeddings.length = startIndex;
572
- }
573
- if (startIndex > 0) {
574
- console.log(` Resuming from chunk ${startIndex} of ${texts.length} (${embeddings.length} cached).`);
575
- }
576
- if (startIndex >= texts.length) {
577
- return embeddings.slice(0, texts.length);
578
- }
579
- let batchesSinceCheckpoint = 0;
580
- for (let i = startIndex; i < texts.length; i += BATCH_SIZE) {
1102
+ const embeddings = [];
1103
+ for (let i = 0; i < texts.length; i += BATCH_SIZE) {
581
1104
  const batch = texts.slice(i, i + BATCH_SIZE);
582
1105
  const batchNumber = i / BATCH_SIZE + 1;
583
1106
  let result;
@@ -609,18 +1132,10 @@ async function embedTexts(texts, options = {}) {
609
1132
  embeddings.push(embedding.values);
610
1133
  }
611
1134
  onProgress?.(Math.min(i + BATCH_SIZE, texts.length), texts.length);
612
- batchesSinceCheckpoint++;
613
- if (onCheckpoint && batchesSinceCheckpoint >= checkpointEveryBatches && i + BATCH_SIZE < texts.length) {
614
- await onCheckpoint(embeddings);
615
- batchesSinceCheckpoint = 0;
616
- }
617
1135
  if (i + BATCH_SIZE < texts.length) {
618
1136
  await new Promise((resolve2) => setTimeout(resolve2, BATCH_DELAY_MS));
619
1137
  }
620
1138
  }
621
- if (onCheckpoint) {
622
- await onCheckpoint(embeddings);
623
- }
624
1139
  return embeddings;
625
1140
  }
626
1141
  async function embedText(text) {
@@ -701,6 +1216,8 @@ async function storeChunks(chunks, embeddings, onProgress) {
701
1216
  heading_path: chunk.heading_path,
702
1217
  content: chunk.content,
703
1218
  token_count: chunk.token_count,
1219
+ tokens: extractIdentifierTokens(chunk.content),
1220
+ content_hash: contentHash(buildEmbedText(chunk)),
704
1221
  embedded_at: (/* @__PURE__ */ new Date()).toISOString(),
705
1222
  embedding: FieldValue.vector(embSlice[j])
706
1223
  });
@@ -767,10 +1284,38 @@ async function deleteChunksByIds(ids, onProgress) {
767
1284
  onProgress?.(Math.min(i + BATCH_SIZE2, ids.length), ids.length);
768
1285
  }
769
1286
  }
770
- async function getSourceChunkIds(sourceName) {
1287
/**
 * Loads the stored content hash of every chunk belonging to a source.
 * Returns a Map from document id to `content_hash`, with "" for documents
 * that have none.
 */
async function getSourceChunkHashes(sourceName) {
  const query = chunksCol().where("source", "==", sourceName).select("content_hash");
  const { docs } = await query.get();
  const hashes = new Map();
  for (const doc of docs) {
    hashes.set(doc.id, doc.data().content_hash ?? "");
  }
  return hashes;
}
1294
+ var TOKEN_QUERY_LIMIT = 100;
1295
/**
 * Looks up chunks by identifier token. One `array-contains` query is issued
 * per token, optionally restricted to `source`, all in parallel. Results are
 * merged by document id and sorted by the number of tokens that matched,
 * highest first.
 */
async function tokenSearch(tokens, source) {
  if (tokens.length === 0) return [];
  const col = chunksCol();
  const runQuery = (token) => {
    const byToken = col.where("tokens", "array-contains", token);
    const scoped = source ? byToken.where("source", "==", source) : byToken;
    return scoped.limit(TOKEN_QUERY_LIMIT).get();
  };
  const snapshots = await Promise.all(tokens.map((token) => runQuery(token)));

  const hits = /* @__PURE__ */ new Map();
  for (const { docs } of snapshots) {
    for (const doc of docs) {
      const known = hits.get(doc.id);
      if (known) {
        known.matchedTokens += 1;
        continue;
      }
      hits.set(doc.id, { id: doc.id, data: doc.data(), matchedTokens: 1 });
    }
  }
  return Array.from(hits.values()).sort((a, b) => b.matchedTokens - a.matchedTokens);
}
775
1320
  async function vectorSearch(queryEmbedding, limit, source) {
776
1321
  const col = chunksCol();
@@ -816,47 +1361,92 @@ async function rerank(query, documents, topN = 5) {
816
1361
  }
817
1362
 
818
1363
  // src/search.ts
1364
+ var DEFAULT_CANDIDATES = 50;
1365
+ var RRF_K = 60;
819
1366
  function hasReranker() {
820
1367
  return !!process.env.RERANKER_URL;
821
1368
  }
1369
+ function isVectorLike(value) {
1370
+ return typeof value === "object" && value !== null && typeof value.toArray === "function";
1371
+ }
1372
+ function cosineSimilarity(a, b) {
1373
+ let dot = 0;
1374
+ let normA = 0;
1375
+ let normB = 0;
1376
+ for (let i = 0; i < a.length; i++) {
1377
+ dot += a[i] * b[i];
1378
+ normA += a[i] * a[i];
1379
+ normB += b[i] * b[i];
1380
+ }
1381
+ return dot / (Math.sqrt(normA) * Math.sqrt(normB));
1382
+ }
1383
+ function contextualText(data) {
1384
+ return buildEmbedText({
1385
+ title: data.title,
1386
+ heading_path: data.heading_path,
1387
+ content: data.content
1388
+ });
1389
+ }
1390
+ function toSearchResult(candidate, relevance) {
1391
+ const data = candidate.data;
1392
+ return {
1393
+ id: candidate.id,
1394
+ source: data.source,
1395
+ url: data.url,
1396
+ title: data.title,
1397
+ heading_path: data.heading_path,
1398
+ content: data.content,
1399
+ relevance_score: relevance
1400
+ };
1401
+ }
822
1402
  async function search(query, options = {}) {
823
- const { source, candidates = 20, topN = 5 } = options;
1403
+ const { source, candidates = DEFAULT_CANDIDATES, topN = 5 } = options;
824
1404
  const queryEmbedding = await embedText(query);
825
- const rawResults = await vectorSearch(queryEmbedding, candidates, source);
826
- if (rawResults.length === 0) return [];
1405
+ const [vectorResults, lexicalHits] = await Promise.all([
1406
+ vectorSearch(queryEmbedding, candidates, source),
1407
+ tokenSearch(extractQueryTokens(query), source)
1408
+ ]);
1409
+ const pool = /* @__PURE__ */ new Map();
1410
+ vectorResults.forEach((result, rank) => {
1411
+ pool.set(result.id, {
1412
+ id: result.id,
1413
+ data: result.data,
1414
+ fusedScore: 1 / (RRF_K + rank + 1),
1415
+ similarity: 1 - result.distance
1416
+ });
1417
+ });
1418
+ lexicalHits.forEach((hit, rank) => {
1419
+ const lexicalScore = 1 / (RRF_K + rank + 1);
1420
+ const existing = pool.get(hit.id);
1421
+ if (existing) {
1422
+ existing.fusedScore += lexicalScore;
1423
+ return;
1424
+ }
1425
+ const embedding = hit.data.embedding;
1426
+ delete hit.data.embedding;
1427
+ pool.set(hit.id, {
1428
+ id: hit.id,
1429
+ data: hit.data,
1430
+ fusedScore: lexicalScore,
1431
+ similarity: isVectorLike(embedding) ? cosineSimilarity(queryEmbedding, embedding.toArray()) : null
1432
+ });
1433
+ });
1434
+ const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
1435
+ if (fused.length === 0) return [];
827
1436
  if (hasReranker()) {
828
- const documents = rawResults.map((r) => r.data.content);
1437
+ const rerankPool = fused.slice(0, candidates);
1438
+ const documents = rerankPool.map((c) => contextualText(c.data));
829
1439
  const reranked = await rerank(query, documents, topN);
830
- return reranked.map((r) => {
831
- const original = rawResults[r.index];
832
- const data = original.data;
833
- return {
834
- id: original.id,
835
- source: data.source,
836
- url: data.url,
837
- title: data.title,
838
- heading_path: data.heading_path,
839
- content: data.content,
840
- relevance_score: r.relevance_score
841
- };
842
- });
1440
+ return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
843
1441
  }
844
- return rawResults.slice(0, topN).map((r) => {
845
- const data = r.data;
846
- return {
847
- id: r.id,
848
- source: data.source,
849
- url: data.url,
850
- title: data.title,
851
- heading_path: data.heading_path,
852
- content: data.content,
853
- relevance_score: Math.max(0, 1 - r.distance / 2)
854
- };
1442
+ return fused.slice(0, topN).map((candidate) => {
1443
+ const similarity = candidate.similarity ?? 0;
1444
+ return toSearchResult(candidate, Math.max(0, (1 + similarity) / 2));
855
1445
  });
856
1446
  }
857
1447
 
858
1448
  // src/apikey.ts
859
- import { randomBytes, createHash } from "node:crypto";
1449
+ import { randomBytes, createHash as createHash2 } from "node:crypto";
860
1450
  import {
861
1451
  getFirestore as getFirestore2
862
1452
  } from "firebase-admin/firestore";
@@ -872,7 +1462,7 @@ function getDb2() {
872
1462
  return db2;
873
1463
  }
874
1464
  function hashKey(key) {
875
- return createHash("sha256").update(key).digest("hex");
1465
+ return createHash2("sha256").update(key).digest("hex");
876
1466
  }
877
1467
  function apiKeysCol() {
878
1468
  return getDb2().collection("grimoire_api_keys");
@@ -1166,89 +1756,85 @@ Source "${name}" added to config/sources.yaml`);
1166
1756
  await browser.close();
1167
1757
  }
1168
1758
  }
1169
- var EMBEDDINGS_PART_SIZE = 1e4;
1170
- function partPath(cachePath, index) {
1171
- return `${cachePath}.part-${String(index).padStart(4, "0")}`;
1172
- }
1173
- async function readPartFiles(cachePath) {
1174
- const dir = cachePath.slice(0, cachePath.lastIndexOf("/"));
1175
- const base = cachePath.slice(cachePath.lastIndexOf("/") + 1);
1176
- const entries = await readdir(dir).catch(() => []);
1177
- return entries.filter((e) => e.startsWith(`${base}.part-`)).sort().map((e) => join4(dir, e));
1178
- }
1179
- async function loadEmbeddingsCache(cachePath) {
1180
- const parts = await readPartFiles(cachePath);
1181
- if (parts.length > 0) {
1182
- const all = [];
1183
- for (const p of parts) {
1184
- const data = await readFile3(p, "utf-8");
1185
- const arr = JSON.parse(data);
1186
- for (const row of arr) all.push(row);
1187
- }
1188
- return all;
1759
+ var EMBED_WINDOW = 1e3;
1760
+ async function syncChunks(sourceName, allChunks, urlCount, version) {
1761
+ console.log(" Comparing with Firestore...");
1762
+ const existing = await getSourceChunkHashes(sourceName);
1763
+ const currentIds = new Set(allChunks.map((c) => c.id));
1764
+ const toDelete = [...existing.keys()].filter((id) => !currentIds.has(id));
1765
+ const toEmbed = allChunks.filter(
1766
+ (chunk) => existing.get(chunk.id) !== contentHash(buildEmbedText(chunk))
1767
+ );
1768
+ console.log(
1769
+ ` Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
1770
+ );
1771
+ if (toDelete.length > 0) {
1772
+ await deleteChunksByIds(toDelete, (cur, total) => {
1773
+ console.log(` [${cur}/${total}] deleted`);
1774
+ });
1189
1775
  }
1190
- try {
1191
- const data = await readFile3(cachePath, "utf-8");
1192
- return JSON.parse(data);
1193
- } catch {
1194
- return null;
1776
+ for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
1777
+ const window = toEmbed.slice(i, i + EMBED_WINDOW);
1778
+ const embeddings = await embedTexts(window.map((c) => buildEmbedText(c)), {
1779
+ onProgress: (done) => {
1780
+ console.log(` [${i + done}/${toEmbed.length}] embedded`);
1781
+ }
1782
+ });
1783
+ await storeChunks(window, embeddings, (cur) => {
1784
+ console.log(` [${i + cur}/${toEmbed.length}] stored`);
1785
+ });
1195
1786
  }
1787
+ await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
1788
+ console.log(` Done. ${allChunks.length} chunks live for "${sourceName}".`);
1196
1789
  }
1197
- async function saveEmbeddingsCache(cachePath, embeddings) {
1198
- await unlink(cachePath).catch(() => {
1199
- });
1200
- const existingParts = await readPartFiles(cachePath);
1201
- for (const p of existingParts) {
1202
- await unlink(p).catch(() => {
1790
+ async function readCachedMarkdownPages(mdDir) {
1791
+ const mdFiles = await readdir(mdDir).catch(() => []);
1792
+ const pages = [];
1793
+ for (const f of mdFiles.filter((f2) => f2.endsWith(".md"))) {
1794
+ const content = await readFile3(join4(mdDir, f), "utf-8");
1795
+ const urlMatch = content.match(/^url: "(.+)"$/m);
1796
+ const titleMatch = content.match(/^title: "(.+)"$/m);
1797
+ if (!urlMatch) {
1798
+ console.warn(` WARNING: ${f} has no url in frontmatter, skipping page.`);
1799
+ continue;
1800
+ }
1801
+ pages.push({
1802
+ markdown: content,
1803
+ url: urlMatch[1],
1804
+ title: titleMatch?.[1] ?? "Untitled"
1203
1805
  });
1204
1806
  }
1205
- for (let i = 0; i < embeddings.length; i += EMBEDDINGS_PART_SIZE) {
1206
- const part = embeddings.slice(i, i + EMBEDDINGS_PART_SIZE);
1207
- const index = Math.floor(i / EMBEDDINGS_PART_SIZE);
1208
- await writeFile4(partPath(cachePath, index), JSON.stringify(part), "utf-8");
1209
- }
1210
- }
1211
- async function embedWithCheckpoint(texts, rawDir, embeddingsCachePath) {
1212
- await mkdir4(rawDir, { recursive: true });
1213
- const partialCache = await loadEmbeddingsCache(embeddingsCachePath);
1214
- const resumeFrom = partialCache && partialCache.length > 0 && partialCache.length < texts.length ? partialCache : void 0;
1215
- return embedTexts(texts, {
1216
- onProgress: (done, total) => {
1217
- console.log(` [${done}/${total}] embedded`);
1218
- },
1219
- onCheckpoint: async (current) => {
1220
- await saveEmbeddingsCache(embeddingsCachePath, current);
1221
- },
1222
- resumeFrom
1223
- });
1807
+ return pages;
1224
1808
  }
1225
- async function storeWithStrategy(sourceName, allChunks, embeddings, urlCount, version, diff) {
1226
- if (diff) {
1227
- console.log(" Computing diff...");
1228
- const existingIds = await getSourceChunkIds(sourceName);
1229
- const newIds = new Set(allChunks.map((c) => c.id));
1230
- const toDelete = [...existingIds].filter((id) => !newIds.has(id));
1231
- console.log(` Diff: ${toDelete.length} to delete, ${allChunks.length} to upsert (${existingIds.size} existing)`);
1232
- if (toDelete.length > 0) {
1233
- console.log(" Deleting removed chunks...");
1234
- await deleteChunksByIds(toDelete, (cur, total) => {
1235
- console.log(` [${cur}/${total}] deleted`);
1236
- });
1809
+ async function recoverUrlsFromHtml(rawDir) {
1810
+ const urlsJsonPath = join4(rawDir, "urls.json");
1811
+ try {
1812
+ return JSON.parse(await readFile3(urlsJsonPath, "utf-8"));
1813
+ } catch {
1814
+ const rawFiles = await readdir(rawDir);
1815
+ const urls = [];
1816
+ let skipped = 0;
1817
+ for (const f of rawFiles.filter((f2) => f2.endsWith(".html"))) {
1818
+ const fileSlug = f.replace(/\.html$/, "");
1819
+ const html = await readFile3(join4(rawDir, f), "utf-8");
1820
+ const match = html.match(/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/);
1821
+ if (match && slugifyUrl(match[1]) === fileSlug) {
1822
+ urls.push(match[1]);
1823
+ continue;
1824
+ }
1825
+ const ogMatch = html.match(/<meta[^>]+property="og:url"[^>]+content="([^"]+)"/);
1826
+ if (ogMatch && slugifyUrl(ogMatch[1]) === fileSlug) {
1827
+ urls.push(ogMatch[1]);
1828
+ continue;
1829
+ }
1830
+ console.warn(` WARNING: cannot recover URL for ${f}, skipping page.`);
1831
+ skipped++;
1237
1832
  }
1238
- console.log(" Upserting chunks...");
1239
- await storeChunks(allChunks, embeddings, (cur, total) => {
1240
- console.log(` [${cur}/${total}] stored`);
1241
- });
1242
- } else {
1243
- console.log(" Purging old chunks...");
1244
- await purgeSource(sourceName);
1245
- console.log(" Storing in Firestore...");
1246
- await storeChunks(allChunks, embeddings, (cur, total) => {
1247
- console.log(` [${cur}/${total}] stored`);
1248
- });
1833
+ if (skipped > 0) {
1834
+ console.warn(` Skipped ${skipped} pages with unrecoverable URLs. Provide urls.json to include them.`);
1835
+ }
1836
+ return urls;
1249
1837
  }
1250
- await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
1251
- console.log(` Done. ${allChunks.length} chunks stored for "${sourceName}".`);
1252
1838
  }
1253
1839
  async function cmdRefresh() {
1254
1840
  const args = parseArgs({
@@ -1256,12 +1842,10 @@ async function cmdRefresh() {
1256
1842
  options: {
1257
1843
  full: { type: "boolean", default: false },
1258
1844
  all: { type: "boolean", default: false },
1259
- diff: { type: "boolean", default: false },
1260
1845
  concurrency: { type: "string" },
1261
1846
  limit: { type: "string" },
1262
1847
  "from-html": { type: "boolean", default: false },
1263
1848
  "from-markdown": { type: "boolean", default: false },
1264
- "from-embeddings": { type: "boolean", default: false },
1265
1849
  "skip-store": { type: "boolean", default: false }
1266
1850
  },
1267
1851
  allowPositionals: true
@@ -1269,7 +1853,7 @@ async function cmdRefresh() {
1269
1853
  const config = await loadConfig(CONFIG_PATH);
1270
1854
  const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
1271
1855
  if (!args.values.all && !sourcesToRefresh[0]) {
1272
- console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--from-embeddings] [--skip-store] [--limit <n>] [--concurrency <n>]");
1856
+ console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
1273
1857
  process.exit(1);
1274
1858
  }
1275
1859
  const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
@@ -1285,7 +1869,6 @@ async function cmdRefresh() {
1285
1869
  }
1286
1870
  const rawDir = join4(DATA_DIR, "raw", sourceName);
1287
1871
  const mdDir = join4(DATA_DIR, "markdown", sourceName);
1288
- const embeddingsCachePath = join4(rawDir, "embeddings.json");
1289
1872
  console.log(`
1290
1873
  Refreshing "${sourceName}"...`);
1291
1874
  if (args.values.full) {
@@ -1295,75 +1878,16 @@ Refreshing "${sourceName}"...`);
1295
1878
  await rm(rawDir, { recursive: true, force: true });
1296
1879
  await rm(mdDir, { recursive: true, force: true });
1297
1880
  }
1298
- let urls;
1299
- if (args.values["from-embeddings"]) {
1300
- console.log(" Loading cached embeddings...");
1301
- const cached = await loadEmbeddingsCache(embeddingsCachePath);
1302
- if (!cached) {
1303
- console.error(" No cached embeddings found. Run without --from-embeddings first.");
1304
- process.exit(1);
1305
- }
1306
- const mdFiles = await readdir(mdDir);
1307
- const allPages = [];
1308
- for (const f of mdFiles.filter((f2) => f2.endsWith(".md"))) {
1309
- const content = await readFile3(join4(mdDir, f), "utf-8");
1310
- const urlMatch = content.match(/^url: "(.+)"$/m);
1311
- const titleMatch = content.match(/^title: "(.+)"$/m);
1312
- allPages.push({
1313
- markdown: content,
1314
- url: urlMatch?.[1] ?? "",
1315
- title: titleMatch?.[1] ?? "Untitled"
1316
- });
1317
- }
1318
- console.log(" Chunking...");
1319
- const allChunks2 = allPages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
1320
- console.log(` Created ${allChunks2.length} chunks.`);
1321
- if (cached.length !== allChunks2.length) {
1322
- console.error(` Embeddings cache (${cached.length}) doesn't match chunk count (${allChunks2.length}). Re-embed with --from-html.`);
1323
- process.exit(1);
1324
- }
1325
- if (args.values["skip-store"]) {
1326
- console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
1327
- continue;
1328
- }
1329
- await storeWithStrategy(sourceName, allChunks2, cached, allPages.length, source.version, args.values.diff);
1330
- continue;
1331
- }
1881
+ let pages;
1332
1882
  if (args.values["from-markdown"]) {
1333
1883
  console.log(" Reading cached markdown...");
1334
- const mdFiles = await readdir(mdDir).catch(() => []);
1335
- const markdownFiles = mdFiles.filter((f) => f.endsWith(".md"));
1336
- if (markdownFiles.length === 0) {
1884
+ pages = await readCachedMarkdownPages(mdDir);
1885
+ if (pages.length === 0) {
1337
1886
  console.error(" No cached markdown found. Run with --from-html first.");
1338
1887
  process.exit(1);
1339
1888
  }
1340
- const pages2 = [];
1341
- for (const f of markdownFiles) {
1342
- const content = await readFile3(join4(mdDir, f), "utf-8");
1343
- const urlMatch = content.match(/^url: "(.+)"$/m);
1344
- const titleMatch = content.match(/^title: "(.+)"$/m);
1345
- pages2.push({
1346
- markdown: content,
1347
- url: urlMatch?.[1] ?? "",
1348
- title: titleMatch?.[1] ?? "Untitled"
1349
- });
1350
- }
1351
- console.log(` Found ${pages2.length} cached pages.`);
1352
- console.log(" Chunking...");
1353
- const allChunks2 = pages2.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
1354
- console.log(` Created ${allChunks2.length} chunks.`);
1355
- console.log(" Embedding chunks...");
1356
- const texts2 = allChunks2.map((c) => c.content);
1357
- const embeddings2 = await embedWithCheckpoint(texts2, rawDir, embeddingsCachePath);
1358
- if (args.values["skip-store"]) {
1359
- console.log(` Done. ${allChunks2.length} chunks ready (skipped Firestore).`);
1360
- continue;
1361
- }
1362
- await storeWithStrategy(sourceName, allChunks2, embeddings2, pages2.length, source.version, args.values.diff);
1363
- continue;
1364
- }
1365
- let pages;
1366
- if (source.llms_full_url && !args.values["from-html"]) {
1889
+ console.log(` Found ${pages.length} cached pages.`);
1890
+ } else if (source.llms_full_url && !args.values["from-html"]) {
1367
1891
  console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
1368
1892
  pages = await ingestLlmsFull(
1369
1893
  source.llms_full_url,
@@ -1376,32 +1900,10 @@ Refreshing "${sourceName}"...`);
1376
1900
  );
1377
1901
  console.log(` Extracted ${pages.length} pages.`);
1378
1902
  } else {
1903
+ let urls;
1379
1904
  if (args.values["from-html"]) {
1380
1905
  console.log(" Reading URLs from cached HTML...");
1381
- const urlsJsonPath = join4(rawDir, "urls.json");
1382
- try {
1383
- urls = JSON.parse(await readFile3(urlsJsonPath, "utf-8"));
1384
- } catch {
1385
- const rawFiles = await readdir(rawDir);
1386
- const htmlFiles = rawFiles.filter((f) => f.endsWith(".html"));
1387
- urls = [];
1388
- for (const f of htmlFiles) {
1389
- const fileSlug = f.replace(/\.html$/, "");
1390
- const htmlPath = join4(rawDir, f);
1391
- const html = await readFile3(htmlPath, "utf-8");
1392
- const match = html.match(/<link[^>]+rel="canonical"[^>]+href="([^"]+)"/);
1393
- if (match && slugifyUrl(match[1]) === fileSlug) {
1394
- urls.push(match[1]);
1395
- continue;
1396
- }
1397
- const ogMatch = html.match(/<meta[^>]+property="og:url"[^>]+content="([^"]+)"/);
1398
- if (ogMatch && slugifyUrl(ogMatch[1]) === fileSlug) {
1399
- urls.push(ogMatch[1]);
1400
- continue;
1401
- }
1402
- urls.push(`https://recovered/${fileSlug}`);
1403
- }
1404
- }
1906
+ urls = await recoverUrlsFromHtml(rawDir);
1405
1907
  console.log(` Found ${urls.length} cached pages.`);
1406
1908
  } else {
1407
1909
  console.log(" Scraping URLs...");
@@ -1431,14 +1933,11 @@ Refreshing "${sourceName}"...`);
1431
1933
  console.log(" Chunking...");
1432
1934
  const allChunks = pages.flatMap((p) => chunkMarkdown(p.markdown, sourceName, p.url, p.title));
1433
1935
  console.log(` Created ${allChunks.length} chunks.`);
1434
- console.log(" Embedding chunks...");
1435
- const texts = allChunks.map((c) => c.content);
1436
- const embeddings = await embedWithCheckpoint(texts, rawDir, embeddingsCachePath);
1437
1936
  if (args.values["skip-store"]) {
1438
- console.log(` Done. ${allChunks.length} chunks ready (skipped Firestore).`);
1937
+ console.log(` Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
1439
1938
  continue;
1440
1939
  }
1441
- await storeWithStrategy(sourceName, allChunks, embeddings, pages.length, source.version, args.values.diff);
1940
+ await syncChunks(sourceName, allChunks, pages.length, source.version);
1442
1941
  }
1443
1942
  }
1444
1943
  async function cmdSearch() {
@@ -1447,17 +1946,19 @@ async function cmdSearch() {
1447
1946
  options: {
1448
1947
  source: { type: "string" },
1449
1948
  top: { type: "string" },
1949
+ candidates: { type: "string" },
1450
1950
  compact: { type: "boolean", default: false }
1451
1951
  },
1452
1952
  allowPositionals: true
1453
1953
  });
1454
1954
  const query = args.positionals.join(" ");
1455
1955
  if (!query) {
1456
- console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--compact]');
1956
+ console.error('Usage: grimoire search "<query>" [--source <name>] [--top <n>] [--candidates <n>] [--compact]');
1457
1957
  process.exit(1);
1458
1958
  }
1459
1959
  const topN = args.values.top ? parseInt(args.values.top, 10) : void 0;
1460
- const results = await search(query, { source: args.values.source, topN });
1960
+ const candidates = args.values.candidates ? parseInt(args.values.candidates, 10) : void 0;
1961
+ const results = await search(query, { source: args.values.source, topN, candidates });
1461
1962
  if (results.length === 0) {
1462
1963
  console.log("No results found.");
1463
1964
  return;
@@ -1644,4 +2145,4 @@ var ADMIN_COMMANDS = {
1644
2145
  export {
1645
2146
  ADMIN_COMMANDS
1646
2147
  };
1647
- //# sourceMappingURL=admin-AKV4CA2O.js.map
2148
+ //# sourceMappingURL=admin-YF2OKHEQ.js.map