@swarmvaultai/engine 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1729,7 +1729,7 @@ import matter3 from "gray-matter";
1729
1729
  import ignore from "ignore";
1730
1730
  import { JSDOM as JSDOM2 } from "jsdom";
1731
1731
  import mime from "mime-types";
1732
- import TurndownService from "turndown";
1732
+ import TurndownService2 from "turndown";
1733
1733
 
1734
1734
  // src/code-analysis.ts
1735
1735
  import fs6 from "fs/promises";
@@ -4481,9 +4481,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
4481
4481
  const language = manifest.language ?? inferCodeLanguage(manifest.originalPath ?? manifest.storedPath, manifest.mimeType) ?? "typescript";
4482
4482
  const { code, rationales } = language === "javascript" || language === "jsx" || language === "typescript" || language === "tsx" ? analyzeTypeScriptLikeCode(manifest, extractedText) : await analyzeTreeSitterCode(manifest, extractedText, language);
4483
4483
  return {
4484
- analysisVersion: 6,
4484
+ analysisVersion: 7,
4485
4485
  sourceId: manifest.sourceId,
4486
4486
  sourceHash: manifest.contentHash,
4487
+ semanticHash: manifest.semanticHash,
4487
4488
  extractionHash: manifest.extractionHash,
4488
4489
  schemaHash,
4489
4490
  title: manifest.title,
@@ -4503,8 +4504,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
4503
4504
  import fs7 from "fs/promises";
4504
4505
  import os from "os";
4505
4506
  import path7 from "path";
4507
+ import { parse as parseCsvSync } from "csv-parse/sync";
4506
4508
  import { strFromU8, unzipSync } from "fflate";
4507
4509
  import { JSDOM } from "jsdom";
4510
+ import TurndownService from "turndown";
4508
4511
  import { z } from "zod";
4509
4512
  var imageVisionExtractionSchema = z.object({
4510
4513
  title: z.string().min(1).nullable().optional(),
@@ -4684,7 +4687,7 @@ function normalizePdfMetadata(raw) {
4684
4687
  function normalizeDocumentText(raw) {
4685
4688
  return raw.replace(/\r\n/g, "\n").split(/\n{2,}/).map((section) => normalizeWhitespace(section)).filter(Boolean).join("\n\n").trim();
4686
4689
  }
4687
- function parseDocxCoreMetadata(bytes) {
4690
+ function parseOfficeCoreMetadata(bytes) {
4688
4691
  try {
4689
4692
  const archive = unzipSync(new Uint8Array(bytes));
4690
4693
  const coreXml = archive["docProps/core.xml"];
@@ -4724,6 +4727,122 @@ function parseDocxCoreMetadata(bytes) {
4724
4727
  return void 0;
4725
4728
  }
4726
4729
  }
4730
+ function decodeTextBytes(bytes) {
4731
+ const text = bytes.toString("utf8");
4732
+ return text.charCodeAt(0) === 65279 ? text.slice(1) : text;
4733
+ }
4734
+ function normalizeTableCell(value) {
4735
+ return normalizeWhitespace(String(value ?? ""));
4736
+ }
4737
+ function isNumericCell(value) {
4738
+ return value.length > 0 && Number.isFinite(Number(value));
4739
+ }
4740
+ function detectHeaderRow(rows) {
4741
+ if (!rows.length) {
4742
+ return { headers: [], bodyRows: [] };
4743
+ }
4744
+ const firstRow = rows[0] ?? [];
4745
+ const nonEmpty = firstRow.filter(Boolean);
4746
+ const unique = new Set(nonEmpty);
4747
+ const nonNumeric = nonEmpty.filter((value) => !isNumericCell(value));
4748
+ const looksLikeHeader = nonEmpty.length > 0 && unique.size === nonEmpty.length && nonNumeric.length >= Math.ceil(nonEmpty.length / 2) && rows.length > 1;
4749
+ if (looksLikeHeader) {
4750
+ return {
4751
+ headers: firstRow.map((value, index) => value || `column_${index + 1}`),
4752
+ bodyRows: rows.slice(1)
4753
+ };
4754
+ }
4755
+ const columnCount = Math.max(...rows.map((row) => row.length), 0);
4756
+ return {
4757
+ headers: Array.from({ length: columnCount }, (_, index) => `column_${index + 1}`),
4758
+ bodyRows: rows
4759
+ };
4760
+ }
4761
+ function columnHints(headers, rows) {
4762
+ return headers.map((header, index) => {
4763
+ const values = rows.map((row) => row[index] ?? "").map(normalizeTableCell).filter(Boolean);
4764
+ if (!values.length) {
4765
+ return null;
4766
+ }
4767
+ const uniqueValues = [...new Set(values)];
4768
+ if (values.every(isNumericCell)) {
4769
+ return `- ${header}: numeric`;
4770
+ }
4771
+ if (uniqueValues.length <= 6 && values.length >= uniqueValues.length) {
4772
+ return `- ${header}: low-cardinality (${uniqueValues.slice(0, 6).join(", ")})`;
4773
+ }
4774
+ return null;
4775
+ }).filter((item) => Boolean(item));
4776
+ }
4777
+ function markdownTable(headers, rows, rowLimit = 20) {
4778
+ if (!headers.length) {
4779
+ return ["No tabular preview available."];
4780
+ }
4781
+ const width = headers.length;
4782
+ const lines = [`| ${headers.join(" | ")} |`, `| ${headers.map(() => "---").join(" | ")} |`];
4783
+ for (const row of rows.slice(0, rowLimit)) {
4784
+ const normalized = Array.from({ length: width }, (_, index) => normalizeTableCell(row[index] ?? ""));
4785
+ lines.push(`| ${normalized.join(" | ")} |`);
4786
+ }
4787
+ return lines;
4788
+ }
4789
+ function zipEntryText(archive, entryPath) {
4790
+ const entry = archive[entryPath];
4791
+ return entry ? strFromU8(entry) : void 0;
4792
+ }
4793
+ function parseXmlDocument(xml) {
4794
+ return new JSDOM(xml, { contentType: "text/xml" }).window.document;
4795
+ }
4796
+ function zipDirname(value) {
4797
+ const index = value.lastIndexOf("/");
4798
+ return index === -1 ? "" : value.slice(0, index);
4799
+ }
4800
+ function resolveZipTarget(basePath, target) {
4801
+ return path7.posix.normalize(path7.posix.join(zipDirname(basePath), target));
4802
+ }
4803
+ function relationshipTargets(xml, basePath) {
4804
+ const document = parseXmlDocument(xml);
4805
+ const map = /* @__PURE__ */ new Map();
4806
+ for (const node of Array.from(document.getElementsByTagName("*"))) {
4807
+ if (node.localName !== "Relationship") {
4808
+ continue;
4809
+ }
4810
+ const id = node.getAttribute("Id")?.trim();
4811
+ const target = node.getAttribute("Target")?.trim();
4812
+ const type = node.getAttribute("Type")?.trim() ?? "";
4813
+ if (!id || !target) {
4814
+ continue;
4815
+ }
4816
+ map.set(id, { target: resolveZipTarget(basePath, target), type });
4817
+ }
4818
+ return map;
4819
+ }
4820
+ function xmlTextNodes(xml, localName) {
4821
+ const document = parseXmlDocument(xml);
4822
+ const values = [];
4823
+ for (const node of Array.from(document.getElementsByTagName("*"))) {
4824
+ if (node.localName !== localName) {
4825
+ continue;
4826
+ }
4827
+ const text = normalizeWhitespace(node.textContent ?? "");
4828
+ if (text) {
4829
+ values.push(text);
4830
+ }
4831
+ }
4832
+ return values;
4833
+ }
4834
+ function firstHtmlHeading(html) {
4835
+ const dom = new JSDOM(html);
4836
+ const heading = dom.window.document.querySelector("h1, h2, h3");
4837
+ const title = normalizeWhitespace(heading?.textContent ?? "");
4838
+ return title || void 0;
4839
+ }
4840
+ function htmlToMarkdown(html) {
4841
+ const dom = new JSDOM(html);
4842
+ const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
4843
+ const body = dom.window.document.body?.innerHTML ?? html;
4844
+ return turndown.turndown(body).trim();
4845
+ }
4727
4846
  async function extractPdfText(input) {
4728
4847
  try {
4729
4848
  const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
@@ -4781,7 +4900,7 @@ async function extractDocxText(input) {
4781
4900
  const warnings = result.messages.map((message) => normalizeWhitespace(message.message)).filter(Boolean).map((message) => truncate(message, 240));
4782
4901
  const artifact = {
4783
4902
  ...extractionMetadata("docx", input.mimeType, "docx_text"),
4784
- metadata: parseDocxCoreMetadata(input.bytes),
4903
+ metadata: parseOfficeCoreMetadata(input.bytes),
4785
4904
  warnings: warnings.length ? warnings : void 0
4786
4905
  };
4787
4906
  if (!extractedText) {
@@ -4800,6 +4919,258 @@ async function extractDocxText(input) {
4800
4919
  };
4801
4920
  }
4802
4921
  }
4922
+ async function extractCsvText(input) {
4923
+ try {
4924
+ const rawText = decodeTextBytes(input.bytes);
4925
+ const delimiter = input.fileName?.toLowerCase().endsWith(".tsv") || input.mimeType.includes("tab-separated") ? " " : ",";
4926
+ const parsed = parseCsvSync(rawText, {
4927
+ delimiter,
4928
+ relax_column_count: true,
4929
+ skip_empty_lines: true,
4930
+ trim: true
4931
+ });
4932
+ const rows = parsed.map((row) => row.map((value) => normalizeTableCell(value)));
4933
+ const { headers, bodyRows } = detectHeaderRow(rows);
4934
+ const hintLines = columnHints(headers, bodyRows);
4935
+ const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
4936
+ const extractedText = [
4937
+ title ? `# ${title}` : null,
4938
+ `Format: ${delimiter === " " ? "TSV" : "CSV"}`,
4939
+ `Rows: ${bodyRows.length}`,
4940
+ `Columns: ${headers.length}`,
4941
+ headers.length ? `Headers: ${headers.join(", ")}` : null,
4942
+ "",
4943
+ hintLines.length ? "## Column Hints" : null,
4944
+ hintLines.length ? hintLines.join("\n") : null,
4945
+ hintLines.length ? "" : null,
4946
+ "## Preview",
4947
+ ...markdownTable(headers, bodyRows)
4948
+ ].filter((item) => Boolean(item)).join("\n").trim();
4949
+ const artifact = {
4950
+ ...extractionMetadata("csv", input.mimeType, "csv_text"),
4951
+ metadata: {
4952
+ format: delimiter === " " ? "tsv" : "csv",
4953
+ row_count: String(bodyRows.length),
4954
+ column_count: String(headers.length),
4955
+ headers: headers.join(", ")
4956
+ }
4957
+ };
4958
+ return {
4959
+ title,
4960
+ extractedText,
4961
+ artifact
4962
+ };
4963
+ } catch (error) {
4964
+ return {
4965
+ artifact: {
4966
+ ...extractionMetadata("csv", input.mimeType, "csv_text"),
4967
+ warnings: [`CSV extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
4968
+ }
4969
+ };
4970
+ }
4971
+ }
4972
+ async function extractXlsxText(input) {
4973
+ try {
4974
+ const XLSX = await import("xlsx");
4975
+ const workbook = XLSX.read(input.bytes, { type: "buffer", cellFormula: false, cellHTML: false, cellStyles: false });
4976
+ const allSheetNames = workbook.SheetNames;
4977
+ const sheetNames = allSheetNames.slice(0, 10);
4978
+ const sheetSections = [];
4979
+ const metadata = {
4980
+ sheet_count: String(allSheetNames.length),
4981
+ sheet_names: allSheetNames.join(", ")
4982
+ };
4983
+ for (const sheetName of sheetNames) {
4984
+ const sheet = workbook.Sheets[sheetName];
4985
+ if (!sheet) {
4986
+ continue;
4987
+ }
4988
+ const rows = XLSX.utils.sheet_to_json(sheet, {
4989
+ header: 1,
4990
+ raw: false,
4991
+ defval: ""
4992
+ }).map((row) => row.map((value) => normalizeTableCell(value)));
4993
+ const { headers, bodyRows } = detectHeaderRow(rows);
4994
+ sheetSections.push(`## Sheet: ${sheetName}`);
4995
+ sheetSections.push(`Rows: ${bodyRows.length}`);
4996
+ sheetSections.push(`Columns: ${headers.length}`);
4997
+ sheetSections.push(...markdownTable(headers, bodyRows));
4998
+ sheetSections.push("");
4999
+ }
5000
+ const title = normalizeWhitespace(String(workbook.Props?.Title ?? "")) || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
5001
+ const extractedText = [
5002
+ title ? `# ${title}` : null,
5003
+ `Sheets: ${allSheetNames.length}`,
5004
+ allSheetNames.length ? `Sheet Names: ${allSheetNames.join(", ")}` : null,
5005
+ "",
5006
+ ...sheetSections
5007
+ ].filter((item) => Boolean(item)).join("\n").trim();
5008
+ const warnings = allSheetNames.length > sheetNames.length ? ["Workbook preview truncated to the first 10 sheets."] : void 0;
5009
+ return {
5010
+ title,
5011
+ extractedText,
5012
+ artifact: {
5013
+ ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
5014
+ metadata,
5015
+ warnings
5016
+ }
5017
+ };
5018
+ } catch (error) {
5019
+ return {
5020
+ artifact: {
5021
+ ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
5022
+ warnings: [`XLSX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
5023
+ }
5024
+ };
5025
+ }
5026
+ }
5027
+ async function extractPptxText(input) {
5028
+ try {
5029
+ const archive = unzipSync(new Uint8Array(input.bytes));
5030
+ const presentationXml = zipEntryText(archive, "ppt/presentation.xml");
5031
+ if (!presentationXml) {
5032
+ throw new Error("Missing ppt/presentation.xml");
5033
+ }
5034
+ const relsXml = zipEntryText(archive, "ppt/_rels/presentation.xml.rels");
5035
+ if (!relsXml) {
5036
+ throw new Error("Missing ppt/_rels/presentation.xml.rels");
5037
+ }
5038
+ const rels = relationshipTargets(relsXml, "ppt/presentation.xml");
5039
+ const document = parseXmlDocument(presentationXml);
5040
+ const slideTargets = Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").map((node) => node.getAttribute("r:id")?.trim()).filter((value) => Boolean(value)).map((relationshipId) => rels.get(relationshipId)?.target).filter((value) => Boolean(value)).slice(0, 60);
5041
+ const slideSections = [];
5042
+ for (let index = 0; index < slideTargets.length; index += 1) {
5043
+ const slidePath = slideTargets[index];
5044
+ const slideXml = zipEntryText(archive, slidePath);
5045
+ if (!slideXml) {
5046
+ continue;
5047
+ }
5048
+ const slideTexts = xmlTextNodes(slideXml, "t");
5049
+ const slideTitle = slideTexts[0] ?? `Slide ${index + 1}`;
5050
+ slideSections.push(`## Slide ${index + 1}: ${slideTitle}`);
5051
+ if (slideTexts.length) {
5052
+ slideSections.push(slideTexts.join("\n"));
5053
+ }
5054
+ const slideRelsPath = `${zipDirname(slidePath)}/_rels/${path7.posix.basename(slidePath)}.rels`;
5055
+ const slideRelsXml = zipEntryText(archive, slideRelsPath);
5056
+ if (slideRelsXml) {
5057
+ const slideRels = relationshipTargets(slideRelsXml, slidePath);
5058
+ const notesTarget = [...slideRels.values()].find((entry) => entry.type.endsWith("/notesSlide"))?.target;
5059
+ if (notesTarget) {
5060
+ const notesXml = zipEntryText(archive, notesTarget);
5061
+ const noteTexts = notesXml ? xmlTextNodes(notesXml, "t") : [];
5062
+ if (noteTexts.length) {
5063
+ slideSections.push("Notes:");
5064
+ slideSections.push(noteTexts.join("\n"));
5065
+ }
5066
+ }
5067
+ }
5068
+ slideSections.push("");
5069
+ }
5070
+ const metadata = parseOfficeCoreMetadata(input.bytes);
5071
+ const title = metadata?.title || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
5072
+ const extractedText = [title ? `# ${title}` : null, `Slides: ${slideTargets.length}`, "", ...slideSections].filter((item) => Boolean(item)).join("\n").trim();
5073
+ return {
5074
+ title,
5075
+ extractedText,
5076
+ artifact: {
5077
+ ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
5078
+ metadata: {
5079
+ ...metadata ?? {},
5080
+ slide_count: String(slideTargets.length)
5081
+ },
5082
+ warnings: Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").length > slideTargets.length ? ["Slide extraction truncated to the first 60 slides."] : void 0
5083
+ }
5084
+ };
5085
+ } catch (error) {
5086
+ return {
5087
+ artifact: {
5088
+ ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
5089
+ warnings: [`PPTX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
5090
+ }
5091
+ };
5092
+ }
5093
+ }
5094
+ async function extractEpubChapters(input) {
5095
+ try {
5096
+ const archive = unzipSync(new Uint8Array(input.bytes));
5097
+ const containerXml = zipEntryText(archive, "META-INF/container.xml");
5098
+ if (!containerXml) {
5099
+ throw new Error("Missing META-INF/container.xml");
5100
+ }
5101
+ const container = parseXmlDocument(containerXml);
5102
+ const rootfile = Array.from(container.getElementsByTagName("*")).find((node) => node.localName === "rootfile");
5103
+ const packagePath = rootfile?.getAttribute("full-path")?.trim();
5104
+ if (!packagePath) {
5105
+ throw new Error("EPUB container did not declare a package document.");
5106
+ }
5107
+ const packageXml = zipEntryText(archive, packagePath);
5108
+ if (!packageXml) {
5109
+ throw new Error(`Missing EPUB package document: ${packagePath}`);
5110
+ }
5111
+ const packageDocument = parseXmlDocument(packageXml);
5112
+ const manifestEntries = new Map(
5113
+ Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "item").map(
5114
+ (node) => [
5115
+ node.getAttribute("id")?.trim() ?? "",
5116
+ {
5117
+ href: node.getAttribute("href")?.trim() ?? "",
5118
+ mediaType: node.getAttribute("media-type")?.trim() ?? "",
5119
+ properties: node.getAttribute("properties")?.trim() ?? ""
5120
+ }
5121
+ ]
5122
+ ).filter(([id, item]) => Boolean(id && item.href))
5123
+ );
5124
+ const spineIds = Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "itemref").map((node) => node.getAttribute("idref")?.trim()).filter((value) => Boolean(value));
5125
+ const bookTitle = xmlTextNodes(packageXml, "title")[0] || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
5126
+ const author = xmlTextNodes(packageXml, "creator")[0];
5127
+ const chapters = [];
5128
+ for (const spineId of spineIds) {
5129
+ const item = manifestEntries.get(spineId);
5130
+ if (!item || !item.mediaType.includes("html") && !item.mediaType.includes("xhtml")) {
5131
+ continue;
5132
+ }
5133
+ if (item.properties.split(/\s+/).includes("nav")) {
5134
+ continue;
5135
+ }
5136
+ const entryPath = resolveZipTarget(packagePath, item.href);
5137
+ const html = zipEntryText(archive, entryPath);
5138
+ if (!html) {
5139
+ continue;
5140
+ }
5141
+ const markdown = htmlToMarkdown(html);
5142
+ if (!markdown) {
5143
+ continue;
5144
+ }
5145
+ const chapterTitle = firstHtmlHeading(html) || markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() || item.href;
5146
+ const normalizedTitle = normalizeWhitespace(chapterTitle);
5147
+ if (!normalizedTitle || /^table of contents$/i.test(normalizedTitle)) {
5148
+ continue;
5149
+ }
5150
+ chapters.push({
5151
+ partKey: item.href,
5152
+ title: normalizedTitle,
5153
+ markdown,
5154
+ metadata: {
5155
+ book_title: bookTitle ?? "",
5156
+ chapter_title: normalizedTitle,
5157
+ author: author ?? ""
5158
+ }
5159
+ });
5160
+ }
5161
+ return {
5162
+ title: bookTitle,
5163
+ author,
5164
+ chapters,
5165
+ warnings: chapters.length ? void 0 : ["EPUB extraction completed but found no chapter-like spine entries."]
5166
+ };
5167
+ } catch (error) {
5168
+ return {
5169
+ chapters: [],
5170
+ warnings: [`EPUB extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
5171
+ };
5172
+ }
5173
+ }
4803
5174
 
4804
5175
  // src/logs.ts
4805
5176
  import fs8 from "fs/promises";
@@ -5208,6 +5579,17 @@ var HARD_REPO_IGNORES = /* @__PURE__ */ new Set([".git", ".venv"]);
5208
5579
  var PROGRESS_FILE_THRESHOLD = 150;
5209
5580
  var PROGRESS_UPDATE_INTERVAL = 100;
5210
5581
  var RST_HEADING_MARKERS = /* @__PURE__ */ new Set(["=", "-", "~", "^", '"', "#", "*", "+"]);
5582
+ var MARKDOWN_SEMANTIC_FRONTMATTER_KEYS = [
5583
+ "title",
5584
+ "summary",
5585
+ "description",
5586
+ "aliases",
5587
+ "tags",
5588
+ "authors",
5589
+ "published_at",
5590
+ "canonical_url",
5591
+ "source_type"
5592
+ ];
5211
5593
  function uniqueStrings(values) {
5212
5594
  return [...new Set(values.filter(Boolean))];
5213
5595
  }
@@ -5224,15 +5606,27 @@ function inferKind(mimeType, filePath) {
5224
5606
  if (mimeType.includes("html")) {
5225
5607
  return "html";
5226
5608
  }
5227
- if (mimeType.startsWith("text/")) {
5228
- return "text";
5229
- }
5230
5609
  if (mimeType === "application/pdf" || filePath.toLowerCase().endsWith(".pdf")) {
5231
5610
  return "pdf";
5232
5611
  }
5233
5612
  if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || filePath.toLowerCase().endsWith(".docx")) {
5234
5613
  return "docx";
5235
5614
  }
5615
+ if (mimeType === "application/epub+zip" || filePath.toLowerCase().endsWith(".epub")) {
5616
+ return "epub";
5617
+ }
5618
+ if (mimeType === "text/csv" || mimeType === "text/tab-separated-values" || filePath.toLowerCase().endsWith(".csv") || filePath.toLowerCase().endsWith(".tsv")) {
5619
+ return "csv";
5620
+ }
5621
+ if (mimeType.startsWith("text/")) {
5622
+ return "text";
5623
+ }
5624
+ if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || filePath.toLowerCase().endsWith(".xlsx")) {
5625
+ return "xlsx";
5626
+ }
5627
+ if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || filePath.toLowerCase().endsWith(".pptx")) {
5628
+ return "pptx";
5629
+ }
5236
5630
  if (mimeType.startsWith("image/")) {
5237
5631
  return "image";
5238
5632
  }
@@ -5258,6 +5652,10 @@ function guessMimeType(target) {
5258
5652
  }
5259
5653
  return mime.lookup(target) || "application/octet-stream";
5260
5654
  }
5655
+ function sourceGroupIdFor(prepared) {
5656
+ const originKey = prepared.originType === "url" ? prepared.url ?? prepared.title : prepared.originalPath ?? prepared.title;
5657
+ return `${slugify(prepared.title)}-${sha256(originKey).slice(0, 8)}`;
5658
+ }
5261
5659
  function rstAdornmentLine(line) {
5262
5660
  const trimmed = line.trim();
5263
5661
  if (trimmed.length < 3) {
@@ -5345,6 +5743,65 @@ function extractedTextForPlainSource(filePath, sourceKind, content) {
5345
5743
  }
5346
5744
  return content;
5347
5745
  }
5746
+ function normalizeSemanticMarkdownScalar(value) {
5747
+ if (typeof value !== "string") {
5748
+ return void 0;
5749
+ }
5750
+ const normalized = normalizeWhitespace(value.trim());
5751
+ return normalized || void 0;
5752
+ }
5753
+ function normalizeSemanticMarkdownList(value) {
5754
+ if (!Array.isArray(value)) {
5755
+ return void 0;
5756
+ }
5757
+ const items = uniqueStrings(
5758
+ value.flatMap((item) => typeof item === "string" ? [normalizeWhitespace(item.trim())] : []).filter(Boolean)
5759
+ );
5760
+ return items.length ? items : void 0;
5761
+ }
5762
+ function semanticMarkdownTitle(fallback, content, filePath) {
5763
+ const parsed = matter3(content);
5764
+ const frontmatterTitle = normalizeSemanticMarkdownScalar(parsed.data.title);
5765
+ if (frontmatterTitle) {
5766
+ return frontmatterTitle;
5767
+ }
5768
+ return titleFromText(fallback, parsed.content, filePath);
5769
+ }
5770
+ function semanticMarkdownContent(content) {
5771
+ const parsed = matter3(content);
5772
+ const body = parsed.content.replace(/\r\n?/g, "\n").trim();
5773
+ const semanticFrontmatter = Object.fromEntries(
5774
+ MARKDOWN_SEMANTIC_FRONTMATTER_KEYS.flatMap((key) => {
5775
+ const value = key === "aliases" || key === "tags" || key === "authors" ? normalizeSemanticMarkdownList(parsed.data[key]) : normalizeSemanticMarkdownScalar(parsed.data[key]);
5776
+ return value === void 0 ? [] : [[key, value]];
5777
+ })
5778
+ );
5779
+ const semanticLines = Object.entries(semanticFrontmatter).map(
5780
+ ([key, value]) => `${key}: ${Array.isArray(value) ? value.join(", ") : value}`
5781
+ );
5782
+ const extractedText = [...semanticLines, ...semanticLines.length && body ? [""] : [], body].filter(Boolean).join("\n").trim();
5783
+ return {
5784
+ extractedText,
5785
+ semanticHash: sha256(
5786
+ JSON.stringify({
5787
+ body,
5788
+ frontmatter: semanticFrontmatter
5789
+ })
5790
+ )
5791
+ };
5792
+ }
5793
+ function finalizePreparedInput(prepared) {
5794
+ if (prepared.sourceKind !== "markdown") {
5795
+ return prepared;
5796
+ }
5797
+ const semantic = semanticMarkdownContent(prepared.payloadBytes.toString("utf8"));
5798
+ return {
5799
+ ...prepared,
5800
+ extractedText: semantic.extractedText,
5801
+ extractionHash: buildExtractionHash(semantic.extractedText, prepared.extractionArtifact),
5802
+ semanticHash: semantic.semanticHash
5803
+ };
5804
+ }
5348
5805
  function shouldEmitProgress(totalItems) {
5349
5806
  return totalItems >= PROGRESS_FILE_THRESHOLD && Boolean(process.stderr?.isTTY);
5350
5807
  }
@@ -5511,7 +5968,7 @@ function markdownFrontmatter(value) {
5511
5968
  return matter3.stringify("", normalized).trimEnd().split("\n").concat([""]);
5512
5969
  }
5513
5970
  function prepareCapturedMarkdownInput(input) {
5514
- return {
5971
+ return finalizePreparedInput({
5515
5972
  title: input.title,
5516
5973
  originType: "url",
5517
5974
  sourceKind: "markdown",
@@ -5523,7 +5980,7 @@ function prepareCapturedMarkdownInput(input) {
5523
5980
  extractedText: input.markdown,
5524
5981
  attachments: input.attachments,
5525
5982
  logDetails: input.logDetails
5526
- };
5983
+ });
5527
5984
  }
5528
5985
  function isPrivateIp(ip) {
5529
5986
  if (ip === "::1" || ip.startsWith("fc") || ip.startsWith("fd")) return true;
@@ -5773,6 +6230,9 @@ function manifestMatchesOrigin(manifest, prepared) {
5773
6230
  }
5774
6231
  return Boolean(prepared.originalPath && manifest.originalPath && toPosix(manifest.originalPath) === toPosix(prepared.originalPath));
5775
6232
  }
6233
+ function manifestMatchesOriginPart(manifest, prepared) {
6234
+ return manifestMatchesOrigin(manifest, prepared) && (manifest.sourcePartKey ?? "") === (prepared.sourcePartKey ?? "");
6235
+ }
5776
6236
  function buildCompositeHash(payloadBytes, attachments = []) {
5777
6237
  if (!attachments.length) {
5778
6238
  return sha256(payloadBytes);
@@ -5870,7 +6330,7 @@ function extractMarkdownImageReferences(content, baseUrl) {
5870
6330
  async function convertHtmlToMarkdown(html, url) {
5871
6331
  const dom = new JSDOM2(html, { url });
5872
6332
  const article = new Readability(dom.window.document).parse();
5873
- const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
6333
+ const turndown = new TurndownService2({ headingStyle: "atx", codeBlockStyle: "fenced" });
5874
6334
  const body = article?.content ?? dom.window.document.body.innerHTML;
5875
6335
  const markdown = turndown.turndown(body);
5876
6336
  return {
@@ -5886,23 +6346,34 @@ async function readManifestByHash(manifestsDir, contentHash) {
5886
6346
  }
5887
6347
  const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
5888
6348
  if (manifest?.contentHash === contentHash) {
5889
- return manifest;
6349
+ return {
6350
+ ...manifest,
6351
+ semanticHash: manifest.semanticHash ?? manifest.contentHash
6352
+ };
5890
6353
  }
5891
6354
  }
5892
6355
  return null;
5893
6356
  }
5894
- async function readManifestByOrigin(manifestsDir, prepared) {
6357
+ async function readManifestsByOrigin(manifestsDir, prepared) {
5895
6358
  const entries = await fs11.readdir(manifestsDir, { withFileTypes: true }).catch(() => []);
6359
+ const manifests = [];
5896
6360
  for (const entry of entries) {
5897
6361
  if (!entry.isFile() || !entry.name.endsWith(".json")) {
5898
6362
  continue;
5899
6363
  }
5900
6364
  const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
5901
6365
  if (manifest && manifestMatchesOrigin(manifest, prepared)) {
5902
- return manifest;
6366
+ manifests.push({
6367
+ ...manifest,
6368
+ semanticHash: manifest.semanticHash ?? manifest.contentHash
6369
+ });
5903
6370
  }
5904
6371
  }
5905
- return null;
6372
+ return manifests;
6373
+ }
6374
+ async function readManifestByOrigin(manifestsDir, prepared) {
6375
+ const manifests = await readManifestsByOrigin(manifestsDir, prepared);
6376
+ return manifests.find((manifest) => manifestMatchesOriginPart(manifest, prepared)) ?? null;
5906
6377
  }
5907
6378
  async function loadGitignoreMatcher(repoRoot, enabled) {
5908
6379
  if (!enabled) {
@@ -6148,10 +6619,11 @@ async function persistPreparedInput(rootDir, prepared, paths) {
6148
6619
  await ensureDir(paths.extractsDir);
6149
6620
  const attachments = prepared.attachments ?? [];
6150
6621
  const contentHash = prepared.contentHash ?? buildCompositeHash(prepared.payloadBytes, attachments);
6622
+ const semanticHash = prepared.semanticHash ?? contentHash;
6151
6623
  const extractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
6152
6624
  const existingByOrigin = await readManifestByOrigin(paths.manifestsDir, prepared);
6153
- const existingByHash = existingByOrigin ? null : await readManifestByHash(paths.manifestsDir, contentHash);
6154
- if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath) {
6625
+ const existingByHash = existingByOrigin || prepared.sourcePartKey ? null : await readManifestByHash(paths.manifestsDir, contentHash);
6626
+ if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath && existingByOrigin.sourceGroupId === prepared.sourceGroupId && existingByOrigin.sourceGroupTitle === prepared.sourceGroupTitle && existingByOrigin.sourcePartKey === prepared.sourcePartKey && existingByOrigin.partIndex === prepared.partIndex && existingByOrigin.partCount === prepared.partCount && existingByOrigin.partTitle === prepared.partTitle && JSON.stringify(existingByOrigin.details ?? {}) === JSON.stringify(prepared.details ?? {})) {
6155
6627
  return { manifest: existingByOrigin, isNew: false, wasUpdated: false };
6156
6628
  }
6157
6629
  if (existingByHash) {
@@ -6209,6 +6681,14 @@ async function persistPreparedInput(rootDir, prepared, paths) {
6209
6681
  extractionHash,
6210
6682
  mimeType: prepared.mimeType,
6211
6683
  contentHash,
6684
+ semanticHash,
6685
+ sourceGroupId: prepared.sourceGroupId,
6686
+ sourceGroupTitle: prepared.sourceGroupTitle,
6687
+ sourcePartKey: prepared.sourcePartKey,
6688
+ partIndex: prepared.partIndex,
6689
+ partCount: prepared.partCount,
6690
+ partTitle: prepared.partTitle,
6691
+ details: prepared.details,
6212
6692
  createdAt: previous?.createdAt ?? now,
6213
6693
  updatedAt: now,
6214
6694
  attachments: manifestAttachments.length ? manifestAttachments : void 0
@@ -6230,6 +6710,42 @@ async function persistPreparedInput(rootDir, prepared, paths) {
6230
6710
  }
6231
6711
  return { manifest, isNew: !previous, wasUpdated: Boolean(previous) };
6232
6712
  }
6713
+ async function persistPreparedInputs(rootDir, input, preparedInputs, paths) {
6714
+ const template = preparedInputs[0];
6715
+ const existingByOrigin = template ? await readManifestsByOrigin(paths.manifestsDir, template) : [];
6716
+ const created = [];
6717
+ const updated = [];
6718
+ const unchanged = [];
6719
+ const removed = [];
6720
+ const seenSourceIds = /* @__PURE__ */ new Set();
6721
+ for (const prepared of preparedInputs) {
6722
+ const result = await persistPreparedInput(rootDir, prepared, paths);
6723
+ if (result.isNew) {
6724
+ created.push(result.manifest);
6725
+ } else if (result.wasUpdated) {
6726
+ updated.push(result.manifest);
6727
+ } else {
6728
+ unchanged.push(result.manifest);
6729
+ }
6730
+ seenSourceIds.add(result.manifest.sourceId);
6731
+ }
6732
+ for (const manifest of existingByOrigin) {
6733
+ if (seenSourceIds.has(manifest.sourceId)) {
6734
+ continue;
6735
+ }
6736
+ await removeManifestArtifacts(rootDir, manifest, paths);
6737
+ removed.push(manifest);
6738
+ }
6739
+ return {
6740
+ input,
6741
+ scannedCount: preparedInputs.length,
6742
+ created,
6743
+ updated,
6744
+ unchanged,
6745
+ removed,
6746
+ skipped: []
6747
+ };
6748
+ }
6233
6749
  async function removeManifestArtifacts(rootDir, manifest, paths) {
6234
6750
  await fs11.rm(path12.join(paths.manifestsDir, `${manifest.sourceId}.json`), { force: true });
6235
6751
  await fs11.rm(path12.resolve(rootDir, manifest.storedPath), { force: true });
@@ -6256,10 +6772,10 @@ function repoSyncWorkspaceIgnorePaths(rootDir, paths, repoRoot) {
6256
6772
  return candidates.map((candidate) => path12.resolve(candidate)).filter((candidate, index, items) => items.indexOf(candidate) === index).filter((candidate) => withinRoot(repoRoot, candidate));
6257
6773
  }
6258
6774
  function preparedMatchesManifest(manifest, prepared, contentHash) {
6259
- return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath;
6775
+ return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.semanticHash === (prepared.semanticHash ?? contentHash) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath && manifest.sourceGroupId === prepared.sourceGroupId && manifest.sourceGroupTitle === prepared.sourceGroupTitle && manifest.sourcePartKey === prepared.sourcePartKey && manifest.partIndex === prepared.partIndex && manifest.partCount === prepared.partCount && manifest.partTitle === prepared.partTitle && JSON.stringify(manifest.details ?? {}) === JSON.stringify(prepared.details ?? {});
6260
6776
  }
6261
6777
  function shouldDeferWatchSemanticRefresh(sourceKind) {
6262
- return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "image";
6778
+ return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "epub" || sourceKind === "csv" || sourceKind === "xlsx" || sourceKind === "pptx" || sourceKind === "image";
6263
6779
  }
6264
6780
  function pendingSemanticRefreshId(changeType, repoRoot, relativePath) {
6265
6781
  return `pending:${changeType}:${sha256(`${toPosix(repoRoot)}:${relativePath}`).slice(0, 12)}`;
@@ -6325,13 +6841,16 @@ async function syncTrackedRepos(rootDir, options, repoRoots) {
6325
6841
  const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
6326
6842
  for (const absolutePath of files) {
6327
6843
  const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
6328
- const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
6329
- const result = await persistPreparedInput(rootDir, prepared, paths);
6330
- if (result.isNew) {
6331
- imported.push(result.manifest);
6332
- } else if (result.wasUpdated) {
6333
- updated.push(result.manifest);
6334
- }
6844
+ const preparedInputs = await prepareFileInputs(
6845
+ rootDir,
6846
+ absolutePath,
6847
+ repoRoot,
6848
+ sourceClassForRelativePath(relativePath, normalizedOptions)
6849
+ );
6850
+ const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
6851
+ imported.push(...result.created);
6852
+ updated.push(...result.updated);
6853
+ removed.push(...result.removed);
6335
6854
  progress.tick();
6336
6855
  }
6337
6856
  progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
@@ -6390,9 +6909,6 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
6390
6909
  let scannedCount = 0;
6391
6910
  for (const repoRoot of uniqueRoots) {
6392
6911
  const repoManifests = manifestsByRepoRoot.get(repoRoot) ?? [];
6393
- const manifestsByOriginalPath = new Map(
6394
- repoManifests.filter((manifest) => manifest.originalPath).map((manifest) => [path12.resolve(manifest.originalPath), manifest])
6395
- );
6396
6912
  if (!await fileExists(repoRoot)) {
6397
6913
  for (const manifest of repoManifests) {
6398
6914
  if (shouldDeferWatchSemanticRefresh(manifest.sourceKind)) {
@@ -6428,38 +6944,50 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
6428
6944
  const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
6429
6945
  for (const absolutePath of files) {
6430
6946
  const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
6431
- const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
6432
- if (shouldDeferWatchSemanticRefresh(prepared.sourceKind)) {
6433
- const existing = manifestsByOriginalPath.get(path12.resolve(absolutePath));
6434
- const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
6435
- const changed = !existing || !preparedMatchesManifest(existing, prepared, contentHash);
6947
+ const preparedInputs = await prepareFileInputs(
6948
+ rootDir,
6949
+ absolutePath,
6950
+ repoRoot,
6951
+ sourceClassForRelativePath(relativePath, normalizedOptions)
6952
+ );
6953
+ const firstPrepared = preparedInputs[0];
6954
+ if (firstPrepared && shouldDeferWatchSemanticRefresh(firstPrepared.sourceKind)) {
6955
+ const existing = repoManifests.filter(
6956
+ (manifest) => manifest.originalPath && path12.resolve(manifest.originalPath) === path12.resolve(absolutePath)
6957
+ );
6958
+ const existingByPartKey = new Map(existing.map((manifest) => [manifest.sourcePartKey ?? "__single__", manifest]));
6959
+ const changed = existing.length !== preparedInputs.length || preparedInputs.some((prepared) => {
6960
+ const match = existingByPartKey.get(prepared.sourcePartKey ?? "__single__");
6961
+ const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
6962
+ return !match || !preparedMatchesManifest(match, prepared, contentHash);
6963
+ }) || existing.some(
6964
+ (manifest) => !preparedInputs.some((prepared) => (prepared.sourcePartKey ?? "") === (manifest.sourcePartKey ?? ""))
6965
+ );
6436
6966
  if (changed) {
6437
6967
  pendingSemanticRefresh.push({
6438
6968
  id: pendingSemanticRefreshId(
6439
- existing ? "modified" : "added",
6969
+ existing.length ? "modified" : "added",
6440
6970
  repoRoot,
6441
- prepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
6971
+ firstPrepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
6442
6972
  ),
6443
6973
  repoRoot,
6444
6974
  path: toPosix(path12.relative(rootDir, absolutePath)),
6445
- changeType: existing ? "modified" : "added",
6975
+ changeType: existing.length ? "modified" : "added",
6446
6976
  detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
6447
- sourceId: existing?.sourceId,
6448
- sourceKind: prepared.sourceKind
6977
+ sourceId: existing[0]?.sourceId,
6978
+ sourceKind: firstPrepared.sourceKind
6449
6979
  });
6450
- if (existing?.sourceId) {
6451
- staleSourceIds.add(existing.sourceId);
6980
+ for (const manifest of existing) {
6981
+ staleSourceIds.add(manifest.sourceId);
6452
6982
  }
6453
6983
  }
6454
6984
  progress.tick();
6455
6985
  continue;
6456
6986
  }
6457
- const result = await persistPreparedInput(rootDir, prepared, paths);
6458
- if (result.isNew) {
6459
- imported.push(result.manifest);
6460
- } else if (result.wasUpdated) {
6461
- updated.push(result.manifest);
6462
- }
6987
+ const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
6988
+ imported.push(...result.created);
6989
+ updated.push(...result.updated);
6990
+ removed.push(...result.removed);
6463
6991
  progress.tick();
6464
6992
  }
6465
6993
  progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
@@ -6513,7 +7041,7 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
6513
7041
  staleSourceIds: [...staleSourceIds]
6514
7042
  };
6515
7043
  }
6516
- async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
7044
+ async function prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass) {
6517
7045
  const payloadBytes = await fs11.readFile(absoluteInput);
6518
7046
  const mimeType = guessMimeType(absoluteInput);
6519
7047
  const sourceKind = inferKind(mimeType, absoluteInput);
@@ -6523,14 +7051,15 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
6523
7051
  let extractedText;
6524
7052
  let extractionArtifact;
6525
7053
  if (sourceKind === "markdown" || sourceKind === "text" || sourceKind === "code") {
6526
- extractedText = extractedTextForPlainSource(absoluteInput, sourceKind, payloadBytes.toString("utf8"));
6527
- title = titleFromText(path12.basename(absoluteInput, path12.extname(absoluteInput)), extractedText, absoluteInput);
7054
+ const rawText = payloadBytes.toString("utf8");
7055
+ extractedText = sourceKind === "markdown" ? semanticMarkdownContent(rawText).extractedText : extractedTextForPlainSource(absoluteInput, sourceKind, rawText);
7056
+ title = sourceKind === "markdown" ? semanticMarkdownTitle(path12.basename(absoluteInput, path12.extname(absoluteInput)), rawText, absoluteInput) : titleFromText(path12.basename(absoluteInput, path12.extname(absoluteInput)), extractedText, absoluteInput);
6528
7057
  extractionArtifact = createPlainTextExtractionArtifact(sourceKind, mimeType);
6529
7058
  } else if (sourceKind === "html") {
6530
7059
  const html = payloadBytes.toString("utf8");
6531
7060
  const converted = await convertHtmlToMarkdown(html, pathToFileURL(absoluteInput).toString());
6532
7061
  title = converted.title;
6533
- extractedText = converted.markdown;
7062
+ extractedText = semanticMarkdownContent(converted.markdown).extractedText;
6534
7063
  extractionArtifact = createHtmlReadabilityExtractionArtifact(sourceKind, mimeType);
6535
7064
  } else if (sourceKind === "pdf") {
6536
7065
  title = path12.basename(absoluteInput, path12.extname(absoluteInput));
@@ -6543,6 +7072,94 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
6543
7072
  title = extracted.artifact.metadata?.title?.trim() || title;
6544
7073
  extractedText = extracted.extractedText;
6545
7074
  extractionArtifact = extracted.artifact;
7075
+ } else if (sourceKind === "csv") {
7076
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7077
+ const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7078
+ title = extracted.title?.trim() || title;
7079
+ extractedText = extracted.extractedText;
7080
+ extractionArtifact = extracted.artifact;
7081
+ } else if (sourceKind === "xlsx") {
7082
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7083
+ const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7084
+ title = extracted.title?.trim() || title;
7085
+ extractedText = extracted.extractedText;
7086
+ extractionArtifact = extracted.artifact;
7087
+ } else if (sourceKind === "pptx") {
7088
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7089
+ const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7090
+ title = extracted.title?.trim() || title;
7091
+ extractedText = extracted.extractedText;
7092
+ extractionArtifact = extracted.artifact;
7093
+ } else if (sourceKind === "epub") {
7094
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7095
+ const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7096
+ title = extracted.title?.trim() || title;
7097
+ const groupId = sourceGroupIdFor({
7098
+ title,
7099
+ originType: "file",
7100
+ originalPath: toPosix(absoluteInput)
7101
+ });
7102
+ if (extracted.chapters.length) {
7103
+ return extracted.chapters.map(
7104
+ (chapter, index) => finalizePreparedInput({
7105
+ title: `${title} - ${chapter.title}`,
7106
+ originType: "file",
7107
+ sourceKind: "epub",
7108
+ sourceClass,
7109
+ originalPath: toPosix(absoluteInput),
7110
+ repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
7111
+ mimeType: "text/markdown",
7112
+ storedExtension: ".md",
7113
+ payloadBytes: Buffer.from(chapter.markdown, "utf8"),
7114
+ extractedText: chapter.markdown,
7115
+ extractionArtifact: {
7116
+ extractor: "epub_text",
7117
+ sourceKind: "epub",
7118
+ mimeType,
7119
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7120
+ metadata: {
7121
+ ...chapter.metadata,
7122
+ chapter_index: String(index + 1),
7123
+ chapter_count: String(extracted.chapters.length)
7124
+ },
7125
+ warnings: extracted.warnings
7126
+ },
7127
+ extractionHash: buildExtractionHash(chapter.markdown, {
7128
+ extractor: "epub_text",
7129
+ sourceKind: "epub",
7130
+ mimeType,
7131
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7132
+ metadata: {
7133
+ ...chapter.metadata,
7134
+ chapter_index: String(index + 1),
7135
+ chapter_count: String(extracted.chapters.length)
7136
+ },
7137
+ warnings: extracted.warnings
7138
+ }),
7139
+ sourceGroupId: groupId,
7140
+ sourceGroupTitle: title,
7141
+ sourcePartKey: chapter.partKey,
7142
+ partIndex: index + 1,
7143
+ partCount: extracted.chapters.length,
7144
+ partTitle: chapter.title,
7145
+ details: {
7146
+ book_title: title,
7147
+ chapter_title: chapter.title,
7148
+ chapter_index: String(index + 1),
7149
+ chapter_count: String(extracted.chapters.length),
7150
+ ...extracted.author ? { author: extracted.author } : {}
7151
+ }
7152
+ })
7153
+ );
7154
+ }
7155
+ extractedText = void 0;
7156
+ extractionArtifact = {
7157
+ extractor: "epub_text",
7158
+ sourceKind: "epub",
7159
+ mimeType,
7160
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7161
+ warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
7162
+ };
6546
7163
  } else if (sourceKind === "image") {
6547
7164
  title = path12.basename(absoluteInput, path12.extname(absoluteInput));
6548
7165
  const extracted = await extractImageWithVision(rootDir, {
@@ -6556,23 +7173,33 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
6556
7173
  } else {
6557
7174
  title = path12.basename(absoluteInput, path12.extname(absoluteInput));
6558
7175
  }
6559
- return {
6560
- title,
6561
- originType: "file",
6562
- sourceKind,
6563
- sourceClass,
6564
- language,
6565
- originalPath: toPosix(absoluteInput),
6566
- repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
6567
- mimeType,
6568
- storedExtension,
6569
- payloadBytes,
6570
- extractedText,
6571
- extractionArtifact,
6572
- extractionHash: buildExtractionHash(extractedText, extractionArtifact)
6573
- };
7176
+ return [
7177
+ finalizePreparedInput({
7178
+ title,
7179
+ originType: "file",
7180
+ sourceKind,
7181
+ sourceClass,
7182
+ language,
7183
+ originalPath: toPosix(absoluteInput),
7184
+ repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
7185
+ mimeType,
7186
+ storedExtension,
7187
+ payloadBytes,
7188
+ extractedText,
7189
+ extractionArtifact,
7190
+ extractionHash: buildExtractionHash(extractedText, extractionArtifact),
7191
+ details: extractionArtifact?.metadata
7192
+ })
7193
+ ];
6574
7194
  }
6575
- async function prepareUrlInput(rootDir, input, options) {
7195
+ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
7196
+ const prepared = await prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass);
7197
+ if (!prepared.length) {
7198
+ throw new Error(`No ingestable sources were extracted from ${absoluteInput}.`);
7199
+ }
7200
+ return prepared[0];
7201
+ }
7202
+ async function prepareUrlInputs(rootDir, input, options) {
6576
7203
  await validateUrlSafety(input);
6577
7204
  const response = await fetch(input);
6578
7205
  if (!response.ok) {
@@ -6634,8 +7261,9 @@ async function prepareUrlInput(rootDir, input, options) {
6634
7261
  const extension = path12.extname(inputUrl.pathname);
6635
7262
  storedExtension = extension || `.${mime.extension(mimeType) || "bin"}`;
6636
7263
  if (sourceKind === "markdown" || sourceKind === "text" || sourceKind === "code") {
6637
- extractedText = extractedTextForPlainSource(inputUrl.pathname, sourceKind, payloadBytes.toString("utf8"));
6638
- title = titleFromText(title || inputUrl.hostname, extractedText, inputUrl.pathname);
7264
+ const rawText = payloadBytes.toString("utf8");
7265
+ extractedText = sourceKind === "markdown" ? semanticMarkdownContent(rawText).extractedText : extractedTextForPlainSource(inputUrl.pathname, sourceKind, rawText);
7266
+ title = sourceKind === "markdown" ? semanticMarkdownTitle(title || inputUrl.hostname, rawText, inputUrl.pathname) : titleFromText(title || inputUrl.hostname, extractedText, inputUrl.pathname);
6639
7267
  extractionArtifact = createPlainTextExtractionArtifact(sourceKind, mimeType);
6640
7268
  if (sourceKind === "markdown" && options.includeAssets) {
6641
7269
  const { attachments: remoteAttachments, skippedCount } = await collectRemoteImageAttachments(
@@ -6666,6 +7294,88 @@ async function prepareUrlInput(rootDir, input, options) {
6666
7294
  title = extracted.artifact.metadata?.title?.trim() || title;
6667
7295
  extractedText = extracted.extractedText;
6668
7296
  extractionArtifact = extracted.artifact;
7297
+ } else if (sourceKind === "csv") {
7298
+ const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7299
+ title = extracted.title?.trim() || title;
7300
+ extractedText = extracted.extractedText;
7301
+ extractionArtifact = extracted.artifact;
7302
+ } else if (sourceKind === "xlsx") {
7303
+ const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7304
+ title = extracted.title?.trim() || title;
7305
+ extractedText = extracted.extractedText;
7306
+ extractionArtifact = extracted.artifact;
7307
+ } else if (sourceKind === "pptx") {
7308
+ const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7309
+ title = extracted.title?.trim() || title;
7310
+ extractedText = extracted.extractedText;
7311
+ extractionArtifact = extracted.artifact;
7312
+ } else if (sourceKind === "epub") {
7313
+ const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7314
+ title = extracted.title?.trim() || title;
7315
+ const groupId = sourceGroupIdFor({
7316
+ title,
7317
+ originType: "url",
7318
+ url: finalUrl
7319
+ });
7320
+ if (extracted.chapters.length) {
7321
+ return extracted.chapters.map(
7322
+ (chapter, index) => finalizePreparedInput({
7323
+ title: `${title} - ${chapter.title}`,
7324
+ originType: "url",
7325
+ sourceKind: "epub",
7326
+ url: finalUrl,
7327
+ mimeType: "text/markdown",
7328
+ storedExtension: ".md",
7329
+ payloadBytes: Buffer.from(chapter.markdown, "utf8"),
7330
+ extractedText: chapter.markdown,
7331
+ extractionArtifact: {
7332
+ extractor: "epub_text",
7333
+ sourceKind: "epub",
7334
+ mimeType,
7335
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7336
+ metadata: {
7337
+ ...chapter.metadata,
7338
+ chapter_index: String(index + 1),
7339
+ chapter_count: String(extracted.chapters.length)
7340
+ },
7341
+ warnings: extracted.warnings
7342
+ },
7343
+ extractionHash: buildExtractionHash(chapter.markdown, {
7344
+ extractor: "epub_text",
7345
+ sourceKind: "epub",
7346
+ mimeType,
7347
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7348
+ metadata: {
7349
+ ...chapter.metadata,
7350
+ chapter_index: String(index + 1),
7351
+ chapter_count: String(extracted.chapters.length)
7352
+ },
7353
+ warnings: extracted.warnings
7354
+ }),
7355
+ sourceGroupId: groupId,
7356
+ sourceGroupTitle: title,
7357
+ sourcePartKey: chapter.partKey,
7358
+ partIndex: index + 1,
7359
+ partCount: extracted.chapters.length,
7360
+ partTitle: chapter.title,
7361
+ details: {
7362
+ book_title: title,
7363
+ chapter_title: chapter.title,
7364
+ chapter_index: String(index + 1),
7365
+ chapter_count: String(extracted.chapters.length),
7366
+ ...extracted.author ? { author: extracted.author } : {}
7367
+ },
7368
+ logDetails
7369
+ })
7370
+ );
7371
+ }
7372
+ extractionArtifact = {
7373
+ extractor: "epub_text",
7374
+ sourceKind: "epub",
7375
+ mimeType,
7376
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7377
+ warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
7378
+ };
6669
7379
  } else if (sourceKind === "image") {
6670
7380
  const extracted = await extractImageWithVision(rootDir, {
6671
7381
  title,
@@ -6677,22 +7387,32 @@ async function prepareUrlInput(rootDir, input, options) {
6677
7387
  extractionArtifact = extracted.artifact;
6678
7388
  }
6679
7389
  }
6680
- return {
6681
- title,
6682
- originType: "url",
6683
- sourceKind,
6684
- language,
6685
- url: finalUrl,
6686
- mimeType,
6687
- storedExtension,
6688
- payloadBytes,
6689
- extractedText,
6690
- extractionArtifact,
6691
- extractionHash: buildExtractionHash(extractedText, extractionArtifact),
6692
- attachments,
6693
- contentHash,
6694
- logDetails
6695
- };
7390
+ return [
7391
+ finalizePreparedInput({
7392
+ title,
7393
+ originType: "url",
7394
+ sourceKind,
7395
+ language,
7396
+ url: finalUrl,
7397
+ mimeType,
7398
+ storedExtension,
7399
+ payloadBytes,
7400
+ extractedText,
7401
+ extractionArtifact,
7402
+ extractionHash: buildExtractionHash(extractedText, extractionArtifact),
7403
+ attachments,
7404
+ contentHash,
7405
+ details: extractionArtifact?.metadata,
7406
+ logDetails
7407
+ })
7408
+ ];
7409
+ }
7410
+ async function prepareUrlInput(rootDir, input, options) {
7411
+ const prepared = await prepareUrlInputs(rootDir, input, options);
7412
+ if (!prepared.length) {
7413
+ throw new Error(`No ingestable sources were extracted from ${input}.`);
7414
+ }
7415
+ return prepared[0];
6696
7416
  }
6697
7417
  async function collectInboxAttachmentRefs(inputDir, files) {
6698
7418
  const refsBySource = /* @__PURE__ */ new Map();
@@ -6766,7 +7486,7 @@ async function prepareInboxMarkdownInput(absolutePath, attachmentRefs) {
6766
7486
  );
6767
7487
  const rewrittenText = rewriteMarkdownReferences(originalText, replacements);
6768
7488
  const extractionArtifact = createPlainTextExtractionArtifact("markdown", "text/markdown");
6769
- return {
7489
+ return finalizePreparedInput({
6770
7490
  title,
6771
7491
  originType: "file",
6772
7492
  sourceKind: "markdown",
@@ -6779,7 +7499,7 @@ async function prepareInboxMarkdownInput(absolutePath, attachmentRefs) {
6779
7499
  extractionHash: buildExtractionHash(rewrittenText, extractionArtifact),
6780
7500
  attachments,
6781
7501
  contentHash
6782
- };
7502
+ });
6783
7503
  }
6784
7504
  async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
6785
7505
  const originalBytes = await fs11.readFile(absolutePath);
@@ -6824,18 +7544,23 @@ async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
6824
7544
  };
6825
7545
  }
6826
7546
  function isSupportedInboxKind(sourceKind) {
6827
- return ["markdown", "text", "html", "pdf", "docx", "image"].includes(sourceKind);
7547
+ return ["markdown", "text", "html", "pdf", "docx", "epub", "csv", "xlsx", "pptx", "image"].includes(sourceKind);
6828
7548
  }
6829
7549
  async function ingestInputDetailed(rootDir, input, options) {
6830
7550
  const { paths } = await initWorkspace(rootDir);
6831
7551
  const normalizedOptions = normalizeIngestOptions(options);
6832
7552
  const absoluteInput = path12.resolve(rootDir, input);
6833
7553
  const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await findNearestGitRoot2(absoluteInput).then((value) => value ?? path12.dirname(absoluteInput));
6834
- const prepared = isHttpUrl(input) ? await prepareUrlInput(rootDir, input, normalizedOptions) : await prepareFileInput(rootDir, absoluteInput, repoRoot);
6835
- return await persistPreparedInput(rootDir, prepared, paths);
7554
+ const prepared = isHttpUrl(input) ? await prepareUrlInputs(rootDir, input, normalizedOptions) : await prepareFileInputs(rootDir, absoluteInput, repoRoot);
7555
+ return await persistPreparedInputs(rootDir, input, prepared, paths);
6836
7556
  }
6837
7557
  async function ingestInput(rootDir, input, options) {
6838
- return (await ingestInputDetailed(rootDir, input, options)).manifest;
7558
+ const result = await ingestInputDetailed(rootDir, input, options);
7559
+ const manifest = [...result.created, ...result.updated, ...result.unchanged][0];
7560
+ if (!manifest) {
7561
+ throw new Error(`No source manifests were created or updated for ${input}.`);
7562
+ }
7563
+ return manifest;
6839
7564
  }
6840
7565
  async function addInput(rootDir, input, options = {}) {
6841
7566
  const { paths } = await initWorkspace(rootDir);
@@ -6933,13 +7658,20 @@ async function ingestDirectory(rootDir, inputDir, options) {
6933
7658
  const progress = createProgressReporter("ingest", files.length);
6934
7659
  for (const absolutePath of files) {
6935
7660
  const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
6936
- const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
6937
- const result = await persistPreparedInput(rootDir, prepared, paths);
6938
- if (result.isNew) {
6939
- imported.push(result.manifest);
6940
- } else if (result.wasUpdated) {
6941
- updated.push(result.manifest);
6942
- } else {
7661
+ const preparedInputs = await prepareFileInputs(
7662
+ rootDir,
7663
+ absolutePath,
7664
+ repoRoot,
7665
+ sourceClassForRelativePath(relativePath, normalizedOptions)
7666
+ );
7667
+ const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
7668
+ if (result.created.length) {
7669
+ imported.push(...result.created);
7670
+ }
7671
+ if (result.updated.length) {
7672
+ updated.push(...result.updated);
7673
+ }
7674
+ if (!result.created.length && !result.updated.length && !result.removed.length) {
6943
7675
  skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
6944
7676
  }
6945
7677
  progress.tick();
@@ -6990,13 +7722,13 @@ async function importInbox(rootDir, inputDir) {
6990
7722
  continue;
6991
7723
  }
6992
7724
  const prepared = sourceKind === "markdown" && refsBySource.has(absolutePath) ? await prepareInboxMarkdownInput(absolutePath, refsBySource.get(absolutePath) ?? []) : sourceKind === "html" && refsBySource.has(absolutePath) ? await prepareInboxHtmlInput(absolutePath, refsBySource.get(absolutePath) ?? []) : await prepareFileInput(rootDir, absolutePath);
6993
- const result = await persistPreparedInput(rootDir, prepared, paths);
6994
- if (!result.isNew) {
7725
+ const result = await persistPreparedInputs(rootDir, absolutePath, [prepared], paths);
7726
+ if (!result.created.length) {
6995
7727
  skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
6996
7728
  continue;
6997
7729
  }
6998
- attachmentCount += result.manifest.attachments?.length ?? 0;
6999
- imported.push(result.manifest);
7730
+ attachmentCount += result.created.reduce((total, manifest) => total + (manifest.attachments?.length ?? 0), 0);
7731
+ imported.push(...result.created);
7000
7732
  }
7001
7733
  await appendLogEntry(rootDir, "inbox_import", toPosix(path12.relative(rootDir, effectiveInputDir)) || ".", [
7002
7734
  `scanned=${files.length}`,
@@ -7021,7 +7753,10 @@ async function listManifests(rootDir) {
7021
7753
  const manifests = await Promise.all(
7022
7754
  entries.filter((entry) => entry.endsWith(".json")).map((entry) => readJsonFile(path12.join(paths.manifestsDir, entry)))
7023
7755
  );
7024
- return manifests.filter((manifest) => Boolean(manifest));
7756
+ return manifests.filter((manifest) => Boolean(manifest)).map((manifest) => ({
7757
+ ...manifest,
7758
+ semanticHash: manifest.semanticHash ?? manifest.contentHash
7759
+ }));
7025
7760
  }
7026
7761
  async function removeManifestBySourceId(rootDir, sourceId) {
7027
7762
  const { paths } = await initWorkspace(rootDir);
@@ -7029,8 +7764,12 @@ async function removeManifestBySourceId(rootDir, sourceId) {
7029
7764
  if (!manifest) {
7030
7765
  return null;
7031
7766
  }
7032
- await removeManifestArtifacts(rootDir, manifest, paths);
7033
- return manifest;
7767
+ const normalizedManifest = {
7768
+ ...manifest,
7769
+ semanticHash: manifest.semanticHash ?? manifest.contentHash
7770
+ };
7771
+ await removeManifestArtifacts(rootDir, normalizedManifest, paths);
7772
+ return normalizedManifest;
7034
7773
  }
7035
7774
  async function readExtractedText(rootDir, manifest) {
7036
7775
  if (!manifest.extractedTextPath) {
@@ -7176,7 +7915,7 @@ import { z as z7 } from "zod";
7176
7915
  // src/analysis.ts
7177
7916
  import path14 from "path";
7178
7917
  import { z as z2 } from "zod";
7179
- var ANALYSIS_FORMAT_VERSION = 6;
7918
+ var ANALYSIS_FORMAT_VERSION = 7;
7180
7919
  var sourceAnalysisSchema = z2.object({
7181
7920
  title: z2.string().min(1),
7182
7921
  summary: z2.string().min(1),
@@ -7281,6 +8020,7 @@ function heuristicAnalysis(manifest, text, schemaHash) {
7281
8020
  analysisVersion: ANALYSIS_FORMAT_VERSION,
7282
8021
  sourceId: manifest.sourceId,
7283
8022
  sourceHash: manifest.contentHash,
8023
+ semanticHash: manifest.semanticHash,
7284
8024
  extractionHash: manifest.extractionHash,
7285
8025
  schemaHash,
7286
8026
  title: deriveTitle(manifest, text),
@@ -7331,6 +8071,7 @@ ${truncate(text, 18e3)}`
7331
8071
  analysisVersion: ANALYSIS_FORMAT_VERSION,
7332
8072
  sourceId: manifest.sourceId,
7333
8073
  sourceHash: manifest.contentHash,
8074
+ semanticHash: manifest.semanticHash,
7334
8075
  extractionHash: manifest.extractionHash,
7335
8076
  schemaHash: schema.hash,
7336
8077
  title: parsed.title,
@@ -7367,6 +8108,7 @@ function analysisFromVisionExtraction(manifest, extraction, schemaHash) {
7367
8108
  analysisVersion: ANALYSIS_FORMAT_VERSION,
7368
8109
  sourceId: manifest.sourceId,
7369
8110
  sourceHash: manifest.contentHash,
8111
+ semanticHash: manifest.semanticHash,
7370
8112
  extractionHash: manifest.extractionHash,
7371
8113
  schemaHash,
7372
8114
  title: extraction.vision.title?.trim() || manifest.title,
@@ -7405,7 +8147,7 @@ function extractionWarningSummary(manifest, extraction) {
7405
8147
  async function analyzeSource(manifest, extractedText, provider, paths, schema) {
7406
8148
  const cachePath = path14.join(paths.analysesDir, `${manifest.sourceId}.json`);
7407
8149
  const cached = await readJsonFile(cachePath);
7408
- if (cached && cached.analysisVersion === ANALYSIS_FORMAT_VERSION && cached.sourceHash === manifest.contentHash && cached.extractionHash === manifest.extractionHash && cached.schemaHash === schema.hash) {
8150
+ if (cached && cached.analysisVersion === ANALYSIS_FORMAT_VERSION && (cached.semanticHash ?? cached.sourceHash) === manifest.semanticHash && cached.extractionHash === manifest.extractionHash && cached.schemaHash === schema.hash) {
7409
8151
  return cached;
7410
8152
  }
7411
8153
  const extraction = await readExtractionArtifact(paths.rootDir, manifest);
@@ -7422,6 +8164,7 @@ async function analyzeSource(manifest, extractedText, provider, paths, schema) {
7422
8164
  analysisVersion: ANALYSIS_FORMAT_VERSION,
7423
8165
  sourceId: manifest.sourceId,
7424
8166
  sourceHash: manifest.contentHash,
8167
+ semanticHash: manifest.semanticHash,
7425
8168
  extractionHash: manifest.extractionHash,
7426
8169
  schemaHash: schema.hash,
7427
8170
  title: manifest.title,
@@ -7448,6 +8191,7 @@ async function analyzeSource(manifest, extractedText, provider, paths, schema) {
7448
8191
  analysisVersion: ANALYSIS_FORMAT_VERSION,
7449
8192
  sourceId: manifest.sourceId,
7450
8193
  sourceHash: manifest.contentHash,
8194
+ semanticHash: manifest.semanticHash,
7451
8195
  extractionHash: manifest.extractionHash,
7452
8196
  schemaHash: schema.hash,
7453
8197
  title: manifest.title,
@@ -8231,7 +8975,9 @@ async function resolveEmbeddingProvider(rootDir) {
8231
8975
  }
8232
8976
  const provider2 = await createProvider(explicitProviderId, providerConfig, rootDir);
8233
8977
  if (!provider2.capabilities.has("embeddings") || typeof provider2.embedTexts !== "function") {
8234
- throw new Error(`Provider ${provider2.id} does not support required capability "embeddings".`);
8978
+ throw new Error(
8979
+ `Provider ${provider2.id} does not support required capability "embeddings". Configure tasks.embeddingProvider to use an embedding-capable backend such as ollama or another openai-compatible embedding service.`
8980
+ );
8235
8981
  }
8236
8982
  return provider2;
8237
8983
  }
@@ -9127,6 +9873,18 @@ function uniqueStrings2(values) {
9127
9873
  function safeFrontmatter(value) {
9128
9874
  return JSON.parse(JSON.stringify(value));
9129
9875
  }
9876
+ function sourceHashesForManifest(manifest) {
9877
+ return {
9878
+ sourceHashes: { [manifest.sourceId]: manifest.contentHash },
9879
+ sourceSemanticHashes: { [manifest.sourceId]: manifest.semanticHash }
9880
+ };
9881
+ }
9882
+ function sourceHashFrontmatter(sourceHashes, sourceSemanticHashes) {
9883
+ return {
9884
+ source_hashes: sourceHashes,
9885
+ source_semantic_hashes: sourceSemanticHashes
9886
+ };
9887
+ }
9130
9888
  function decoratedTags(baseTags, decorations) {
9131
9889
  return uniqueStrings2([
9132
9890
  ...baseTags,
@@ -9190,6 +9948,7 @@ function relatedOutputsSection(relatedOutputs) {
9190
9948
  function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutputs = [], modulePage, decorations) {
9191
9949
  const relativePath = pagePathFor("source", manifest.sourceId);
9192
9950
  const pageId = `source:${manifest.sourceId}`;
9951
+ const { sourceHashes, sourceSemanticHashes } = sourceHashesForManifest(manifest);
9193
9952
  const moduleNodeIds = analysis.code ? [analysis.code.moduleId, ...analysis.code.symbols.map((symbol) => symbol.id)] : [];
9194
9953
  const nodeIds = [
9195
9954
  `source:${manifest.sourceId}`,
@@ -9222,17 +9981,25 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
9222
9981
  managed_by: metadata.managedBy,
9223
9982
  backlinks,
9224
9983
  schema_hash: schemaHash,
9225
- source_hashes: {
9226
- [manifest.sourceId]: manifest.contentHash
9227
- }
9984
+ ...sourceHashFrontmatter(sourceHashes, sourceSemanticHashes)
9228
9985
  };
9229
9986
  const body = [
9230
9987
  `# ${analysis.title}`,
9231
9988
  "",
9232
9989
  `Source ID: \`${manifest.sourceId}\``,
9990
+ `Source Kind: \`${manifest.sourceKind}\``,
9233
9991
  manifest.url ? `Source URL: ${manifest.url}` : `Source Path: \`${manifest.originalPath ?? manifest.storedPath}\``,
9234
9992
  ...manifest.sourceType ? [`Source Type: \`${manifest.sourceType}\``, ""] : [""],
9235
9993
  ...manifest.sourceClass ? [`Source Class: \`${manifest.sourceClass}\``, ""] : [],
9994
+ ...manifest.sourceGroupTitle ? [`Source Group: ${manifest.sourceGroupTitle}`] : [],
9995
+ ...manifest.partTitle ? [`Part: ${manifest.partIndex ?? "?"}/${manifest.partCount ?? "?"} - ${manifest.partTitle}`] : [],
9996
+ ...manifest.details && Object.keys(manifest.details).length ? [
9997
+ "",
9998
+ "## Source Details",
9999
+ "",
10000
+ ...Object.entries(manifest.details).map(([key, value]) => `- ${key.replace(/_/g, " ")}: ${value}`),
10001
+ ""
10002
+ ] : [],
9236
10003
  "",
9237
10004
  "## Summary",
9238
10005
  "",
@@ -9287,7 +10054,8 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
9287
10054
  confidence: metadata.confidence,
9288
10055
  backlinks,
9289
10056
  schemaHash,
9290
- sourceHashes: { [manifest.sourceId]: manifest.contentHash },
10057
+ sourceHashes,
10058
+ sourceSemanticHashes,
9291
10059
  relatedPageIds: [...modulePage ? [modulePage.id] : [], ...relatedOutputs.map((page) => page.id)],
9292
10060
  relatedNodeIds: moduleNodeIds,
9293
10061
  relatedSourceIds: [],
@@ -9312,6 +10080,7 @@ function buildModulePage(input) {
9312
10080
  const localModuleBacklinks = input.localModules.map((moduleRef) => moduleRef.page.id);
9313
10081
  const relatedOutputs = input.relatedOutputs ?? [];
9314
10082
  const backlinks = uniqueStrings2([sourcePage.id, ...localModuleBacklinks, ...relatedOutputs.map((page) => page.id)]);
10083
+ const { sourceHashes, sourceSemanticHashes } = sourceHashesForManifest(manifest);
9315
10084
  const importsSection = code.imports.length ? code.imports.map((item) => {
9316
10085
  const localModule = item.resolvedSourceId ? input.localModules.find((moduleRef) => moduleRef.sourceId === item.resolvedSourceId && moduleRef.reExport === item.reExport) : void 0;
9317
10086
  const importedBits = [
@@ -9355,9 +10124,7 @@ function buildModulePage(input) {
9355
10124
  managed_by: metadata.managedBy,
9356
10125
  backlinks,
9357
10126
  schema_hash: schemaHash,
9358
- source_hashes: {
9359
- [manifest.sourceId]: manifest.contentHash
9360
- },
10127
+ ...sourceHashFrontmatter(sourceHashes, sourceSemanticHashes),
9361
10128
  related_page_ids: uniqueStrings2([sourcePage.id, ...localModuleBacklinks, ...relatedOutputs.map((page) => page.id)]),
9362
10129
  related_node_ids: [],
9363
10130
  related_source_ids: uniqueStrings2([
@@ -9433,7 +10200,8 @@ function buildModulePage(input) {
9433
10200
  confidence: metadata.confidence,
9434
10201
  backlinks,
9435
10202
  schemaHash,
9436
- sourceHashes: { [manifest.sourceId]: manifest.contentHash },
10203
+ sourceHashes,
10204
+ sourceSemanticHashes,
9437
10205
  relatedPageIds: uniqueStrings2([sourcePage.id, ...localModuleBacklinks, ...relatedOutputs.map((page) => page.id)]),
9438
10206
  relatedNodeIds: [],
9439
10207
  relatedSourceIds: uniqueStrings2([
@@ -9449,7 +10217,7 @@ function buildModulePage(input) {
9449
10217
  content: matter5.stringify(body, frontmatter)
9450
10218
  };
9451
10219
  }
9452
- function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHashes, schemaHash, metadata, relativePath, relatedOutputs = [], decorations) {
10220
+ function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHashes, sourceSemanticHashes, schemaHash, metadata, relativePath, relatedOutputs = [], decorations) {
9453
10221
  const slug = slugify(name);
9454
10222
  const pageId = `${kind}:${slug}`;
9455
10223
  const sourceIds = sourceAnalyses.map((item) => item.sourceId);
@@ -9473,7 +10241,7 @@ function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHash
9473
10241
  managed_by: metadata.managedBy,
9474
10242
  backlinks: otherPages,
9475
10243
  schema_hash: schemaHash,
9476
- source_hashes: sourceHashes
10244
+ ...sourceHashFrontmatter(sourceHashes, sourceSemanticHashes)
9477
10245
  };
9478
10246
  const body = [
9479
10247
  `# ${name}`,
@@ -9511,6 +10279,7 @@ function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHash
9511
10279
  backlinks: otherPages,
9512
10280
  schemaHash,
9513
10281
  sourceHashes,
10282
+ sourceSemanticHashes,
9514
10283
  relatedPageIds: relatedOutputs.map((page) => page.id),
9515
10284
  relatedNodeIds: [],
9516
10285
  relatedSourceIds: [],
@@ -9551,6 +10320,7 @@ function buildIndexPage(pages, schemaHash, metadata, projectPages = []) {
9551
10320
  "backlinks: []",
9552
10321
  `schema_hash: ${schemaHash}`,
9553
10322
  "source_hashes: {}",
10323
+ "source_semantic_hashes: {}",
9554
10324
  "---",
9555
10325
  "",
9556
10326
  "# SwarmVault Index",
@@ -9614,7 +10384,8 @@ function buildSectionIndex(kind, pages, schemaHash, metadata, projectIds = []) {
9614
10384
  managed_by: metadata.managedBy,
9615
10385
  backlinks: [],
9616
10386
  schema_hash: schemaHash,
9617
- source_hashes: {}
10387
+ source_hashes: {},
10388
+ source_semantic_hashes: {}
9618
10389
  }
9619
10390
  );
9620
10391
  }
@@ -9910,6 +10681,7 @@ function buildGraphReportPage(input) {
9910
10681
  backlinks: [],
9911
10682
  schema_hash: input.schemaHash,
9912
10683
  source_hashes: {},
10684
+ source_semantic_hashes: {},
9913
10685
  related_page_ids: relatedPageIds,
9914
10686
  related_node_ids: relatedNodeIds,
9915
10687
  related_source_ids: relatedSourceIds
@@ -10025,6 +10797,7 @@ function buildGraphReportPage(input) {
10025
10797
  backlinks: [],
10026
10798
  schemaHash: input.schemaHash,
10027
10799
  sourceHashes: {},
10800
+ sourceSemanticHashes: {},
10028
10801
  relatedPageIds,
10029
10802
  relatedNodeIds,
10030
10803
  relatedSourceIds,
@@ -10068,6 +10841,7 @@ function buildCommunitySummaryPage(input) {
10068
10841
  backlinks: ["graph:report"],
10069
10842
  schema_hash: input.schemaHash,
10070
10843
  source_hashes: {},
10844
+ source_semantic_hashes: {},
10071
10845
  related_page_ids: uniqueStrings2(["graph:report", ...communityPageIds]),
10072
10846
  related_node_ids: input.community.nodeIds,
10073
10847
  related_source_ids: relatedSourceIds
@@ -10107,6 +10881,7 @@ function buildCommunitySummaryPage(input) {
10107
10881
  backlinks: ["graph:report"],
10108
10882
  schemaHash: input.schemaHash,
10109
10883
  sourceHashes: {},
10884
+ sourceSemanticHashes: {},
10110
10885
  relatedPageIds: uniqueStrings2(["graph:report", ...communityPageIds]),
10111
10886
  relatedNodeIds: input.community.nodeIds,
10112
10887
  relatedSourceIds,
@@ -10143,7 +10918,8 @@ function buildProjectsIndex(projectPages, schemaHash, metadata) {
10143
10918
  managed_by: metadata.managedBy,
10144
10919
  backlinks: [],
10145
10920
  schema_hash: schemaHash,
10146
- source_hashes: {}
10921
+ source_hashes: {},
10922
+ source_semantic_hashes: {}
10147
10923
  }
10148
10924
  );
10149
10925
  }
@@ -10195,7 +10971,8 @@ function buildProjectIndex(input) {
10195
10971
  managed_by: input.metadata.managedBy,
10196
10972
  backlinks: [],
10197
10973
  schema_hash: input.schemaHash,
10198
- source_hashes: {}
10974
+ source_hashes: {},
10975
+ source_semantic_hashes: {}
10199
10976
  }
10200
10977
  );
10201
10978
  }
@@ -10226,6 +11003,7 @@ function buildOutputPage(input) {
10226
11003
  backlinks,
10227
11004
  schema_hash: input.schemaHash,
10228
11005
  source_hashes: {},
11006
+ source_semantic_hashes: {},
10229
11007
  related_page_ids: relatedPageIds,
10230
11008
  related_node_ids: relatedNodeIds,
10231
11009
  related_source_ids: relatedSourceIds,
@@ -10250,6 +11028,7 @@ function buildOutputPage(input) {
10250
11028
  backlinks,
10251
11029
  schemaHash: input.schemaHash,
10252
11030
  sourceHashes: {},
11031
+ sourceSemanticHashes: {},
10253
11032
  relatedPageIds,
10254
11033
  relatedNodeIds,
10255
11034
  relatedSourceIds,
@@ -10352,6 +11131,7 @@ function buildExploreHubPage(input) {
10352
11131
  backlinks,
10353
11132
  schema_hash: input.schemaHash,
10354
11133
  source_hashes: {},
11134
+ source_semantic_hashes: {},
10355
11135
  related_page_ids: relatedPageIds,
10356
11136
  related_node_ids: relatedNodeIds,
10357
11137
  related_source_ids: relatedSourceIds,
@@ -10376,6 +11156,7 @@ function buildExploreHubPage(input) {
10376
11156
  backlinks,
10377
11157
  schemaHash: input.schemaHash,
10378
11158
  sourceHashes: {},
11159
+ sourceSemanticHashes: {},
10379
11160
  relatedPageIds,
10380
11161
  relatedNodeIds,
10381
11162
  relatedSourceIds,
@@ -10674,6 +11455,9 @@ function normalizeSourceHashes(value) {
10674
11455
  Object.entries(value).filter((entry) => typeof entry[0] === "string" && typeof entry[1] === "string")
10675
11456
  );
10676
11457
  }
11458
+ function normalizeSourceSemanticHashes(value) {
11459
+ return normalizeSourceHashes(value);
11460
+ }
10677
11461
  function normalizePageStatus(value, fallback = "active") {
10678
11462
  return value === "draft" || value === "candidate" || value === "active" || value === "archived" ? value : fallback;
10679
11463
  }
@@ -10802,6 +11586,7 @@ function parseStoredPage(relativePath, content, defaults = {}) {
10802
11586
  backlinks,
10803
11587
  schemaHash: typeof parsed.data.schema_hash === "string" ? parsed.data.schema_hash : "",
10804
11588
  sourceHashes: normalizeSourceHashes(parsed.data.source_hashes),
11589
+ sourceSemanticHashes: normalizeSourceSemanticHashes(parsed.data.source_semantic_hashes),
10805
11590
  relatedPageIds,
10806
11591
  relatedNodeIds,
10807
11592
  relatedSourceIds,
@@ -10855,6 +11640,7 @@ async function loadInsightPages(wikiDir) {
10855
11640
  backlinks,
10856
11641
  schemaHash: typeof parsed.data.schema_hash === "string" ? parsed.data.schema_hash : "",
10857
11642
  sourceHashes: normalizeSourceHashes(parsed.data.source_hashes),
11643
+ sourceSemanticHashes: normalizeSourceSemanticHashes(parsed.data.source_semantic_hashes),
10858
11644
  relatedPageIds,
10859
11645
  relatedNodeIds,
10860
11646
  relatedSourceIds,
@@ -10955,6 +11741,7 @@ async function loadSavedOutputPages(wikiDir) {
10955
11741
  backlinks,
10956
11742
  schemaHash: typeof parsed.data.schema_hash === "string" ? parsed.data.schema_hash : "",
10957
11743
  sourceHashes: normalizeSourceHashes(parsed.data.source_hashes),
11744
+ sourceSemanticHashes: normalizeSourceSemanticHashes(parsed.data.source_semantic_hashes),
10958
11745
  relatedPageIds,
10959
11746
  relatedNodeIds,
10960
11747
  relatedSourceIds,
@@ -12431,11 +13218,13 @@ function aggregateItems(analyses, kind) {
12431
13218
  name: item.name,
12432
13219
  descriptions: [],
12433
13220
  sourceAnalyses: [],
12434
- sourceHashes: {}
13221
+ sourceHashes: {},
13222
+ sourceSemanticHashes: {}
12435
13223
  };
12436
13224
  existing.descriptions.push(item.description);
12437
13225
  existing.sourceAnalyses.push(analysis);
12438
13226
  existing.sourceHashes[analysis.sourceId] = analysis.sourceHash;
13227
+ existing.sourceSemanticHashes[analysis.sourceId] = analysis.semanticHash;
12439
13228
  grouped.set(key, existing);
12440
13229
  }
12441
13230
  }
@@ -12457,6 +13246,7 @@ function emptyGraphPage(input) {
12457
13246
  backlinks: [],
12458
13247
  schemaHash: input.schemaHash,
12459
13248
  sourceHashes: input.sourceHashes,
13249
+ sourceSemanticHashes: input.sourceSemanticHashes ?? {},
12460
13250
  relatedPageIds: [],
12461
13251
  relatedNodeIds: [],
12462
13252
  relatedSourceIds: [],
@@ -12621,6 +13411,7 @@ async function syncVaultArtifacts(rootDir, input) {
12621
13411
  nodeIds: [analysis.code.moduleId, ...analysis.code.symbols.map((symbol) => symbol.id)],
12622
13412
  schemaHash: sourceSchemaHash,
12623
13413
  sourceHashes: { [manifest.sourceId]: manifest.contentHash },
13414
+ sourceSemanticHashes: { [manifest.sourceId]: manifest.semanticHash },
12624
13415
  confidence: 1
12625
13416
  }) : null;
12626
13417
  const preview = emptyGraphPage({
@@ -12639,6 +13430,7 @@ async function syncVaultArtifacts(rootDir, input) {
12639
13430
  ],
12640
13431
  schemaHash: sourceSchemaHash,
12641
13432
  sourceHashes: { [manifest.sourceId]: manifest.contentHash },
13433
+ sourceSemanticHashes: { [manifest.sourceId]: manifest.semanticHash },
12642
13434
  confidence: 1
12643
13435
  });
12644
13436
  const sourceRecord = await buildManagedGraphPage(
@@ -12755,6 +13547,7 @@ async function syncVaultArtifacts(rootDir, input) {
12755
13547
  aggregate.descriptions,
12756
13548
  aggregate.sourceAnalyses,
12757
13549
  aggregate.sourceHashes,
13550
+ aggregate.sourceSemanticHashes,
12758
13551
  schemaHash,
12759
13552
  metadata,
12760
13553
  relativePath,
@@ -13002,6 +13795,7 @@ async function syncVaultArtifacts(rootDir, input) {
13002
13795
  projectConfigHash: projectConfigHash(config),
13003
13796
  analyses: Object.fromEntries(input.analyses.map((analysis) => [analysis.sourceId, analysisSignature(analysis)])),
13004
13797
  sourceHashes: Object.fromEntries(input.manifests.map((manifest) => [manifest.sourceId, manifest.contentHash])),
13798
+ sourceSemanticHashes: Object.fromEntries(input.manifests.map((manifest) => [manifest.sourceId, manifest.semanticHash])),
13005
13799
  sourceProjects: input.sourceProjects,
13006
13800
  outputHashes: input.outputHashes,
13007
13801
  insightHashes: input.insightHashes,
@@ -13467,6 +14261,7 @@ function emptyCompileState() {
13467
14261
  projectConfigHash: "",
13468
14262
  analyses: {},
13469
14263
  sourceHashes: {},
14264
+ sourceSemanticHashes: {},
13470
14265
  sourceProjects: {},
13471
14266
  outputHashes: {},
13472
14267
  insightHashes: {},
@@ -13896,7 +14691,8 @@ async function initVault(rootDir, options = {}) {
13896
14691
  managed_by: "human",
13897
14692
  backlinks: [],
13898
14693
  schema_hash: "",
13899
- source_hashes: {}
14694
+ source_hashes: {},
14695
+ source_semantic_hashes: {}
13900
14696
  }
13901
14697
  )
13902
14698
  );
@@ -13919,7 +14715,8 @@ async function initVault(rootDir, options = {}) {
13919
14715
  managed_by: "system",
13920
14716
  backlinks: [],
13921
14717
  schema_hash: "",
13922
- source_hashes: {}
14718
+ source_hashes: {},
14719
+ source_semantic_hashes: {}
13923
14720
  })
13924
14721
  );
13925
14722
  await writeFileIfChanged(
@@ -13941,7 +14738,8 @@ async function initVault(rootDir, options = {}) {
13941
14738
  managed_by: "system",
13942
14739
  backlinks: [],
13943
14740
  schema_hash: "",
13944
- source_hashes: {}
14741
+ source_hashes: {},
14742
+ source_semantic_hashes: {}
13945
14743
  })
13946
14744
  );
13947
14745
  if (options.obsidian) {
@@ -13982,7 +14780,7 @@ async function compileVault(rootDir, options = {}) {
13982
14780
  );
13983
14781
  const nextProjectConfigHash = projectConfigHash(config);
13984
14782
  const projectConfigChanged = !previousState || previousState.projectConfigHash !== nextProjectConfigHash;
13985
- const previousSourceHashes = previousState?.sourceHashes ?? {};
14783
+ const previousSourceHashes = previousState?.sourceSemanticHashes ?? previousState?.sourceHashes ?? {};
13986
14784
  const previousAnalyses = previousState?.analyses ?? {};
13987
14785
  const previousSourceProjects = previousState?.sourceProjects ?? {};
13988
14786
  const previousOutputHashes = previousState?.outputHashes ?? {};
@@ -13997,7 +14795,7 @@ async function compileVault(rootDir, options = {}) {
13997
14795
  const dirty = [];
13998
14796
  const clean = [];
13999
14797
  for (const manifest of manifests) {
14000
- const hashChanged = previousSourceHashes[manifest.sourceId] !== manifest.contentHash;
14798
+ const hashChanged = previousSourceHashes[manifest.sourceId] !== manifest.semanticHash;
14001
14799
  const noAnalysis = !previousAnalyses[manifest.sourceId];
14002
14800
  const projectId = sourceProjects[manifest.sourceId] ?? null;
14003
14801
  const projectChanged = (previousSourceProjects[manifest.sourceId] ?? null) !== projectId;
@@ -14707,9 +15505,11 @@ function structuralLintFindings(_rootDir, paths, graph, schemas, manifests, sour
14707
15505
  relatedPageIds: [page.id]
14708
15506
  });
14709
15507
  }
14710
- for (const [sourceId, knownHash] of Object.entries(page.sourceHashes)) {
15508
+ const freshnessHashes = Object.keys(page.sourceSemanticHashes).length ? page.sourceSemanticHashes : page.sourceHashes;
15509
+ for (const [sourceId, knownHash] of Object.entries(freshnessHashes)) {
14711
15510
  const manifest = manifestMap.get(sourceId);
14712
- if (manifest && manifest.contentHash !== knownHash) {
15511
+ const manifestHash = manifest?.semanticHash ?? manifest?.contentHash;
15512
+ if (manifestHash && manifestHash !== knownHash) {
14713
15513
  findings.push({
14714
15514
  severity: "warning",
14715
15515
  code: "stale_page",
@@ -14848,7 +15648,7 @@ async function bootstrapDemo(rootDir, input) {
14848
15648
  }
14849
15649
 
14850
15650
  // src/mcp.ts
14851
- var SERVER_VERSION = "0.2.1";
15651
+ var SERVER_VERSION = "0.3.0";
14852
15652
  async function createMcpServer(rootDir) {
14853
15653
  const server = new McpServer({
14854
15654
  name: "swarmvault",
@@ -15026,8 +15826,8 @@ async function createMcpServer(rootDir) {
15026
15826
  }
15027
15827
  },
15028
15828
  async ({ input }) => {
15029
- const manifest = await ingestInput(rootDir, input);
15030
- return asToolText(manifest);
15829
+ const result = await ingestInputDetailed(rootDir, input);
15830
+ return asToolText(result);
15031
15831
  }
15032
15832
  );
15033
15833
  server.registerTool(
@@ -15831,12 +16631,11 @@ async function syncCrawlSource(rootDir, entry, options) {
15831
16631
  let updatedCount = 0;
15832
16632
  for (const pageUrl of crawl.pages) {
15833
16633
  const persisted = await ingestInputDetailed(rootDir, pageUrl);
15834
- currentSourceIds.push(persisted.manifest.sourceId);
15835
- if (persisted.isNew) {
15836
- importedCount += 1;
15837
- } else if (persisted.wasUpdated) {
15838
- updatedCount += 1;
15839
- }
16634
+ currentSourceIds.push(...persisted.created.map((manifest) => manifest.sourceId));
16635
+ currentSourceIds.push(...persisted.updated.map((manifest) => manifest.sourceId));
16636
+ currentSourceIds.push(...persisted.unchanged.map((manifest) => manifest.sourceId));
16637
+ importedCount += persisted.created.length;
16638
+ updatedCount += persisted.updated.length;
15840
16639
  }
15841
16640
  let removedCount = 0;
15842
16641
  for (const sourceId of previousSourceIds) {
@@ -16221,6 +17020,124 @@ import { promisify } from "util";
16221
17020
  import matter10 from "gray-matter";
16222
17021
  import mime2 from "mime-types";
16223
17022
 
17023
+ // src/graph-presentation.ts
17024
+ var OVERVIEW_THRESHOLD = 5e3;
17025
+ var OVERVIEW_NODE_BUDGET = 1500;
17026
+ function nodePriority(node, pinnedNodeIds) {
17027
+ return [pinnedNodeIds.has(node.id) ? 0 : 1, -(node.degree ?? 0), -(node.bridgeScore ?? 0), node.label, node.id];
17028
+ }
17029
+ function compareTuples(left, right) {
17030
+ const length = Math.max(left.length, right.length);
17031
+ for (let index = 0; index < length; index += 1) {
17032
+ const leftValue = left[index];
17033
+ const rightValue = right[index];
17034
+ if (leftValue === rightValue) {
17035
+ continue;
17036
+ }
17037
+ if (typeof leftValue === "number" && typeof rightValue === "number") {
17038
+ return leftValue - rightValue;
17039
+ }
17040
+ return String(leftValue ?? "").localeCompare(String(rightValue ?? ""));
17041
+ }
17042
+ return 0;
17043
+ }
17044
+ function survivingHyperedges(hyperedges, sampledNodeIds) {
17045
+ return hyperedges.filter((hyperedge) => hyperedge.nodeIds.filter((nodeId) => sampledNodeIds.has(nodeId)).length >= 2);
17046
+ }
17047
+ function pinnedNodeIdsForReport(report) {
17048
+ if (!report) {
17049
+ return /* @__PURE__ */ new Set();
17050
+ }
17051
+ return /* @__PURE__ */ new Set([
17052
+ ...report.godNodes.map((node) => node.nodeId),
17053
+ ...report.bridgeNodes.map((node) => node.nodeId),
17054
+ ...report.surprisingConnections.flatMap((connection) => [connection.sourceNodeId, connection.targetNodeId])
17055
+ ]);
17056
+ }
17057
+ function sampleGraphNodes(graph, report, nodeBudget = OVERVIEW_NODE_BUDGET) {
17058
+ const pinned = pinnedNodeIdsForReport(report);
17059
+ const nodeById2 = new Map(graph.nodes.map((node) => [node.id, node]));
17060
+ const selected = new Set([...pinned].filter((nodeId) => nodeById2.has(nodeId)));
17061
+ const sortedCommunities2 = [...graph.communities ?? []].sort((left, right) => {
17062
+ const leftNodes = left.nodeIds.map((nodeId) => nodeById2.get(nodeId)).filter((node) => Boolean(node));
17063
+ const rightNodes = right.nodeIds.map((nodeId) => nodeById2.get(nodeId)).filter((node) => Boolean(node));
17064
+ const leftFirstParty = leftNodes.filter((node) => node.sourceClass === "first_party").length;
17065
+ const rightFirstParty = rightNodes.filter((node) => node.sourceClass === "first_party").length;
17066
+ return compareTuples(
17067
+ [-leftFirstParty, -leftNodes.length, left.label, left.id],
17068
+ [-rightFirstParty, -rightNodes.length, right.label, right.id]
17069
+ );
17070
+ });
17071
+ for (const community of sortedCommunities2) {
17072
+ const communityNodes = community.nodeIds.map((nodeId) => nodeById2.get(nodeId)).filter((node) => Boolean(node)).sort((left, right) => compareTuples(nodePriority(left, pinned), nodePriority(right, pinned)));
17073
+ for (const node of communityNodes) {
17074
+ if (selected.size >= nodeBudget && !pinned.has(node.id)) {
17075
+ break;
17076
+ }
17077
+ selected.add(node.id);
17078
+ }
17079
+ if (selected.size >= nodeBudget) {
17080
+ break;
17081
+ }
17082
+ }
17083
+ if (selected.size < nodeBudget) {
17084
+ for (const node of [...graph.nodes].sort((left, right) => compareTuples(nodePriority(left, pinned), nodePriority(right, pinned)))) {
17085
+ if (selected.size >= nodeBudget && !pinned.has(node.id)) {
17086
+ break;
17087
+ }
17088
+ selected.add(node.id);
17089
+ }
17090
+ }
17091
+ return selected;
17092
+ }
17093
+ function buildViewerGraphArtifact(graph, options = {}) {
17094
+ const threshold = options.threshold ?? OVERVIEW_THRESHOLD;
17095
+ const nodeBudget = options.nodeBudget ?? OVERVIEW_NODE_BUDGET;
17096
+ const totalCommunities = graph.communities?.length ?? 0;
17097
+ if (options.full || graph.nodes.length <= threshold) {
17098
+ return {
17099
+ ...graph,
17100
+ presentation: {
17101
+ mode: "full",
17102
+ threshold,
17103
+ nodeBudget,
17104
+ totalNodes: graph.nodes.length,
17105
+ displayedNodes: graph.nodes.length,
17106
+ totalEdges: graph.edges.length,
17107
+ displayedEdges: graph.edges.length,
17108
+ totalCommunities,
17109
+ displayedCommunities: totalCommunities
17110
+ }
17111
+ };
17112
+ }
17113
+ const sampledNodeIds = sampleGraphNodes(graph, options.report, nodeBudget);
17114
+ const nodes = graph.nodes.filter((node) => sampledNodeIds.has(node.id));
17115
+ const edges = graph.edges.filter((edge) => sampledNodeIds.has(edge.source) && sampledNodeIds.has(edge.target));
17116
+ const hyperedges = survivingHyperedges(graph.hyperedges ?? [], sampledNodeIds);
17117
+ const communities = (graph.communities ?? []).map((community) => ({
17118
+ ...community,
17119
+ nodeIds: community.nodeIds.filter((nodeId) => sampledNodeIds.has(nodeId))
17120
+ })).filter((community) => community.nodeIds.length > 0);
17121
+ return {
17122
+ ...graph,
17123
+ nodes,
17124
+ edges,
17125
+ hyperedges,
17126
+ communities,
17127
+ presentation: {
17128
+ mode: "overview",
17129
+ threshold,
17130
+ nodeBudget,
17131
+ totalNodes: graph.nodes.length,
17132
+ displayedNodes: nodes.length,
17133
+ totalEdges: graph.edges.length,
17134
+ displayedEdges: edges.length,
17135
+ totalCommunities,
17136
+ displayedCommunities: communities.length
17137
+ }
17138
+ };
17139
+ }
17140
+
16224
17141
  // src/watch.ts
16225
17142
  import path26 from "path";
16226
17143
  import process3 from "process";
@@ -16686,7 +17603,7 @@ async function ensureViewerDist(viewerDistDir) {
16686
17603
  await execFileAsync("pnpm", ["build"], { cwd: viewerProjectDir });
16687
17604
  }
16688
17605
  }
16689
- async function startGraphServer(rootDir, port) {
17606
+ async function startGraphServer(rootDir, port, options = {}) {
16690
17607
  const { config, paths } = await loadVaultConfig(rootDir);
16691
17608
  const effectivePort = port ?? config.viewer.port;
16692
17609
  await ensureViewerDist(paths.viewerDistDir);
@@ -16698,8 +17615,16 @@ async function startGraphServer(rootDir, port) {
16698
17615
  response.end(JSON.stringify({ error: "Graph artifact not found. Run `swarmvault compile` first." }));
16699
17616
  return;
16700
17617
  }
17618
+ const graph = await readJsonFile(paths.graphPath);
17619
+ if (!graph) {
17620
+ response.writeHead(404, { "content-type": "application/json" });
17621
+ response.end(JSON.stringify({ error: "Graph artifact not found. Run `swarmvault compile` first." }));
17622
+ return;
17623
+ }
17624
+ const reportPath = path27.join(paths.wikiDir, "graph", "report.json");
17625
+ const report = await readJsonFile(reportPath) ?? null;
16701
17626
  response.writeHead(200, { "content-type": "application/json" });
16702
- response.end(await fs22.readFile(paths.graphPath, "utf8"));
17627
+ response.end(JSON.stringify(buildViewerGraphArtifact(graph, { report, full: options.full ?? false })));
16703
17628
  return;
16704
17629
  }
16705
17630
  if (url.pathname === "/api/graph/query") {
@@ -16875,7 +17800,7 @@ async function startGraphServer(rootDir, port) {
16875
17800
  }
16876
17801
  };
16877
17802
  }
16878
- async function exportGraphHtml(rootDir, outputPath) {
17803
+ async function exportGraphHtml(rootDir, outputPath, options = {}) {
16879
17804
  const { paths } = await loadVaultConfig(rootDir);
16880
17805
  const graph = await readJsonFile(paths.graphPath);
16881
17806
  if (!graph) {
@@ -16919,7 +17844,11 @@ async function exportGraphHtml(rootDir, outputPath) {
16919
17844
  const script = await fs22.readFile(scriptPath, "utf8");
16920
17845
  const style = stylePath && await fileExists(stylePath) ? await fs22.readFile(stylePath, "utf8") : "";
16921
17846
  const report = await readJsonFile(path27.join(paths.wikiDir, "graph", "report.json"));
16922
- const embeddedData = JSON.stringify({ graph, pages: pages.filter(Boolean), report }, null, 2).replace(/</g, "\\u003c");
17847
+ const embeddedData = JSON.stringify(
17848
+ { graph: buildViewerGraphArtifact(graph, { report, full: options.full ?? false }), pages: pages.filter(Boolean), report },
17849
+ null,
17850
+ 2
17851
+ ).replace(/</g, "\\u003c");
16923
17852
  const html = [
16924
17853
  "<!doctype html>",
16925
17854
  '<html lang="en">',
@@ -16968,6 +17897,7 @@ export {
16968
17897
  importInbox,
16969
17898
  ingestDirectory,
16970
17899
  ingestInput,
17900
+ ingestInputDetailed,
16971
17901
  initVault,
16972
17902
  initWorkspace,
16973
17903
  installAgent,