@swarmvaultai/engine 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/index.d.ts +30 -5
- package/dist/index.js +1071 -141
- package/dist/viewer/assets/index-BHjjw4rU.css +1 -0
- package/dist/viewer/assets/{index-Csm8eB3P.js → index-DxKn2KOc.js} +23 -23
- package/dist/viewer/index.html +2 -2
- package/dist/viewer/lib.d.ts +11 -0
- package/package.json +3 -1
- package/dist/viewer/assets/index-DUJ6MWHL.css +0 -1
package/dist/index.js
CHANGED
|
@@ -1729,7 +1729,7 @@ import matter3 from "gray-matter";
|
|
|
1729
1729
|
import ignore from "ignore";
|
|
1730
1730
|
import { JSDOM as JSDOM2 } from "jsdom";
|
|
1731
1731
|
import mime from "mime-types";
|
|
1732
|
-
import
|
|
1732
|
+
import TurndownService2 from "turndown";
|
|
1733
1733
|
|
|
1734
1734
|
// src/code-analysis.ts
|
|
1735
1735
|
import fs6 from "fs/promises";
|
|
@@ -4481,9 +4481,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
|
|
|
4481
4481
|
const language = manifest.language ?? inferCodeLanguage(manifest.originalPath ?? manifest.storedPath, manifest.mimeType) ?? "typescript";
|
|
4482
4482
|
const { code, rationales } = language === "javascript" || language === "jsx" || language === "typescript" || language === "tsx" ? analyzeTypeScriptLikeCode(manifest, extractedText) : await analyzeTreeSitterCode(manifest, extractedText, language);
|
|
4483
4483
|
return {
|
|
4484
|
-
analysisVersion:
|
|
4484
|
+
analysisVersion: 7,
|
|
4485
4485
|
sourceId: manifest.sourceId,
|
|
4486
4486
|
sourceHash: manifest.contentHash,
|
|
4487
|
+
semanticHash: manifest.semanticHash,
|
|
4487
4488
|
extractionHash: manifest.extractionHash,
|
|
4488
4489
|
schemaHash,
|
|
4489
4490
|
title: manifest.title,
|
|
@@ -4503,8 +4504,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
|
|
|
4503
4504
|
import fs7 from "fs/promises";
|
|
4504
4505
|
import os from "os";
|
|
4505
4506
|
import path7 from "path";
|
|
4507
|
+
import { parse as parseCsvSync } from "csv-parse/sync";
|
|
4506
4508
|
import { strFromU8, unzipSync } from "fflate";
|
|
4507
4509
|
import { JSDOM } from "jsdom";
|
|
4510
|
+
import TurndownService from "turndown";
|
|
4508
4511
|
import { z } from "zod";
|
|
4509
4512
|
var imageVisionExtractionSchema = z.object({
|
|
4510
4513
|
title: z.string().min(1).nullable().optional(),
|
|
@@ -4684,7 +4687,7 @@ function normalizePdfMetadata(raw) {
|
|
|
4684
4687
|
function normalizeDocumentText(raw) {
|
|
4685
4688
|
return raw.replace(/\r\n/g, "\n").split(/\n{2,}/).map((section) => normalizeWhitespace(section)).filter(Boolean).join("\n\n").trim();
|
|
4686
4689
|
}
|
|
4687
|
-
function
|
|
4690
|
+
function parseOfficeCoreMetadata(bytes) {
|
|
4688
4691
|
try {
|
|
4689
4692
|
const archive = unzipSync(new Uint8Array(bytes));
|
|
4690
4693
|
const coreXml = archive["docProps/core.xml"];
|
|
@@ -4724,6 +4727,122 @@ function parseDocxCoreMetadata(bytes) {
|
|
|
4724
4727
|
return void 0;
|
|
4725
4728
|
}
|
|
4726
4729
|
}
|
|
4730
|
+
function decodeTextBytes(bytes) {
|
|
4731
|
+
const text = bytes.toString("utf8");
|
|
4732
|
+
return text.charCodeAt(0) === 65279 ? text.slice(1) : text;
|
|
4733
|
+
}
|
|
4734
|
+
function normalizeTableCell(value) {
|
|
4735
|
+
return normalizeWhitespace(String(value ?? ""));
|
|
4736
|
+
}
|
|
4737
|
+
function isNumericCell(value) {
|
|
4738
|
+
return value.length > 0 && Number.isFinite(Number(value));
|
|
4739
|
+
}
|
|
4740
|
+
function detectHeaderRow(rows) {
|
|
4741
|
+
if (!rows.length) {
|
|
4742
|
+
return { headers: [], bodyRows: [] };
|
|
4743
|
+
}
|
|
4744
|
+
const firstRow = rows[0] ?? [];
|
|
4745
|
+
const nonEmpty = firstRow.filter(Boolean);
|
|
4746
|
+
const unique = new Set(nonEmpty);
|
|
4747
|
+
const nonNumeric = nonEmpty.filter((value) => !isNumericCell(value));
|
|
4748
|
+
const looksLikeHeader = nonEmpty.length > 0 && unique.size === nonEmpty.length && nonNumeric.length >= Math.ceil(nonEmpty.length / 2) && rows.length > 1;
|
|
4749
|
+
if (looksLikeHeader) {
|
|
4750
|
+
return {
|
|
4751
|
+
headers: firstRow.map((value, index) => value || `column_${index + 1}`),
|
|
4752
|
+
bodyRows: rows.slice(1)
|
|
4753
|
+
};
|
|
4754
|
+
}
|
|
4755
|
+
const columnCount = Math.max(...rows.map((row) => row.length), 0);
|
|
4756
|
+
return {
|
|
4757
|
+
headers: Array.from({ length: columnCount }, (_, index) => `column_${index + 1}`),
|
|
4758
|
+
bodyRows: rows
|
|
4759
|
+
};
|
|
4760
|
+
}
|
|
4761
|
+
function columnHints(headers, rows) {
|
|
4762
|
+
return headers.map((header, index) => {
|
|
4763
|
+
const values = rows.map((row) => row[index] ?? "").map(normalizeTableCell).filter(Boolean);
|
|
4764
|
+
if (!values.length) {
|
|
4765
|
+
return null;
|
|
4766
|
+
}
|
|
4767
|
+
const uniqueValues = [...new Set(values)];
|
|
4768
|
+
if (values.every(isNumericCell)) {
|
|
4769
|
+
return `- ${header}: numeric`;
|
|
4770
|
+
}
|
|
4771
|
+
if (uniqueValues.length <= 6 && values.length >= uniqueValues.length) {
|
|
4772
|
+
return `- ${header}: low-cardinality (${uniqueValues.slice(0, 6).join(", ")})`;
|
|
4773
|
+
}
|
|
4774
|
+
return null;
|
|
4775
|
+
}).filter((item) => Boolean(item));
|
|
4776
|
+
}
|
|
4777
|
+
function markdownTable(headers, rows, rowLimit = 20) {
|
|
4778
|
+
if (!headers.length) {
|
|
4779
|
+
return ["No tabular preview available."];
|
|
4780
|
+
}
|
|
4781
|
+
const width = headers.length;
|
|
4782
|
+
const lines = [`| ${headers.join(" | ")} |`, `| ${headers.map(() => "---").join(" | ")} |`];
|
|
4783
|
+
for (const row of rows.slice(0, rowLimit)) {
|
|
4784
|
+
const normalized = Array.from({ length: width }, (_, index) => normalizeTableCell(row[index] ?? ""));
|
|
4785
|
+
lines.push(`| ${normalized.join(" | ")} |`);
|
|
4786
|
+
}
|
|
4787
|
+
return lines;
|
|
4788
|
+
}
|
|
4789
|
+
function zipEntryText(archive, entryPath) {
|
|
4790
|
+
const entry = archive[entryPath];
|
|
4791
|
+
return entry ? strFromU8(entry) : void 0;
|
|
4792
|
+
}
|
|
4793
|
+
function parseXmlDocument(xml) {
|
|
4794
|
+
return new JSDOM(xml, { contentType: "text/xml" }).window.document;
|
|
4795
|
+
}
|
|
4796
|
+
function zipDirname(value) {
|
|
4797
|
+
const index = value.lastIndexOf("/");
|
|
4798
|
+
return index === -1 ? "" : value.slice(0, index);
|
|
4799
|
+
}
|
|
4800
|
+
function resolveZipTarget(basePath, target) {
|
|
4801
|
+
return path7.posix.normalize(path7.posix.join(zipDirname(basePath), target));
|
|
4802
|
+
}
|
|
4803
|
+
function relationshipTargets(xml, basePath) {
|
|
4804
|
+
const document = parseXmlDocument(xml);
|
|
4805
|
+
const map = /* @__PURE__ */ new Map();
|
|
4806
|
+
for (const node of Array.from(document.getElementsByTagName("*"))) {
|
|
4807
|
+
if (node.localName !== "Relationship") {
|
|
4808
|
+
continue;
|
|
4809
|
+
}
|
|
4810
|
+
const id = node.getAttribute("Id")?.trim();
|
|
4811
|
+
const target = node.getAttribute("Target")?.trim();
|
|
4812
|
+
const type = node.getAttribute("Type")?.trim() ?? "";
|
|
4813
|
+
if (!id || !target) {
|
|
4814
|
+
continue;
|
|
4815
|
+
}
|
|
4816
|
+
map.set(id, { target: resolveZipTarget(basePath, target), type });
|
|
4817
|
+
}
|
|
4818
|
+
return map;
|
|
4819
|
+
}
|
|
4820
|
+
function xmlTextNodes(xml, localName) {
|
|
4821
|
+
const document = parseXmlDocument(xml);
|
|
4822
|
+
const values = [];
|
|
4823
|
+
for (const node of Array.from(document.getElementsByTagName("*"))) {
|
|
4824
|
+
if (node.localName !== localName) {
|
|
4825
|
+
continue;
|
|
4826
|
+
}
|
|
4827
|
+
const text = normalizeWhitespace(node.textContent ?? "");
|
|
4828
|
+
if (text) {
|
|
4829
|
+
values.push(text);
|
|
4830
|
+
}
|
|
4831
|
+
}
|
|
4832
|
+
return values;
|
|
4833
|
+
}
|
|
4834
|
+
function firstHtmlHeading(html) {
|
|
4835
|
+
const dom = new JSDOM(html);
|
|
4836
|
+
const heading = dom.window.document.querySelector("h1, h2, h3");
|
|
4837
|
+
const title = normalizeWhitespace(heading?.textContent ?? "");
|
|
4838
|
+
return title || void 0;
|
|
4839
|
+
}
|
|
4840
|
+
function htmlToMarkdown(html) {
|
|
4841
|
+
const dom = new JSDOM(html);
|
|
4842
|
+
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
4843
|
+
const body = dom.window.document.body?.innerHTML ?? html;
|
|
4844
|
+
return turndown.turndown(body).trim();
|
|
4845
|
+
}
|
|
4727
4846
|
async function extractPdfText(input) {
|
|
4728
4847
|
try {
|
|
4729
4848
|
const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
@@ -4781,7 +4900,7 @@ async function extractDocxText(input) {
|
|
|
4781
4900
|
const warnings = result.messages.map((message) => normalizeWhitespace(message.message)).filter(Boolean).map((message) => truncate(message, 240));
|
|
4782
4901
|
const artifact = {
|
|
4783
4902
|
...extractionMetadata("docx", input.mimeType, "docx_text"),
|
|
4784
|
-
metadata:
|
|
4903
|
+
metadata: parseOfficeCoreMetadata(input.bytes),
|
|
4785
4904
|
warnings: warnings.length ? warnings : void 0
|
|
4786
4905
|
};
|
|
4787
4906
|
if (!extractedText) {
|
|
@@ -4800,6 +4919,258 @@ async function extractDocxText(input) {
|
|
|
4800
4919
|
};
|
|
4801
4920
|
}
|
|
4802
4921
|
}
|
|
4922
|
+
async function extractCsvText(input) {
|
|
4923
|
+
try {
|
|
4924
|
+
const rawText = decodeTextBytes(input.bytes);
|
|
4925
|
+
const delimiter = input.fileName?.toLowerCase().endsWith(".tsv") || input.mimeType.includes("tab-separated") ? " " : ",";
|
|
4926
|
+
const parsed = parseCsvSync(rawText, {
|
|
4927
|
+
delimiter,
|
|
4928
|
+
relax_column_count: true,
|
|
4929
|
+
skip_empty_lines: true,
|
|
4930
|
+
trim: true
|
|
4931
|
+
});
|
|
4932
|
+
const rows = parsed.map((row) => row.map((value) => normalizeTableCell(value)));
|
|
4933
|
+
const { headers, bodyRows } = detectHeaderRow(rows);
|
|
4934
|
+
const hintLines = columnHints(headers, bodyRows);
|
|
4935
|
+
const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
|
|
4936
|
+
const extractedText = [
|
|
4937
|
+
title ? `# ${title}` : null,
|
|
4938
|
+
`Format: ${delimiter === " " ? "TSV" : "CSV"}`,
|
|
4939
|
+
`Rows: ${bodyRows.length}`,
|
|
4940
|
+
`Columns: ${headers.length}`,
|
|
4941
|
+
headers.length ? `Headers: ${headers.join(", ")}` : null,
|
|
4942
|
+
"",
|
|
4943
|
+
hintLines.length ? "## Column Hints" : null,
|
|
4944
|
+
hintLines.length ? hintLines.join("\n") : null,
|
|
4945
|
+
hintLines.length ? "" : null,
|
|
4946
|
+
"## Preview",
|
|
4947
|
+
...markdownTable(headers, bodyRows)
|
|
4948
|
+
].filter((item) => Boolean(item)).join("\n").trim();
|
|
4949
|
+
const artifact = {
|
|
4950
|
+
...extractionMetadata("csv", input.mimeType, "csv_text"),
|
|
4951
|
+
metadata: {
|
|
4952
|
+
format: delimiter === " " ? "tsv" : "csv",
|
|
4953
|
+
row_count: String(bodyRows.length),
|
|
4954
|
+
column_count: String(headers.length),
|
|
4955
|
+
headers: headers.join(", ")
|
|
4956
|
+
}
|
|
4957
|
+
};
|
|
4958
|
+
return {
|
|
4959
|
+
title,
|
|
4960
|
+
extractedText,
|
|
4961
|
+
artifact
|
|
4962
|
+
};
|
|
4963
|
+
} catch (error) {
|
|
4964
|
+
return {
|
|
4965
|
+
artifact: {
|
|
4966
|
+
...extractionMetadata("csv", input.mimeType, "csv_text"),
|
|
4967
|
+
warnings: [`CSV extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
4968
|
+
}
|
|
4969
|
+
};
|
|
4970
|
+
}
|
|
4971
|
+
}
|
|
4972
|
+
async function extractXlsxText(input) {
|
|
4973
|
+
try {
|
|
4974
|
+
const XLSX = await import("xlsx");
|
|
4975
|
+
const workbook = XLSX.read(input.bytes, { type: "buffer", cellFormula: false, cellHTML: false, cellStyles: false });
|
|
4976
|
+
const allSheetNames = workbook.SheetNames;
|
|
4977
|
+
const sheetNames = allSheetNames.slice(0, 10);
|
|
4978
|
+
const sheetSections = [];
|
|
4979
|
+
const metadata = {
|
|
4980
|
+
sheet_count: String(allSheetNames.length),
|
|
4981
|
+
sheet_names: allSheetNames.join(", ")
|
|
4982
|
+
};
|
|
4983
|
+
for (const sheetName of sheetNames) {
|
|
4984
|
+
const sheet = workbook.Sheets[sheetName];
|
|
4985
|
+
if (!sheet) {
|
|
4986
|
+
continue;
|
|
4987
|
+
}
|
|
4988
|
+
const rows = XLSX.utils.sheet_to_json(sheet, {
|
|
4989
|
+
header: 1,
|
|
4990
|
+
raw: false,
|
|
4991
|
+
defval: ""
|
|
4992
|
+
}).map((row) => row.map((value) => normalizeTableCell(value)));
|
|
4993
|
+
const { headers, bodyRows } = detectHeaderRow(rows);
|
|
4994
|
+
sheetSections.push(`## Sheet: ${sheetName}`);
|
|
4995
|
+
sheetSections.push(`Rows: ${bodyRows.length}`);
|
|
4996
|
+
sheetSections.push(`Columns: ${headers.length}`);
|
|
4997
|
+
sheetSections.push(...markdownTable(headers, bodyRows));
|
|
4998
|
+
sheetSections.push("");
|
|
4999
|
+
}
|
|
5000
|
+
const title = normalizeWhitespace(String(workbook.Props?.Title ?? "")) || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
|
|
5001
|
+
const extractedText = [
|
|
5002
|
+
title ? `# ${title}` : null,
|
|
5003
|
+
`Sheets: ${allSheetNames.length}`,
|
|
5004
|
+
allSheetNames.length ? `Sheet Names: ${allSheetNames.join(", ")}` : null,
|
|
5005
|
+
"",
|
|
5006
|
+
...sheetSections
|
|
5007
|
+
].filter((item) => Boolean(item)).join("\n").trim();
|
|
5008
|
+
const warnings = allSheetNames.length > sheetNames.length ? ["Workbook preview truncated to the first 10 sheets."] : void 0;
|
|
5009
|
+
return {
|
|
5010
|
+
title,
|
|
5011
|
+
extractedText,
|
|
5012
|
+
artifact: {
|
|
5013
|
+
...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
|
|
5014
|
+
metadata,
|
|
5015
|
+
warnings
|
|
5016
|
+
}
|
|
5017
|
+
};
|
|
5018
|
+
} catch (error) {
|
|
5019
|
+
return {
|
|
5020
|
+
artifact: {
|
|
5021
|
+
...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
|
|
5022
|
+
warnings: [`XLSX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
5023
|
+
}
|
|
5024
|
+
};
|
|
5025
|
+
}
|
|
5026
|
+
}
|
|
5027
|
+
async function extractPptxText(input) {
|
|
5028
|
+
try {
|
|
5029
|
+
const archive = unzipSync(new Uint8Array(input.bytes));
|
|
5030
|
+
const presentationXml = zipEntryText(archive, "ppt/presentation.xml");
|
|
5031
|
+
if (!presentationXml) {
|
|
5032
|
+
throw new Error("Missing ppt/presentation.xml");
|
|
5033
|
+
}
|
|
5034
|
+
const relsXml = zipEntryText(archive, "ppt/_rels/presentation.xml.rels");
|
|
5035
|
+
if (!relsXml) {
|
|
5036
|
+
throw new Error("Missing ppt/_rels/presentation.xml.rels");
|
|
5037
|
+
}
|
|
5038
|
+
const rels = relationshipTargets(relsXml, "ppt/presentation.xml");
|
|
5039
|
+
const document = parseXmlDocument(presentationXml);
|
|
5040
|
+
const slideTargets = Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").map((node) => node.getAttribute("r:id")?.trim()).filter((value) => Boolean(value)).map((relationshipId) => rels.get(relationshipId)?.target).filter((value) => Boolean(value)).slice(0, 60);
|
|
5041
|
+
const slideSections = [];
|
|
5042
|
+
for (let index = 0; index < slideTargets.length; index += 1) {
|
|
5043
|
+
const slidePath = slideTargets[index];
|
|
5044
|
+
const slideXml = zipEntryText(archive, slidePath);
|
|
5045
|
+
if (!slideXml) {
|
|
5046
|
+
continue;
|
|
5047
|
+
}
|
|
5048
|
+
const slideTexts = xmlTextNodes(slideXml, "t");
|
|
5049
|
+
const slideTitle = slideTexts[0] ?? `Slide ${index + 1}`;
|
|
5050
|
+
slideSections.push(`## Slide ${index + 1}: ${slideTitle}`);
|
|
5051
|
+
if (slideTexts.length) {
|
|
5052
|
+
slideSections.push(slideTexts.join("\n"));
|
|
5053
|
+
}
|
|
5054
|
+
const slideRelsPath = `${zipDirname(slidePath)}/_rels/${path7.posix.basename(slidePath)}.rels`;
|
|
5055
|
+
const slideRelsXml = zipEntryText(archive, slideRelsPath);
|
|
5056
|
+
if (slideRelsXml) {
|
|
5057
|
+
const slideRels = relationshipTargets(slideRelsXml, slidePath);
|
|
5058
|
+
const notesTarget = [...slideRels.values()].find((entry) => entry.type.endsWith("/notesSlide"))?.target;
|
|
5059
|
+
if (notesTarget) {
|
|
5060
|
+
const notesXml = zipEntryText(archive, notesTarget);
|
|
5061
|
+
const noteTexts = notesXml ? xmlTextNodes(notesXml, "t") : [];
|
|
5062
|
+
if (noteTexts.length) {
|
|
5063
|
+
slideSections.push("Notes:");
|
|
5064
|
+
slideSections.push(noteTexts.join("\n"));
|
|
5065
|
+
}
|
|
5066
|
+
}
|
|
5067
|
+
}
|
|
5068
|
+
slideSections.push("");
|
|
5069
|
+
}
|
|
5070
|
+
const metadata = parseOfficeCoreMetadata(input.bytes);
|
|
5071
|
+
const title = metadata?.title || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
|
|
5072
|
+
const extractedText = [title ? `# ${title}` : null, `Slides: ${slideTargets.length}`, "", ...slideSections].filter((item) => Boolean(item)).join("\n").trim();
|
|
5073
|
+
return {
|
|
5074
|
+
title,
|
|
5075
|
+
extractedText,
|
|
5076
|
+
artifact: {
|
|
5077
|
+
...extractionMetadata("pptx", input.mimeType, "pptx_text"),
|
|
5078
|
+
metadata: {
|
|
5079
|
+
...metadata ?? {},
|
|
5080
|
+
slide_count: String(slideTargets.length)
|
|
5081
|
+
},
|
|
5082
|
+
warnings: Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").length > slideTargets.length ? ["Slide extraction truncated to the first 60 slides."] : void 0
|
|
5083
|
+
}
|
|
5084
|
+
};
|
|
5085
|
+
} catch (error) {
|
|
5086
|
+
return {
|
|
5087
|
+
artifact: {
|
|
5088
|
+
...extractionMetadata("pptx", input.mimeType, "pptx_text"),
|
|
5089
|
+
warnings: [`PPTX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
5090
|
+
}
|
|
5091
|
+
};
|
|
5092
|
+
}
|
|
5093
|
+
}
|
|
5094
|
+
async function extractEpubChapters(input) {
|
|
5095
|
+
try {
|
|
5096
|
+
const archive = unzipSync(new Uint8Array(input.bytes));
|
|
5097
|
+
const containerXml = zipEntryText(archive, "META-INF/container.xml");
|
|
5098
|
+
if (!containerXml) {
|
|
5099
|
+
throw new Error("Missing META-INF/container.xml");
|
|
5100
|
+
}
|
|
5101
|
+
const container = parseXmlDocument(containerXml);
|
|
5102
|
+
const rootfile = Array.from(container.getElementsByTagName("*")).find((node) => node.localName === "rootfile");
|
|
5103
|
+
const packagePath = rootfile?.getAttribute("full-path")?.trim();
|
|
5104
|
+
if (!packagePath) {
|
|
5105
|
+
throw new Error("EPUB container did not declare a package document.");
|
|
5106
|
+
}
|
|
5107
|
+
const packageXml = zipEntryText(archive, packagePath);
|
|
5108
|
+
if (!packageXml) {
|
|
5109
|
+
throw new Error(`Missing EPUB package document: ${packagePath}`);
|
|
5110
|
+
}
|
|
5111
|
+
const packageDocument = parseXmlDocument(packageXml);
|
|
5112
|
+
const manifestEntries = new Map(
|
|
5113
|
+
Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "item").map(
|
|
5114
|
+
(node) => [
|
|
5115
|
+
node.getAttribute("id")?.trim() ?? "",
|
|
5116
|
+
{
|
|
5117
|
+
href: node.getAttribute("href")?.trim() ?? "",
|
|
5118
|
+
mediaType: node.getAttribute("media-type")?.trim() ?? "",
|
|
5119
|
+
properties: node.getAttribute("properties")?.trim() ?? ""
|
|
5120
|
+
}
|
|
5121
|
+
]
|
|
5122
|
+
).filter(([id, item]) => Boolean(id && item.href))
|
|
5123
|
+
);
|
|
5124
|
+
const spineIds = Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "itemref").map((node) => node.getAttribute("idref")?.trim()).filter((value) => Boolean(value));
|
|
5125
|
+
const bookTitle = xmlTextNodes(packageXml, "title")[0] || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
|
|
5126
|
+
const author = xmlTextNodes(packageXml, "creator")[0];
|
|
5127
|
+
const chapters = [];
|
|
5128
|
+
for (const spineId of spineIds) {
|
|
5129
|
+
const item = manifestEntries.get(spineId);
|
|
5130
|
+
if (!item || !item.mediaType.includes("html") && !item.mediaType.includes("xhtml")) {
|
|
5131
|
+
continue;
|
|
5132
|
+
}
|
|
5133
|
+
if (item.properties.split(/\s+/).includes("nav")) {
|
|
5134
|
+
continue;
|
|
5135
|
+
}
|
|
5136
|
+
const entryPath = resolveZipTarget(packagePath, item.href);
|
|
5137
|
+
const html = zipEntryText(archive, entryPath);
|
|
5138
|
+
if (!html) {
|
|
5139
|
+
continue;
|
|
5140
|
+
}
|
|
5141
|
+
const markdown = htmlToMarkdown(html);
|
|
5142
|
+
if (!markdown) {
|
|
5143
|
+
continue;
|
|
5144
|
+
}
|
|
5145
|
+
const chapterTitle = firstHtmlHeading(html) || markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() || item.href;
|
|
5146
|
+
const normalizedTitle = normalizeWhitespace(chapterTitle);
|
|
5147
|
+
if (!normalizedTitle || /^table of contents$/i.test(normalizedTitle)) {
|
|
5148
|
+
continue;
|
|
5149
|
+
}
|
|
5150
|
+
chapters.push({
|
|
5151
|
+
partKey: item.href,
|
|
5152
|
+
title: normalizedTitle,
|
|
5153
|
+
markdown,
|
|
5154
|
+
metadata: {
|
|
5155
|
+
book_title: bookTitle ?? "",
|
|
5156
|
+
chapter_title: normalizedTitle,
|
|
5157
|
+
author: author ?? ""
|
|
5158
|
+
}
|
|
5159
|
+
});
|
|
5160
|
+
}
|
|
5161
|
+
return {
|
|
5162
|
+
title: bookTitle,
|
|
5163
|
+
author,
|
|
5164
|
+
chapters,
|
|
5165
|
+
warnings: chapters.length ? void 0 : ["EPUB extraction completed but found no chapter-like spine entries."]
|
|
5166
|
+
};
|
|
5167
|
+
} catch (error) {
|
|
5168
|
+
return {
|
|
5169
|
+
chapters: [],
|
|
5170
|
+
warnings: [`EPUB extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
5171
|
+
};
|
|
5172
|
+
}
|
|
5173
|
+
}
|
|
4803
5174
|
|
|
4804
5175
|
// src/logs.ts
|
|
4805
5176
|
import fs8 from "fs/promises";
|
|
@@ -5208,6 +5579,17 @@ var HARD_REPO_IGNORES = /* @__PURE__ */ new Set([".git", ".venv"]);
|
|
|
5208
5579
|
var PROGRESS_FILE_THRESHOLD = 150;
|
|
5209
5580
|
var PROGRESS_UPDATE_INTERVAL = 100;
|
|
5210
5581
|
var RST_HEADING_MARKERS = /* @__PURE__ */ new Set(["=", "-", "~", "^", '"', "#", "*", "+"]);
|
|
5582
|
+
var MARKDOWN_SEMANTIC_FRONTMATTER_KEYS = [
|
|
5583
|
+
"title",
|
|
5584
|
+
"summary",
|
|
5585
|
+
"description",
|
|
5586
|
+
"aliases",
|
|
5587
|
+
"tags",
|
|
5588
|
+
"authors",
|
|
5589
|
+
"published_at",
|
|
5590
|
+
"canonical_url",
|
|
5591
|
+
"source_type"
|
|
5592
|
+
];
|
|
5211
5593
|
function uniqueStrings(values) {
|
|
5212
5594
|
return [...new Set(values.filter(Boolean))];
|
|
5213
5595
|
}
|
|
@@ -5224,15 +5606,27 @@ function inferKind(mimeType, filePath) {
|
|
|
5224
5606
|
if (mimeType.includes("html")) {
|
|
5225
5607
|
return "html";
|
|
5226
5608
|
}
|
|
5227
|
-
if (mimeType.startsWith("text/")) {
|
|
5228
|
-
return "text";
|
|
5229
|
-
}
|
|
5230
5609
|
if (mimeType === "application/pdf" || filePath.toLowerCase().endsWith(".pdf")) {
|
|
5231
5610
|
return "pdf";
|
|
5232
5611
|
}
|
|
5233
5612
|
if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || filePath.toLowerCase().endsWith(".docx")) {
|
|
5234
5613
|
return "docx";
|
|
5235
5614
|
}
|
|
5615
|
+
if (mimeType === "application/epub+zip" || filePath.toLowerCase().endsWith(".epub")) {
|
|
5616
|
+
return "epub";
|
|
5617
|
+
}
|
|
5618
|
+
if (mimeType === "text/csv" || mimeType === "text/tab-separated-values" || filePath.toLowerCase().endsWith(".csv") || filePath.toLowerCase().endsWith(".tsv")) {
|
|
5619
|
+
return "csv";
|
|
5620
|
+
}
|
|
5621
|
+
if (mimeType.startsWith("text/")) {
|
|
5622
|
+
return "text";
|
|
5623
|
+
}
|
|
5624
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || filePath.toLowerCase().endsWith(".xlsx")) {
|
|
5625
|
+
return "xlsx";
|
|
5626
|
+
}
|
|
5627
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || filePath.toLowerCase().endsWith(".pptx")) {
|
|
5628
|
+
return "pptx";
|
|
5629
|
+
}
|
|
5236
5630
|
if (mimeType.startsWith("image/")) {
|
|
5237
5631
|
return "image";
|
|
5238
5632
|
}
|
|
@@ -5258,6 +5652,10 @@ function guessMimeType(target) {
|
|
|
5258
5652
|
}
|
|
5259
5653
|
return mime.lookup(target) || "application/octet-stream";
|
|
5260
5654
|
}
|
|
5655
|
+
function sourceGroupIdFor(prepared) {
|
|
5656
|
+
const originKey = prepared.originType === "url" ? prepared.url ?? prepared.title : prepared.originalPath ?? prepared.title;
|
|
5657
|
+
return `${slugify(prepared.title)}-${sha256(originKey).slice(0, 8)}`;
|
|
5658
|
+
}
|
|
5261
5659
|
function rstAdornmentLine(line) {
|
|
5262
5660
|
const trimmed = line.trim();
|
|
5263
5661
|
if (trimmed.length < 3) {
|
|
@@ -5345,6 +5743,65 @@ function extractedTextForPlainSource(filePath, sourceKind, content) {
|
|
|
5345
5743
|
}
|
|
5346
5744
|
return content;
|
|
5347
5745
|
}
|
|
5746
|
+
function normalizeSemanticMarkdownScalar(value) {
|
|
5747
|
+
if (typeof value !== "string") {
|
|
5748
|
+
return void 0;
|
|
5749
|
+
}
|
|
5750
|
+
const normalized = normalizeWhitespace(value.trim());
|
|
5751
|
+
return normalized || void 0;
|
|
5752
|
+
}
|
|
5753
|
+
function normalizeSemanticMarkdownList(value) {
|
|
5754
|
+
if (!Array.isArray(value)) {
|
|
5755
|
+
return void 0;
|
|
5756
|
+
}
|
|
5757
|
+
const items = uniqueStrings(
|
|
5758
|
+
value.flatMap((item) => typeof item === "string" ? [normalizeWhitespace(item.trim())] : []).filter(Boolean)
|
|
5759
|
+
);
|
|
5760
|
+
return items.length ? items : void 0;
|
|
5761
|
+
}
|
|
5762
|
+
function semanticMarkdownTitle(fallback, content, filePath) {
|
|
5763
|
+
const parsed = matter3(content);
|
|
5764
|
+
const frontmatterTitle = normalizeSemanticMarkdownScalar(parsed.data.title);
|
|
5765
|
+
if (frontmatterTitle) {
|
|
5766
|
+
return frontmatterTitle;
|
|
5767
|
+
}
|
|
5768
|
+
return titleFromText(fallback, parsed.content, filePath);
|
|
5769
|
+
}
|
|
5770
|
+
function semanticMarkdownContent(content) {
|
|
5771
|
+
const parsed = matter3(content);
|
|
5772
|
+
const body = parsed.content.replace(/\r\n?/g, "\n").trim();
|
|
5773
|
+
const semanticFrontmatter = Object.fromEntries(
|
|
5774
|
+
MARKDOWN_SEMANTIC_FRONTMATTER_KEYS.flatMap((key) => {
|
|
5775
|
+
const value = key === "aliases" || key === "tags" || key === "authors" ? normalizeSemanticMarkdownList(parsed.data[key]) : normalizeSemanticMarkdownScalar(parsed.data[key]);
|
|
5776
|
+
return value === void 0 ? [] : [[key, value]];
|
|
5777
|
+
})
|
|
5778
|
+
);
|
|
5779
|
+
const semanticLines = Object.entries(semanticFrontmatter).map(
|
|
5780
|
+
([key, value]) => `${key}: ${Array.isArray(value) ? value.join(", ") : value}`
|
|
5781
|
+
);
|
|
5782
|
+
const extractedText = [...semanticLines, ...semanticLines.length && body ? [""] : [], body].filter(Boolean).join("\n").trim();
|
|
5783
|
+
return {
|
|
5784
|
+
extractedText,
|
|
5785
|
+
semanticHash: sha256(
|
|
5786
|
+
JSON.stringify({
|
|
5787
|
+
body,
|
|
5788
|
+
frontmatter: semanticFrontmatter
|
|
5789
|
+
})
|
|
5790
|
+
)
|
|
5791
|
+
};
|
|
5792
|
+
}
|
|
5793
|
+
function finalizePreparedInput(prepared) {
|
|
5794
|
+
if (prepared.sourceKind !== "markdown") {
|
|
5795
|
+
return prepared;
|
|
5796
|
+
}
|
|
5797
|
+
const semantic = semanticMarkdownContent(prepared.payloadBytes.toString("utf8"));
|
|
5798
|
+
return {
|
|
5799
|
+
...prepared,
|
|
5800
|
+
extractedText: semantic.extractedText,
|
|
5801
|
+
extractionHash: buildExtractionHash(semantic.extractedText, prepared.extractionArtifact),
|
|
5802
|
+
semanticHash: semantic.semanticHash
|
|
5803
|
+
};
|
|
5804
|
+
}
|
|
5348
5805
|
function shouldEmitProgress(totalItems) {
|
|
5349
5806
|
return totalItems >= PROGRESS_FILE_THRESHOLD && Boolean(process.stderr?.isTTY);
|
|
5350
5807
|
}
|
|
@@ -5511,7 +5968,7 @@ function markdownFrontmatter(value) {
|
|
|
5511
5968
|
return matter3.stringify("", normalized).trimEnd().split("\n").concat([""]);
|
|
5512
5969
|
}
|
|
5513
5970
|
function prepareCapturedMarkdownInput(input) {
|
|
5514
|
-
return {
|
|
5971
|
+
return finalizePreparedInput({
|
|
5515
5972
|
title: input.title,
|
|
5516
5973
|
originType: "url",
|
|
5517
5974
|
sourceKind: "markdown",
|
|
@@ -5523,7 +5980,7 @@ function prepareCapturedMarkdownInput(input) {
|
|
|
5523
5980
|
extractedText: input.markdown,
|
|
5524
5981
|
attachments: input.attachments,
|
|
5525
5982
|
logDetails: input.logDetails
|
|
5526
|
-
};
|
|
5983
|
+
});
|
|
5527
5984
|
}
|
|
5528
5985
|
function isPrivateIp(ip) {
|
|
5529
5986
|
if (ip === "::1" || ip.startsWith("fc") || ip.startsWith("fd")) return true;
|
|
@@ -5773,6 +6230,9 @@ function manifestMatchesOrigin(manifest, prepared) {
|
|
|
5773
6230
|
}
|
|
5774
6231
|
return Boolean(prepared.originalPath && manifest.originalPath && toPosix(manifest.originalPath) === toPosix(prepared.originalPath));
|
|
5775
6232
|
}
|
|
6233
|
+
function manifestMatchesOriginPart(manifest, prepared) {
|
|
6234
|
+
return manifestMatchesOrigin(manifest, prepared) && (manifest.sourcePartKey ?? "") === (prepared.sourcePartKey ?? "");
|
|
6235
|
+
}
|
|
5776
6236
|
function buildCompositeHash(payloadBytes, attachments = []) {
|
|
5777
6237
|
if (!attachments.length) {
|
|
5778
6238
|
return sha256(payloadBytes);
|
|
@@ -5870,7 +6330,7 @@ function extractMarkdownImageReferences(content, baseUrl) {
|
|
|
5870
6330
|
async function convertHtmlToMarkdown(html, url) {
|
|
5871
6331
|
const dom = new JSDOM2(html, { url });
|
|
5872
6332
|
const article = new Readability(dom.window.document).parse();
|
|
5873
|
-
const turndown = new
|
|
6333
|
+
const turndown = new TurndownService2({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
5874
6334
|
const body = article?.content ?? dom.window.document.body.innerHTML;
|
|
5875
6335
|
const markdown = turndown.turndown(body);
|
|
5876
6336
|
return {
|
|
@@ -5886,23 +6346,34 @@ async function readManifestByHash(manifestsDir, contentHash) {
|
|
|
5886
6346
|
}
|
|
5887
6347
|
const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
|
|
5888
6348
|
if (manifest?.contentHash === contentHash) {
|
|
5889
|
-
return
|
|
6349
|
+
return {
|
|
6350
|
+
...manifest,
|
|
6351
|
+
semanticHash: manifest.semanticHash ?? manifest.contentHash
|
|
6352
|
+
};
|
|
5890
6353
|
}
|
|
5891
6354
|
}
|
|
5892
6355
|
return null;
|
|
5893
6356
|
}
|
|
5894
|
-
async function
|
|
6357
|
+
async function readManifestsByOrigin(manifestsDir, prepared) {
|
|
5895
6358
|
const entries = await fs11.readdir(manifestsDir, { withFileTypes: true }).catch(() => []);
|
|
6359
|
+
const manifests = [];
|
|
5896
6360
|
for (const entry of entries) {
|
|
5897
6361
|
if (!entry.isFile() || !entry.name.endsWith(".json")) {
|
|
5898
6362
|
continue;
|
|
5899
6363
|
}
|
|
5900
6364
|
const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
|
|
5901
6365
|
if (manifest && manifestMatchesOrigin(manifest, prepared)) {
|
|
5902
|
-
|
|
6366
|
+
manifests.push({
|
|
6367
|
+
...manifest,
|
|
6368
|
+
semanticHash: manifest.semanticHash ?? manifest.contentHash
|
|
6369
|
+
});
|
|
5903
6370
|
}
|
|
5904
6371
|
}
|
|
5905
|
-
return
|
|
6372
|
+
return manifests;
|
|
6373
|
+
}
|
|
6374
|
+
async function readManifestByOrigin(manifestsDir, prepared) {
|
|
6375
|
+
const manifests = await readManifestsByOrigin(manifestsDir, prepared);
|
|
6376
|
+
return manifests.find((manifest) => manifestMatchesOriginPart(manifest, prepared)) ?? null;
|
|
5906
6377
|
}
|
|
5907
6378
|
async function loadGitignoreMatcher(repoRoot, enabled) {
|
|
5908
6379
|
if (!enabled) {
|
|
@@ -6148,10 +6619,11 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6148
6619
|
await ensureDir(paths.extractsDir);
|
|
6149
6620
|
const attachments = prepared.attachments ?? [];
|
|
6150
6621
|
const contentHash = prepared.contentHash ?? buildCompositeHash(prepared.payloadBytes, attachments);
|
|
6622
|
+
const semanticHash = prepared.semanticHash ?? contentHash;
|
|
6151
6623
|
const extractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
|
|
6152
6624
|
const existingByOrigin = await readManifestByOrigin(paths.manifestsDir, prepared);
|
|
6153
|
-
const existingByHash = existingByOrigin ? null : await readManifestByHash(paths.manifestsDir, contentHash);
|
|
6154
|
-
if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath) {
|
|
6625
|
+
const existingByHash = existingByOrigin || prepared.sourcePartKey ? null : await readManifestByHash(paths.manifestsDir, contentHash);
|
|
6626
|
+
if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath && existingByOrigin.sourceGroupId === prepared.sourceGroupId && existingByOrigin.sourceGroupTitle === prepared.sourceGroupTitle && existingByOrigin.sourcePartKey === prepared.sourcePartKey && existingByOrigin.partIndex === prepared.partIndex && existingByOrigin.partCount === prepared.partCount && existingByOrigin.partTitle === prepared.partTitle && JSON.stringify(existingByOrigin.details ?? {}) === JSON.stringify(prepared.details ?? {})) {
|
|
6155
6627
|
return { manifest: existingByOrigin, isNew: false, wasUpdated: false };
|
|
6156
6628
|
}
|
|
6157
6629
|
if (existingByHash) {
|
|
@@ -6209,6 +6681,14 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6209
6681
|
extractionHash,
|
|
6210
6682
|
mimeType: prepared.mimeType,
|
|
6211
6683
|
contentHash,
|
|
6684
|
+
semanticHash,
|
|
6685
|
+
sourceGroupId: prepared.sourceGroupId,
|
|
6686
|
+
sourceGroupTitle: prepared.sourceGroupTitle,
|
|
6687
|
+
sourcePartKey: prepared.sourcePartKey,
|
|
6688
|
+
partIndex: prepared.partIndex,
|
|
6689
|
+
partCount: prepared.partCount,
|
|
6690
|
+
partTitle: prepared.partTitle,
|
|
6691
|
+
details: prepared.details,
|
|
6212
6692
|
createdAt: previous?.createdAt ?? now,
|
|
6213
6693
|
updatedAt: now,
|
|
6214
6694
|
attachments: manifestAttachments.length ? manifestAttachments : void 0
|
|
@@ -6230,6 +6710,42 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6230
6710
|
}
|
|
6231
6711
|
return { manifest, isNew: !previous, wasUpdated: Boolean(previous) };
|
|
6232
6712
|
}
|
|
6713
|
+
async function persistPreparedInputs(rootDir, input, preparedInputs, paths) {
|
|
6714
|
+
const template = preparedInputs[0];
|
|
6715
|
+
const existingByOrigin = template ? await readManifestsByOrigin(paths.manifestsDir, template) : [];
|
|
6716
|
+
const created = [];
|
|
6717
|
+
const updated = [];
|
|
6718
|
+
const unchanged = [];
|
|
6719
|
+
const removed = [];
|
|
6720
|
+
const seenSourceIds = /* @__PURE__ */ new Set();
|
|
6721
|
+
for (const prepared of preparedInputs) {
|
|
6722
|
+
const result = await persistPreparedInput(rootDir, prepared, paths);
|
|
6723
|
+
if (result.isNew) {
|
|
6724
|
+
created.push(result.manifest);
|
|
6725
|
+
} else if (result.wasUpdated) {
|
|
6726
|
+
updated.push(result.manifest);
|
|
6727
|
+
} else {
|
|
6728
|
+
unchanged.push(result.manifest);
|
|
6729
|
+
}
|
|
6730
|
+
seenSourceIds.add(result.manifest.sourceId);
|
|
6731
|
+
}
|
|
6732
|
+
for (const manifest of existingByOrigin) {
|
|
6733
|
+
if (seenSourceIds.has(manifest.sourceId)) {
|
|
6734
|
+
continue;
|
|
6735
|
+
}
|
|
6736
|
+
await removeManifestArtifacts(rootDir, manifest, paths);
|
|
6737
|
+
removed.push(manifest);
|
|
6738
|
+
}
|
|
6739
|
+
return {
|
|
6740
|
+
input,
|
|
6741
|
+
scannedCount: preparedInputs.length,
|
|
6742
|
+
created,
|
|
6743
|
+
updated,
|
|
6744
|
+
unchanged,
|
|
6745
|
+
removed,
|
|
6746
|
+
skipped: []
|
|
6747
|
+
};
|
|
6748
|
+
}
|
|
6233
6749
|
async function removeManifestArtifacts(rootDir, manifest, paths) {
|
|
6234
6750
|
await fs11.rm(path12.join(paths.manifestsDir, `${manifest.sourceId}.json`), { force: true });
|
|
6235
6751
|
await fs11.rm(path12.resolve(rootDir, manifest.storedPath), { force: true });
|
|
@@ -6256,10 +6772,10 @@ function repoSyncWorkspaceIgnorePaths(rootDir, paths, repoRoot) {
|
|
|
6256
6772
|
return candidates.map((candidate) => path12.resolve(candidate)).filter((candidate, index, items) => items.indexOf(candidate) === index).filter((candidate) => withinRoot(repoRoot, candidate));
|
|
6257
6773
|
}
|
|
6258
6774
|
function preparedMatchesManifest(manifest, prepared, contentHash) {
|
|
6259
|
-
return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath;
|
|
6775
|
+
return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.semanticHash === (prepared.semanticHash ?? contentHash) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath && manifest.sourceGroupId === prepared.sourceGroupId && manifest.sourceGroupTitle === prepared.sourceGroupTitle && manifest.sourcePartKey === prepared.sourcePartKey && manifest.partIndex === prepared.partIndex && manifest.partCount === prepared.partCount && manifest.partTitle === prepared.partTitle && JSON.stringify(manifest.details ?? {}) === JSON.stringify(prepared.details ?? {});
|
|
6260
6776
|
}
|
|
6261
6777
|
function shouldDeferWatchSemanticRefresh(sourceKind) {
|
|
6262
|
-
return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "image";
|
|
6778
|
+
return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "epub" || sourceKind === "csv" || sourceKind === "xlsx" || sourceKind === "pptx" || sourceKind === "image";
|
|
6263
6779
|
}
|
|
6264
6780
|
function pendingSemanticRefreshId(changeType, repoRoot, relativePath) {
|
|
6265
6781
|
return `pending:${changeType}:${sha256(`${toPosix(repoRoot)}:${relativePath}`).slice(0, 12)}`;
|
|
@@ -6325,13 +6841,16 @@ async function syncTrackedRepos(rootDir, options, repoRoots) {
|
|
|
6325
6841
|
const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
|
|
6326
6842
|
for (const absolutePath of files) {
|
|
6327
6843
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6328
|
-
const
|
|
6329
|
-
|
|
6330
|
-
|
|
6331
|
-
|
|
6332
|
-
|
|
6333
|
-
|
|
6334
|
-
|
|
6844
|
+
const preparedInputs = await prepareFileInputs(
|
|
6845
|
+
rootDir,
|
|
6846
|
+
absolutePath,
|
|
6847
|
+
repoRoot,
|
|
6848
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
6849
|
+
);
|
|
6850
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
6851
|
+
imported.push(...result.created);
|
|
6852
|
+
updated.push(...result.updated);
|
|
6853
|
+
removed.push(...result.removed);
|
|
6335
6854
|
progress.tick();
|
|
6336
6855
|
}
|
|
6337
6856
|
progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
|
|
@@ -6390,9 +6909,6 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6390
6909
|
let scannedCount = 0;
|
|
6391
6910
|
for (const repoRoot of uniqueRoots) {
|
|
6392
6911
|
const repoManifests = manifestsByRepoRoot.get(repoRoot) ?? [];
|
|
6393
|
-
const manifestsByOriginalPath = new Map(
|
|
6394
|
-
repoManifests.filter((manifest) => manifest.originalPath).map((manifest) => [path12.resolve(manifest.originalPath), manifest])
|
|
6395
|
-
);
|
|
6396
6912
|
if (!await fileExists(repoRoot)) {
|
|
6397
6913
|
for (const manifest of repoManifests) {
|
|
6398
6914
|
if (shouldDeferWatchSemanticRefresh(manifest.sourceKind)) {
|
|
@@ -6428,38 +6944,50 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6428
6944
|
const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
|
|
6429
6945
|
for (const absolutePath of files) {
|
|
6430
6946
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6431
|
-
const
|
|
6432
|
-
|
|
6433
|
-
|
|
6434
|
-
|
|
6435
|
-
|
|
6947
|
+
const preparedInputs = await prepareFileInputs(
|
|
6948
|
+
rootDir,
|
|
6949
|
+
absolutePath,
|
|
6950
|
+
repoRoot,
|
|
6951
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
6952
|
+
);
|
|
6953
|
+
const firstPrepared = preparedInputs[0];
|
|
6954
|
+
if (firstPrepared && shouldDeferWatchSemanticRefresh(firstPrepared.sourceKind)) {
|
|
6955
|
+
const existing = repoManifests.filter(
|
|
6956
|
+
(manifest) => manifest.originalPath && path12.resolve(manifest.originalPath) === path12.resolve(absolutePath)
|
|
6957
|
+
);
|
|
6958
|
+
const existingByPartKey = new Map(existing.map((manifest) => [manifest.sourcePartKey ?? "__single__", manifest]));
|
|
6959
|
+
const changed = existing.length !== preparedInputs.length || preparedInputs.some((prepared) => {
|
|
6960
|
+
const match = existingByPartKey.get(prepared.sourcePartKey ?? "__single__");
|
|
6961
|
+
const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
|
|
6962
|
+
return !match || !preparedMatchesManifest(match, prepared, contentHash);
|
|
6963
|
+
}) || existing.some(
|
|
6964
|
+
(manifest) => !preparedInputs.some((prepared) => (prepared.sourcePartKey ?? "") === (manifest.sourcePartKey ?? ""))
|
|
6965
|
+
);
|
|
6436
6966
|
if (changed) {
|
|
6437
6967
|
pendingSemanticRefresh.push({
|
|
6438
6968
|
id: pendingSemanticRefreshId(
|
|
6439
|
-
existing ? "modified" : "added",
|
|
6969
|
+
existing.length ? "modified" : "added",
|
|
6440
6970
|
repoRoot,
|
|
6441
|
-
|
|
6971
|
+
firstPrepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
|
|
6442
6972
|
),
|
|
6443
6973
|
repoRoot,
|
|
6444
6974
|
path: toPosix(path12.relative(rootDir, absolutePath)),
|
|
6445
|
-
changeType: existing ? "modified" : "added",
|
|
6975
|
+
changeType: existing.length ? "modified" : "added",
|
|
6446
6976
|
detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6447
|
-
sourceId: existing?.sourceId,
|
|
6448
|
-
sourceKind:
|
|
6977
|
+
sourceId: existing[0]?.sourceId,
|
|
6978
|
+
sourceKind: firstPrepared.sourceKind
|
|
6449
6979
|
});
|
|
6450
|
-
|
|
6451
|
-
staleSourceIds.add(
|
|
6980
|
+
for (const manifest of existing) {
|
|
6981
|
+
staleSourceIds.add(manifest.sourceId);
|
|
6452
6982
|
}
|
|
6453
6983
|
}
|
|
6454
6984
|
progress.tick();
|
|
6455
6985
|
continue;
|
|
6456
6986
|
}
|
|
6457
|
-
const result = await
|
|
6458
|
-
|
|
6459
|
-
|
|
6460
|
-
|
|
6461
|
-
updated.push(result.manifest);
|
|
6462
|
-
}
|
|
6987
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
6988
|
+
imported.push(...result.created);
|
|
6989
|
+
updated.push(...result.updated);
|
|
6990
|
+
removed.push(...result.removed);
|
|
6463
6991
|
progress.tick();
|
|
6464
6992
|
}
|
|
6465
6993
|
progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
|
|
@@ -6513,7 +7041,7 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6513
7041
|
staleSourceIds: [...staleSourceIds]
|
|
6514
7042
|
};
|
|
6515
7043
|
}
|
|
6516
|
-
async function
|
|
7044
|
+
async function prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
6517
7045
|
const payloadBytes = await fs11.readFile(absoluteInput);
|
|
6518
7046
|
const mimeType = guessMimeType(absoluteInput);
|
|
6519
7047
|
const sourceKind = inferKind(mimeType, absoluteInput);
|
|
@@ -6523,14 +7051,15 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6523
7051
|
let extractedText;
|
|
6524
7052
|
let extractionArtifact;
|
|
6525
7053
|
if (sourceKind === "markdown" || sourceKind === "text" || sourceKind === "code") {
|
|
6526
|
-
|
|
6527
|
-
|
|
7054
|
+
const rawText = payloadBytes.toString("utf8");
|
|
7055
|
+
extractedText = sourceKind === "markdown" ? semanticMarkdownContent(rawText).extractedText : extractedTextForPlainSource(absoluteInput, sourceKind, rawText);
|
|
7056
|
+
title = sourceKind === "markdown" ? semanticMarkdownTitle(path12.basename(absoluteInput, path12.extname(absoluteInput)), rawText, absoluteInput) : titleFromText(path12.basename(absoluteInput, path12.extname(absoluteInput)), extractedText, absoluteInput);
|
|
6528
7057
|
extractionArtifact = createPlainTextExtractionArtifact(sourceKind, mimeType);
|
|
6529
7058
|
} else if (sourceKind === "html") {
|
|
6530
7059
|
const html = payloadBytes.toString("utf8");
|
|
6531
7060
|
const converted = await convertHtmlToMarkdown(html, pathToFileURL(absoluteInput).toString());
|
|
6532
7061
|
title = converted.title;
|
|
6533
|
-
extractedText = converted.markdown;
|
|
7062
|
+
extractedText = semanticMarkdownContent(converted.markdown).extractedText;
|
|
6534
7063
|
extractionArtifact = createHtmlReadabilityExtractionArtifact(sourceKind, mimeType);
|
|
6535
7064
|
} else if (sourceKind === "pdf") {
|
|
6536
7065
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
@@ -6543,6 +7072,94 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6543
7072
|
title = extracted.artifact.metadata?.title?.trim() || title;
|
|
6544
7073
|
extractedText = extracted.extractedText;
|
|
6545
7074
|
extractionArtifact = extracted.artifact;
|
|
7075
|
+
} else if (sourceKind === "csv") {
|
|
7076
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7077
|
+
const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7078
|
+
title = extracted.title?.trim() || title;
|
|
7079
|
+
extractedText = extracted.extractedText;
|
|
7080
|
+
extractionArtifact = extracted.artifact;
|
|
7081
|
+
} else if (sourceKind === "xlsx") {
|
|
7082
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7083
|
+
const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7084
|
+
title = extracted.title?.trim() || title;
|
|
7085
|
+
extractedText = extracted.extractedText;
|
|
7086
|
+
extractionArtifact = extracted.artifact;
|
|
7087
|
+
} else if (sourceKind === "pptx") {
|
|
7088
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7089
|
+
const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7090
|
+
title = extracted.title?.trim() || title;
|
|
7091
|
+
extractedText = extracted.extractedText;
|
|
7092
|
+
extractionArtifact = extracted.artifact;
|
|
7093
|
+
} else if (sourceKind === "epub") {
|
|
7094
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7095
|
+
const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7096
|
+
title = extracted.title?.trim() || title;
|
|
7097
|
+
const groupId = sourceGroupIdFor({
|
|
7098
|
+
title,
|
|
7099
|
+
originType: "file",
|
|
7100
|
+
originalPath: toPosix(absoluteInput)
|
|
7101
|
+
});
|
|
7102
|
+
if (extracted.chapters.length) {
|
|
7103
|
+
return extracted.chapters.map(
|
|
7104
|
+
(chapter, index) => finalizePreparedInput({
|
|
7105
|
+
title: `${title} - ${chapter.title}`,
|
|
7106
|
+
originType: "file",
|
|
7107
|
+
sourceKind: "epub",
|
|
7108
|
+
sourceClass,
|
|
7109
|
+
originalPath: toPosix(absoluteInput),
|
|
7110
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7111
|
+
mimeType: "text/markdown",
|
|
7112
|
+
storedExtension: ".md",
|
|
7113
|
+
payloadBytes: Buffer.from(chapter.markdown, "utf8"),
|
|
7114
|
+
extractedText: chapter.markdown,
|
|
7115
|
+
extractionArtifact: {
|
|
7116
|
+
extractor: "epub_text",
|
|
7117
|
+
sourceKind: "epub",
|
|
7118
|
+
mimeType,
|
|
7119
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7120
|
+
metadata: {
|
|
7121
|
+
...chapter.metadata,
|
|
7122
|
+
chapter_index: String(index + 1),
|
|
7123
|
+
chapter_count: String(extracted.chapters.length)
|
|
7124
|
+
},
|
|
7125
|
+
warnings: extracted.warnings
|
|
7126
|
+
},
|
|
7127
|
+
extractionHash: buildExtractionHash(chapter.markdown, {
|
|
7128
|
+
extractor: "epub_text",
|
|
7129
|
+
sourceKind: "epub",
|
|
7130
|
+
mimeType,
|
|
7131
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7132
|
+
metadata: {
|
|
7133
|
+
...chapter.metadata,
|
|
7134
|
+
chapter_index: String(index + 1),
|
|
7135
|
+
chapter_count: String(extracted.chapters.length)
|
|
7136
|
+
},
|
|
7137
|
+
warnings: extracted.warnings
|
|
7138
|
+
}),
|
|
7139
|
+
sourceGroupId: groupId,
|
|
7140
|
+
sourceGroupTitle: title,
|
|
7141
|
+
sourcePartKey: chapter.partKey,
|
|
7142
|
+
partIndex: index + 1,
|
|
7143
|
+
partCount: extracted.chapters.length,
|
|
7144
|
+
partTitle: chapter.title,
|
|
7145
|
+
details: {
|
|
7146
|
+
book_title: title,
|
|
7147
|
+
chapter_title: chapter.title,
|
|
7148
|
+
chapter_index: String(index + 1),
|
|
7149
|
+
chapter_count: String(extracted.chapters.length),
|
|
7150
|
+
...extracted.author ? { author: extracted.author } : {}
|
|
7151
|
+
}
|
|
7152
|
+
})
|
|
7153
|
+
);
|
|
7154
|
+
}
|
|
7155
|
+
extractedText = void 0;
|
|
7156
|
+
extractionArtifact = {
|
|
7157
|
+
extractor: "epub_text",
|
|
7158
|
+
sourceKind: "epub",
|
|
7159
|
+
mimeType,
|
|
7160
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7161
|
+
warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
|
|
7162
|
+
};
|
|
6546
7163
|
} else if (sourceKind === "image") {
|
|
6547
7164
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
6548
7165
|
const extracted = await extractImageWithVision(rootDir, {
|
|
@@ -6556,23 +7173,33 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6556
7173
|
} else {
|
|
6557
7174
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
6558
7175
|
}
|
|
6559
|
-
return
|
|
6560
|
-
|
|
6561
|
-
|
|
6562
|
-
|
|
6563
|
-
|
|
6564
|
-
|
|
6565
|
-
|
|
6566
|
-
|
|
6567
|
-
|
|
6568
|
-
|
|
6569
|
-
|
|
6570
|
-
|
|
6571
|
-
|
|
6572
|
-
|
|
6573
|
-
|
|
7176
|
+
return [
|
|
7177
|
+
finalizePreparedInput({
|
|
7178
|
+
title,
|
|
7179
|
+
originType: "file",
|
|
7180
|
+
sourceKind,
|
|
7181
|
+
sourceClass,
|
|
7182
|
+
language,
|
|
7183
|
+
originalPath: toPosix(absoluteInput),
|
|
7184
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7185
|
+
mimeType,
|
|
7186
|
+
storedExtension,
|
|
7187
|
+
payloadBytes,
|
|
7188
|
+
extractedText,
|
|
7189
|
+
extractionArtifact,
|
|
7190
|
+
extractionHash: buildExtractionHash(extractedText, extractionArtifact),
|
|
7191
|
+
details: extractionArtifact?.metadata
|
|
7192
|
+
})
|
|
7193
|
+
];
|
|
6574
7194
|
}
|
|
6575
|
-
async function
|
|
7195
|
+
async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
7196
|
+
const prepared = await prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass);
|
|
7197
|
+
if (!prepared.length) {
|
|
7198
|
+
throw new Error(`No ingestable sources were extracted from ${absoluteInput}.`);
|
|
7199
|
+
}
|
|
7200
|
+
return prepared[0];
|
|
7201
|
+
}
|
|
7202
|
+
async function prepareUrlInputs(rootDir, input, options) {
|
|
6576
7203
|
await validateUrlSafety(input);
|
|
6577
7204
|
const response = await fetch(input);
|
|
6578
7205
|
if (!response.ok) {
|
|
@@ -6634,8 +7261,9 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6634
7261
|
const extension = path12.extname(inputUrl.pathname);
|
|
6635
7262
|
storedExtension = extension || `.${mime.extension(mimeType) || "bin"}`;
|
|
6636
7263
|
if (sourceKind === "markdown" || sourceKind === "text" || sourceKind === "code") {
|
|
6637
|
-
|
|
6638
|
-
|
|
7264
|
+
const rawText = payloadBytes.toString("utf8");
|
|
7265
|
+
extractedText = sourceKind === "markdown" ? semanticMarkdownContent(rawText).extractedText : extractedTextForPlainSource(inputUrl.pathname, sourceKind, rawText);
|
|
7266
|
+
title = sourceKind === "markdown" ? semanticMarkdownTitle(title || inputUrl.hostname, rawText, inputUrl.pathname) : titleFromText(title || inputUrl.hostname, extractedText, inputUrl.pathname);
|
|
6639
7267
|
extractionArtifact = createPlainTextExtractionArtifact(sourceKind, mimeType);
|
|
6640
7268
|
if (sourceKind === "markdown" && options.includeAssets) {
|
|
6641
7269
|
const { attachments: remoteAttachments, skippedCount } = await collectRemoteImageAttachments(
|
|
@@ -6666,6 +7294,88 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6666
7294
|
title = extracted.artifact.metadata?.title?.trim() || title;
|
|
6667
7295
|
extractedText = extracted.extractedText;
|
|
6668
7296
|
extractionArtifact = extracted.artifact;
|
|
7297
|
+
} else if (sourceKind === "csv") {
|
|
7298
|
+
const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7299
|
+
title = extracted.title?.trim() || title;
|
|
7300
|
+
extractedText = extracted.extractedText;
|
|
7301
|
+
extractionArtifact = extracted.artifact;
|
|
7302
|
+
} else if (sourceKind === "xlsx") {
|
|
7303
|
+
const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7304
|
+
title = extracted.title?.trim() || title;
|
|
7305
|
+
extractedText = extracted.extractedText;
|
|
7306
|
+
extractionArtifact = extracted.artifact;
|
|
7307
|
+
} else if (sourceKind === "pptx") {
|
|
7308
|
+
const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7309
|
+
title = extracted.title?.trim() || title;
|
|
7310
|
+
extractedText = extracted.extractedText;
|
|
7311
|
+
extractionArtifact = extracted.artifact;
|
|
7312
|
+
} else if (sourceKind === "epub") {
|
|
7313
|
+
const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7314
|
+
title = extracted.title?.trim() || title;
|
|
7315
|
+
const groupId = sourceGroupIdFor({
|
|
7316
|
+
title,
|
|
7317
|
+
originType: "url",
|
|
7318
|
+
url: finalUrl
|
|
7319
|
+
});
|
|
7320
|
+
if (extracted.chapters.length) {
|
|
7321
|
+
return extracted.chapters.map(
|
|
7322
|
+
(chapter, index) => finalizePreparedInput({
|
|
7323
|
+
title: `${title} - ${chapter.title}`,
|
|
7324
|
+
originType: "url",
|
|
7325
|
+
sourceKind: "epub",
|
|
7326
|
+
url: finalUrl,
|
|
7327
|
+
mimeType: "text/markdown",
|
|
7328
|
+
storedExtension: ".md",
|
|
7329
|
+
payloadBytes: Buffer.from(chapter.markdown, "utf8"),
|
|
7330
|
+
extractedText: chapter.markdown,
|
|
7331
|
+
extractionArtifact: {
|
|
7332
|
+
extractor: "epub_text",
|
|
7333
|
+
sourceKind: "epub",
|
|
7334
|
+
mimeType,
|
|
7335
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7336
|
+
metadata: {
|
|
7337
|
+
...chapter.metadata,
|
|
7338
|
+
chapter_index: String(index + 1),
|
|
7339
|
+
chapter_count: String(extracted.chapters.length)
|
|
7340
|
+
},
|
|
7341
|
+
warnings: extracted.warnings
|
|
7342
|
+
},
|
|
7343
|
+
extractionHash: buildExtractionHash(chapter.markdown, {
|
|
7344
|
+
extractor: "epub_text",
|
|
7345
|
+
sourceKind: "epub",
|
|
7346
|
+
mimeType,
|
|
7347
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7348
|
+
metadata: {
|
|
7349
|
+
...chapter.metadata,
|
|
7350
|
+
chapter_index: String(index + 1),
|
|
7351
|
+
chapter_count: String(extracted.chapters.length)
|
|
7352
|
+
},
|
|
7353
|
+
warnings: extracted.warnings
|
|
7354
|
+
}),
|
|
7355
|
+
sourceGroupId: groupId,
|
|
7356
|
+
sourceGroupTitle: title,
|
|
7357
|
+
sourcePartKey: chapter.partKey,
|
|
7358
|
+
partIndex: index + 1,
|
|
7359
|
+
partCount: extracted.chapters.length,
|
|
7360
|
+
partTitle: chapter.title,
|
|
7361
|
+
details: {
|
|
7362
|
+
book_title: title,
|
|
7363
|
+
chapter_title: chapter.title,
|
|
7364
|
+
chapter_index: String(index + 1),
|
|
7365
|
+
chapter_count: String(extracted.chapters.length),
|
|
7366
|
+
...extracted.author ? { author: extracted.author } : {}
|
|
7367
|
+
},
|
|
7368
|
+
logDetails
|
|
7369
|
+
})
|
|
7370
|
+
);
|
|
7371
|
+
}
|
|
7372
|
+
extractionArtifact = {
|
|
7373
|
+
extractor: "epub_text",
|
|
7374
|
+
sourceKind: "epub",
|
|
7375
|
+
mimeType,
|
|
7376
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7377
|
+
warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
|
|
7378
|
+
};
|
|
6669
7379
|
} else if (sourceKind === "image") {
|
|
6670
7380
|
const extracted = await extractImageWithVision(rootDir, {
|
|
6671
7381
|
title,
|
|
@@ -6677,22 +7387,32 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6677
7387
|
extractionArtifact = extracted.artifact;
|
|
6678
7388
|
}
|
|
6679
7389
|
}
|
|
6680
|
-
return
|
|
6681
|
-
|
|
6682
|
-
|
|
6683
|
-
|
|
6684
|
-
|
|
6685
|
-
|
|
6686
|
-
|
|
6687
|
-
|
|
6688
|
-
|
|
6689
|
-
|
|
6690
|
-
|
|
6691
|
-
|
|
6692
|
-
|
|
6693
|
-
|
|
6694
|
-
|
|
6695
|
-
|
|
7390
|
+
return [
|
|
7391
|
+
finalizePreparedInput({
|
|
7392
|
+
title,
|
|
7393
|
+
originType: "url",
|
|
7394
|
+
sourceKind,
|
|
7395
|
+
language,
|
|
7396
|
+
url: finalUrl,
|
|
7397
|
+
mimeType,
|
|
7398
|
+
storedExtension,
|
|
7399
|
+
payloadBytes,
|
|
7400
|
+
extractedText,
|
|
7401
|
+
extractionArtifact,
|
|
7402
|
+
extractionHash: buildExtractionHash(extractedText, extractionArtifact),
|
|
7403
|
+
attachments,
|
|
7404
|
+
contentHash,
|
|
7405
|
+
details: extractionArtifact?.metadata,
|
|
7406
|
+
logDetails
|
|
7407
|
+
})
|
|
7408
|
+
];
|
|
7409
|
+
}
|
|
7410
|
+
async function prepareUrlInput(rootDir, input, options) {
|
|
7411
|
+
const prepared = await prepareUrlInputs(rootDir, input, options);
|
|
7412
|
+
if (!prepared.length) {
|
|
7413
|
+
throw new Error(`No ingestable sources were extracted from ${input}.`);
|
|
7414
|
+
}
|
|
7415
|
+
return prepared[0];
|
|
6696
7416
|
}
|
|
6697
7417
|
async function collectInboxAttachmentRefs(inputDir, files) {
|
|
6698
7418
|
const refsBySource = /* @__PURE__ */ new Map();
|
|
@@ -6766,7 +7486,7 @@ async function prepareInboxMarkdownInput(absolutePath, attachmentRefs) {
|
|
|
6766
7486
|
);
|
|
6767
7487
|
const rewrittenText = rewriteMarkdownReferences(originalText, replacements);
|
|
6768
7488
|
const extractionArtifact = createPlainTextExtractionArtifact("markdown", "text/markdown");
|
|
6769
|
-
return {
|
|
7489
|
+
return finalizePreparedInput({
|
|
6770
7490
|
title,
|
|
6771
7491
|
originType: "file",
|
|
6772
7492
|
sourceKind: "markdown",
|
|
@@ -6779,7 +7499,7 @@ async function prepareInboxMarkdownInput(absolutePath, attachmentRefs) {
|
|
|
6779
7499
|
extractionHash: buildExtractionHash(rewrittenText, extractionArtifact),
|
|
6780
7500
|
attachments,
|
|
6781
7501
|
contentHash
|
|
6782
|
-
};
|
|
7502
|
+
});
|
|
6783
7503
|
}
|
|
6784
7504
|
async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
|
|
6785
7505
|
const originalBytes = await fs11.readFile(absolutePath);
|
|
@@ -6824,18 +7544,23 @@ async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
|
|
|
6824
7544
|
};
|
|
6825
7545
|
}
|
|
6826
7546
|
function isSupportedInboxKind(sourceKind) {
|
|
6827
|
-
return ["markdown", "text", "html", "pdf", "docx", "image"].includes(sourceKind);
|
|
7547
|
+
return ["markdown", "text", "html", "pdf", "docx", "epub", "csv", "xlsx", "pptx", "image"].includes(sourceKind);
|
|
6828
7548
|
}
|
|
6829
7549
|
async function ingestInputDetailed(rootDir, input, options) {
|
|
6830
7550
|
const { paths } = await initWorkspace(rootDir);
|
|
6831
7551
|
const normalizedOptions = normalizeIngestOptions(options);
|
|
6832
7552
|
const absoluteInput = path12.resolve(rootDir, input);
|
|
6833
7553
|
const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await findNearestGitRoot2(absoluteInput).then((value) => value ?? path12.dirname(absoluteInput));
|
|
6834
|
-
const prepared = isHttpUrl(input) ? await
|
|
6835
|
-
return await
|
|
7554
|
+
const prepared = isHttpUrl(input) ? await prepareUrlInputs(rootDir, input, normalizedOptions) : await prepareFileInputs(rootDir, absoluteInput, repoRoot);
|
|
7555
|
+
return await persistPreparedInputs(rootDir, input, prepared, paths);
|
|
6836
7556
|
}
|
|
6837
7557
|
async function ingestInput(rootDir, input, options) {
|
|
6838
|
-
|
|
7558
|
+
const result = await ingestInputDetailed(rootDir, input, options);
|
|
7559
|
+
const manifest = [...result.created, ...result.updated, ...result.unchanged][0];
|
|
7560
|
+
if (!manifest) {
|
|
7561
|
+
throw new Error(`No source manifests were created or updated for ${input}.`);
|
|
7562
|
+
}
|
|
7563
|
+
return manifest;
|
|
6839
7564
|
}
|
|
6840
7565
|
async function addInput(rootDir, input, options = {}) {
|
|
6841
7566
|
const { paths } = await initWorkspace(rootDir);
|
|
@@ -6933,13 +7658,20 @@ async function ingestDirectory(rootDir, inputDir, options) {
|
|
|
6933
7658
|
const progress = createProgressReporter("ingest", files.length);
|
|
6934
7659
|
for (const absolutePath of files) {
|
|
6935
7660
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6936
|
-
const
|
|
6937
|
-
|
|
6938
|
-
|
|
6939
|
-
|
|
6940
|
-
|
|
6941
|
-
|
|
6942
|
-
|
|
7661
|
+
const preparedInputs = await prepareFileInputs(
|
|
7662
|
+
rootDir,
|
|
7663
|
+
absolutePath,
|
|
7664
|
+
repoRoot,
|
|
7665
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
7666
|
+
);
|
|
7667
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
7668
|
+
if (result.created.length) {
|
|
7669
|
+
imported.push(...result.created);
|
|
7670
|
+
}
|
|
7671
|
+
if (result.updated.length) {
|
|
7672
|
+
updated.push(...result.updated);
|
|
7673
|
+
}
|
|
7674
|
+
if (!result.created.length && !result.updated.length && !result.removed.length) {
|
|
6943
7675
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
|
|
6944
7676
|
}
|
|
6945
7677
|
progress.tick();
|
|
@@ -6990,13 +7722,13 @@ async function importInbox(rootDir, inputDir) {
|
|
|
6990
7722
|
continue;
|
|
6991
7723
|
}
|
|
6992
7724
|
const prepared = sourceKind === "markdown" && refsBySource.has(absolutePath) ? await prepareInboxMarkdownInput(absolutePath, refsBySource.get(absolutePath) ?? []) : sourceKind === "html" && refsBySource.has(absolutePath) ? await prepareInboxHtmlInput(absolutePath, refsBySource.get(absolutePath) ?? []) : await prepareFileInput(rootDir, absolutePath);
|
|
6993
|
-
const result = await
|
|
6994
|
-
if (!result.
|
|
7725
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, [prepared], paths);
|
|
7726
|
+
if (!result.created.length) {
|
|
6995
7727
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
|
|
6996
7728
|
continue;
|
|
6997
7729
|
}
|
|
6998
|
-
attachmentCount += result.manifest.attachments?.length ?? 0;
|
|
6999
|
-
imported.push(result.
|
|
7730
|
+
attachmentCount += result.created.reduce((total, manifest) => total + (manifest.attachments?.length ?? 0), 0);
|
|
7731
|
+
imported.push(...result.created);
|
|
7000
7732
|
}
|
|
7001
7733
|
await appendLogEntry(rootDir, "inbox_import", toPosix(path12.relative(rootDir, effectiveInputDir)) || ".", [
|
|
7002
7734
|
`scanned=${files.length}`,
|
|
@@ -7021,7 +7753,10 @@ async function listManifests(rootDir) {
|
|
|
7021
7753
|
const manifests = await Promise.all(
|
|
7022
7754
|
entries.filter((entry) => entry.endsWith(".json")).map((entry) => readJsonFile(path12.join(paths.manifestsDir, entry)))
|
|
7023
7755
|
);
|
|
7024
|
-
return manifests.filter((manifest) => Boolean(manifest))
|
|
7756
|
+
return manifests.filter((manifest) => Boolean(manifest)).map((manifest) => ({
|
|
7757
|
+
...manifest,
|
|
7758
|
+
semanticHash: manifest.semanticHash ?? manifest.contentHash
|
|
7759
|
+
}));
|
|
7025
7760
|
}
|
|
7026
7761
|
async function removeManifestBySourceId(rootDir, sourceId) {
|
|
7027
7762
|
const { paths } = await initWorkspace(rootDir);
|
|
@@ -7029,8 +7764,12 @@ async function removeManifestBySourceId(rootDir, sourceId) {
|
|
|
7029
7764
|
if (!manifest) {
|
|
7030
7765
|
return null;
|
|
7031
7766
|
}
|
|
7032
|
-
|
|
7033
|
-
|
|
7767
|
+
const normalizedManifest = {
|
|
7768
|
+
...manifest,
|
|
7769
|
+
semanticHash: manifest.semanticHash ?? manifest.contentHash
|
|
7770
|
+
};
|
|
7771
|
+
await removeManifestArtifacts(rootDir, normalizedManifest, paths);
|
|
7772
|
+
return normalizedManifest;
|
|
7034
7773
|
}
|
|
7035
7774
|
async function readExtractedText(rootDir, manifest) {
|
|
7036
7775
|
if (!manifest.extractedTextPath) {
|
|
@@ -7176,7 +7915,7 @@ import { z as z7 } from "zod";
|
|
|
7176
7915
|
// src/analysis.ts
|
|
7177
7916
|
import path14 from "path";
|
|
7178
7917
|
import { z as z2 } from "zod";
|
|
7179
|
-
var ANALYSIS_FORMAT_VERSION =
|
|
7918
|
+
// Version stamp written into analysis records as `analysisVersion`.
// analyzeSource only reuses a cached analysis whose `analysisVersion` equals this value.
var ANALYSIS_FORMAT_VERSION = 7;
|
|
7180
7919
|
var sourceAnalysisSchema = z2.object({
|
|
7181
7920
|
title: z2.string().min(1),
|
|
7182
7921
|
summary: z2.string().min(1),
|
|
@@ -7281,6 +8020,7 @@ function heuristicAnalysis(manifest, text, schemaHash) {
|
|
|
7281
8020
|
analysisVersion: ANALYSIS_FORMAT_VERSION,
|
|
7282
8021
|
sourceId: manifest.sourceId,
|
|
7283
8022
|
sourceHash: manifest.contentHash,
|
|
8023
|
+
semanticHash: manifest.semanticHash,
|
|
7284
8024
|
extractionHash: manifest.extractionHash,
|
|
7285
8025
|
schemaHash,
|
|
7286
8026
|
title: deriveTitle(manifest, text),
|
|
@@ -7331,6 +8071,7 @@ ${truncate(text, 18e3)}`
|
|
|
7331
8071
|
analysisVersion: ANALYSIS_FORMAT_VERSION,
|
|
7332
8072
|
sourceId: manifest.sourceId,
|
|
7333
8073
|
sourceHash: manifest.contentHash,
|
|
8074
|
+
semanticHash: manifest.semanticHash,
|
|
7334
8075
|
extractionHash: manifest.extractionHash,
|
|
7335
8076
|
schemaHash: schema.hash,
|
|
7336
8077
|
title: parsed.title,
|
|
@@ -7367,6 +8108,7 @@ function analysisFromVisionExtraction(manifest, extraction, schemaHash) {
|
|
|
7367
8108
|
analysisVersion: ANALYSIS_FORMAT_VERSION,
|
|
7368
8109
|
sourceId: manifest.sourceId,
|
|
7369
8110
|
sourceHash: manifest.contentHash,
|
|
8111
|
+
semanticHash: manifest.semanticHash,
|
|
7370
8112
|
extractionHash: manifest.extractionHash,
|
|
7371
8113
|
schemaHash,
|
|
7372
8114
|
title: extraction.vision.title?.trim() || manifest.title,
|
|
@@ -7405,7 +8147,7 @@ function extractionWarningSummary(manifest, extraction) {
|
|
|
7405
8147
|
async function analyzeSource(manifest, extractedText, provider, paths, schema) {
|
|
7406
8148
|
const cachePath = path14.join(paths.analysesDir, `${manifest.sourceId}.json`);
|
|
7407
8149
|
const cached = await readJsonFile(cachePath);
|
|
7408
|
-
if (cached && cached.analysisVersion === ANALYSIS_FORMAT_VERSION && cached.sourceHash === manifest.
|
|
8150
|
+
if (cached && cached.analysisVersion === ANALYSIS_FORMAT_VERSION && (cached.semanticHash ?? cached.sourceHash) === manifest.semanticHash && cached.extractionHash === manifest.extractionHash && cached.schemaHash === schema.hash) {
|
|
7409
8151
|
return cached;
|
|
7410
8152
|
}
|
|
7411
8153
|
const extraction = await readExtractionArtifact(paths.rootDir, manifest);
|
|
@@ -7422,6 +8164,7 @@ async function analyzeSource(manifest, extractedText, provider, paths, schema) {
|
|
|
7422
8164
|
analysisVersion: ANALYSIS_FORMAT_VERSION,
|
|
7423
8165
|
sourceId: manifest.sourceId,
|
|
7424
8166
|
sourceHash: manifest.contentHash,
|
|
8167
|
+
semanticHash: manifest.semanticHash,
|
|
7425
8168
|
extractionHash: manifest.extractionHash,
|
|
7426
8169
|
schemaHash: schema.hash,
|
|
7427
8170
|
title: manifest.title,
|
|
@@ -7448,6 +8191,7 @@ async function analyzeSource(manifest, extractedText, provider, paths, schema) {
|
|
|
7448
8191
|
analysisVersion: ANALYSIS_FORMAT_VERSION,
|
|
7449
8192
|
sourceId: manifest.sourceId,
|
|
7450
8193
|
sourceHash: manifest.contentHash,
|
|
8194
|
+
semanticHash: manifest.semanticHash,
|
|
7451
8195
|
extractionHash: manifest.extractionHash,
|
|
7452
8196
|
schemaHash: schema.hash,
|
|
7453
8197
|
title: manifest.title,
|
|
@@ -8231,7 +8975,9 @@ async function resolveEmbeddingProvider(rootDir) {
|
|
|
8231
8975
|
}
|
|
8232
8976
|
const provider2 = await createProvider(explicitProviderId, providerConfig, rootDir);
|
|
8233
8977
|
if (!provider2.capabilities.has("embeddings") || typeof provider2.embedTexts !== "function") {
|
|
8234
|
-
throw new Error(
|
|
8978
|
+
throw new Error(
|
|
8979
|
+
`Provider ${provider2.id} does not support required capability "embeddings". Configure tasks.embeddingProvider to use an embedding-capable backend such as ollama or another openai-compatible embedding service.`
|
|
8980
|
+
);
|
|
8235
8981
|
}
|
|
8236
8982
|
return provider2;
|
|
8237
8983
|
}
|
|
@@ -9127,6 +9873,18 @@ function uniqueStrings2(values) {
|
|
|
9127
9873
|
function safeFrontmatter(value) {
|
|
9128
9874
|
return JSON.parse(JSON.stringify(value));
|
|
9129
9875
|
}
|
|
9876
|
+
/**
 * Builds the single-entry hash maps recorded on pages derived from one source.
 *
 * @param {object} manifest Source manifest; reads `sourceId`, `contentHash`
 *   and `semanticHash`.
 * @returns {{sourceHashes: Object<string, string>, sourceSemanticHashes: Object<string, string>}}
 *   Both maps are keyed by `manifest.sourceId`.
 */
function sourceHashesForManifest(manifest) {
  const { sourceId, contentHash } = manifest;
  // A manifest may not carry a semantic hash; fall back to the content hash so
  // the map never holds an `undefined` entry. This is the same
  // `semanticHash ?? contentHash` default used where manifests are loaded.
  const semanticHash = manifest.semanticHash ?? contentHash;
  return {
    sourceHashes: { [sourceId]: contentHash },
    sourceSemanticHashes: { [sourceId]: semanticHash }
  };
}
|
|
9882
|
+
/**
 * Maps the two hash dictionaries onto their snake_case frontmatter keys.
 *
 * @param sourceHashes Value stored under `source_hashes` (not copied).
 * @param sourceSemanticHashes Value stored under `source_semantic_hashes` (not copied).
 * @returns {object} Object with `source_hashes` followed by `source_semantic_hashes`,
 *   intended to be spread into a page's frontmatter.
 */
function sourceHashFrontmatter(sourceHashes, sourceSemanticHashes) {
  const frontmatter = {};
  frontmatter.source_hashes = sourceHashes;
  frontmatter.source_semantic_hashes = sourceSemanticHashes;
  return frontmatter;
}
|
|
9130
9888
|
function decoratedTags(baseTags, decorations) {
|
|
9131
9889
|
return uniqueStrings2([
|
|
9132
9890
|
...baseTags,
|
|
@@ -9190,6 +9948,7 @@ function relatedOutputsSection(relatedOutputs) {
|
|
|
9190
9948
|
function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutputs = [], modulePage, decorations) {
|
|
9191
9949
|
const relativePath = pagePathFor("source", manifest.sourceId);
|
|
9192
9950
|
const pageId = `source:${manifest.sourceId}`;
|
|
9951
|
+
const { sourceHashes, sourceSemanticHashes } = sourceHashesForManifest(manifest);
|
|
9193
9952
|
const moduleNodeIds = analysis.code ? [analysis.code.moduleId, ...analysis.code.symbols.map((symbol) => symbol.id)] : [];
|
|
9194
9953
|
const nodeIds = [
|
|
9195
9954
|
`source:${manifest.sourceId}`,
|
|
@@ -9222,17 +9981,25 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
|
|
|
9222
9981
|
managed_by: metadata.managedBy,
|
|
9223
9982
|
backlinks,
|
|
9224
9983
|
schema_hash: schemaHash,
|
|
9225
|
-
|
|
9226
|
-
[manifest.sourceId]: manifest.contentHash
|
|
9227
|
-
}
|
|
9984
|
+
...sourceHashFrontmatter(sourceHashes, sourceSemanticHashes)
|
|
9228
9985
|
};
|
|
9229
9986
|
const body = [
|
|
9230
9987
|
`# ${analysis.title}`,
|
|
9231
9988
|
"",
|
|
9232
9989
|
`Source ID: \`${manifest.sourceId}\``,
|
|
9990
|
+
`Source Kind: \`${manifest.sourceKind}\``,
|
|
9233
9991
|
manifest.url ? `Source URL: ${manifest.url}` : `Source Path: \`${manifest.originalPath ?? manifest.storedPath}\``,
|
|
9234
9992
|
...manifest.sourceType ? [`Source Type: \`${manifest.sourceType}\``, ""] : [""],
|
|
9235
9993
|
...manifest.sourceClass ? [`Source Class: \`${manifest.sourceClass}\``, ""] : [],
|
|
9994
|
+
...manifest.sourceGroupTitle ? [`Source Group: ${manifest.sourceGroupTitle}`] : [],
|
|
9995
|
+
...manifest.partTitle ? [`Part: ${manifest.partIndex ?? "?"}/${manifest.partCount ?? "?"} - ${manifest.partTitle}`] : [],
|
|
9996
|
+
...manifest.details && Object.keys(manifest.details).length ? [
|
|
9997
|
+
"",
|
|
9998
|
+
"## Source Details",
|
|
9999
|
+
"",
|
|
10000
|
+
...Object.entries(manifest.details).map(([key, value]) => `- ${key.replace(/_/g, " ")}: ${value}`),
|
|
10001
|
+
""
|
|
10002
|
+
] : [],
|
|
9236
10003
|
"",
|
|
9237
10004
|
"## Summary",
|
|
9238
10005
|
"",
|
|
@@ -9287,7 +10054,8 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
|
|
|
9287
10054
|
confidence: metadata.confidence,
|
|
9288
10055
|
backlinks,
|
|
9289
10056
|
schemaHash,
|
|
9290
|
-
sourceHashes
|
|
10057
|
+
sourceHashes,
|
|
10058
|
+
sourceSemanticHashes,
|
|
9291
10059
|
relatedPageIds: [...modulePage ? [modulePage.id] : [], ...relatedOutputs.map((page) => page.id)],
|
|
9292
10060
|
relatedNodeIds: moduleNodeIds,
|
|
9293
10061
|
relatedSourceIds: [],
|
|
@@ -9312,6 +10080,7 @@ function buildModulePage(input) {
|
|
|
9312
10080
|
const localModuleBacklinks = input.localModules.map((moduleRef) => moduleRef.page.id);
|
|
9313
10081
|
const relatedOutputs = input.relatedOutputs ?? [];
|
|
9314
10082
|
const backlinks = uniqueStrings2([sourcePage.id, ...localModuleBacklinks, ...relatedOutputs.map((page) => page.id)]);
|
|
10083
|
+
const { sourceHashes, sourceSemanticHashes } = sourceHashesForManifest(manifest);
|
|
9315
10084
|
const importsSection = code.imports.length ? code.imports.map((item) => {
|
|
9316
10085
|
const localModule = item.resolvedSourceId ? input.localModules.find((moduleRef) => moduleRef.sourceId === item.resolvedSourceId && moduleRef.reExport === item.reExport) : void 0;
|
|
9317
10086
|
const importedBits = [
|
|
@@ -9355,9 +10124,7 @@ function buildModulePage(input) {
|
|
|
9355
10124
|
managed_by: metadata.managedBy,
|
|
9356
10125
|
backlinks,
|
|
9357
10126
|
schema_hash: schemaHash,
|
|
9358
|
-
|
|
9359
|
-
[manifest.sourceId]: manifest.contentHash
|
|
9360
|
-
},
|
|
10127
|
+
...sourceHashFrontmatter(sourceHashes, sourceSemanticHashes),
|
|
9361
10128
|
related_page_ids: uniqueStrings2([sourcePage.id, ...localModuleBacklinks, ...relatedOutputs.map((page) => page.id)]),
|
|
9362
10129
|
related_node_ids: [],
|
|
9363
10130
|
related_source_ids: uniqueStrings2([
|
|
@@ -9433,7 +10200,8 @@ function buildModulePage(input) {
|
|
|
9433
10200
|
confidence: metadata.confidence,
|
|
9434
10201
|
backlinks,
|
|
9435
10202
|
schemaHash,
|
|
9436
|
-
sourceHashes
|
|
10203
|
+
sourceHashes,
|
|
10204
|
+
sourceSemanticHashes,
|
|
9437
10205
|
relatedPageIds: uniqueStrings2([sourcePage.id, ...localModuleBacklinks, ...relatedOutputs.map((page) => page.id)]),
|
|
9438
10206
|
relatedNodeIds: [],
|
|
9439
10207
|
relatedSourceIds: uniqueStrings2([
|
|
@@ -9449,7 +10217,7 @@ function buildModulePage(input) {
|
|
|
9449
10217
|
content: matter5.stringify(body, frontmatter)
|
|
9450
10218
|
};
|
|
9451
10219
|
}
|
|
9452
|
-
function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHashes, schemaHash, metadata, relativePath, relatedOutputs = [], decorations) {
|
|
10220
|
+
function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHashes, sourceSemanticHashes, schemaHash, metadata, relativePath, relatedOutputs = [], decorations) {
|
|
9453
10221
|
const slug = slugify(name);
|
|
9454
10222
|
const pageId = `${kind}:${slug}`;
|
|
9455
10223
|
const sourceIds = sourceAnalyses.map((item) => item.sourceId);
|
|
@@ -9473,7 +10241,7 @@ function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHash
|
|
|
9473
10241
|
managed_by: metadata.managedBy,
|
|
9474
10242
|
backlinks: otherPages,
|
|
9475
10243
|
schema_hash: schemaHash,
|
|
9476
|
-
|
|
10244
|
+
...sourceHashFrontmatter(sourceHashes, sourceSemanticHashes)
|
|
9477
10245
|
};
|
|
9478
10246
|
const body = [
|
|
9479
10247
|
`# ${name}`,
|
|
@@ -9511,6 +10279,7 @@ function buildAggregatePage(kind, name, descriptions, sourceAnalyses, sourceHash
|
|
|
9511
10279
|
backlinks: otherPages,
|
|
9512
10280
|
schemaHash,
|
|
9513
10281
|
sourceHashes,
|
|
10282
|
+
sourceSemanticHashes,
|
|
9514
10283
|
relatedPageIds: relatedOutputs.map((page) => page.id),
|
|
9515
10284
|
relatedNodeIds: [],
|
|
9516
10285
|
relatedSourceIds: [],
|
|
@@ -9551,6 +10320,7 @@ function buildIndexPage(pages, schemaHash, metadata, projectPages = []) {
|
|
|
9551
10320
|
"backlinks: []",
|
|
9552
10321
|
`schema_hash: ${schemaHash}`,
|
|
9553
10322
|
"source_hashes: {}",
|
|
10323
|
+
"source_semantic_hashes: {}",
|
|
9554
10324
|
"---",
|
|
9555
10325
|
"",
|
|
9556
10326
|
"# SwarmVault Index",
|
|
@@ -9614,7 +10384,8 @@ function buildSectionIndex(kind, pages, schemaHash, metadata, projectIds = []) {
|
|
|
9614
10384
|
managed_by: metadata.managedBy,
|
|
9615
10385
|
backlinks: [],
|
|
9616
10386
|
schema_hash: schemaHash,
|
|
9617
|
-
source_hashes: {}
|
|
10387
|
+
source_hashes: {},
|
|
10388
|
+
source_semantic_hashes: {}
|
|
9618
10389
|
}
|
|
9619
10390
|
);
|
|
9620
10391
|
}
|
|
@@ -9910,6 +10681,7 @@ function buildGraphReportPage(input) {
|
|
|
9910
10681
|
backlinks: [],
|
|
9911
10682
|
schema_hash: input.schemaHash,
|
|
9912
10683
|
source_hashes: {},
|
|
10684
|
+
source_semantic_hashes: {},
|
|
9913
10685
|
related_page_ids: relatedPageIds,
|
|
9914
10686
|
related_node_ids: relatedNodeIds,
|
|
9915
10687
|
related_source_ids: relatedSourceIds
|
|
@@ -10025,6 +10797,7 @@ function buildGraphReportPage(input) {
|
|
|
10025
10797
|
backlinks: [],
|
|
10026
10798
|
schemaHash: input.schemaHash,
|
|
10027
10799
|
sourceHashes: {},
|
|
10800
|
+
sourceSemanticHashes: {},
|
|
10028
10801
|
relatedPageIds,
|
|
10029
10802
|
relatedNodeIds,
|
|
10030
10803
|
relatedSourceIds,
|
|
@@ -10068,6 +10841,7 @@ function buildCommunitySummaryPage(input) {
|
|
|
10068
10841
|
backlinks: ["graph:report"],
|
|
10069
10842
|
schema_hash: input.schemaHash,
|
|
10070
10843
|
source_hashes: {},
|
|
10844
|
+
source_semantic_hashes: {},
|
|
10071
10845
|
related_page_ids: uniqueStrings2(["graph:report", ...communityPageIds]),
|
|
10072
10846
|
related_node_ids: input.community.nodeIds,
|
|
10073
10847
|
related_source_ids: relatedSourceIds
|
|
@@ -10107,6 +10881,7 @@ function buildCommunitySummaryPage(input) {
|
|
|
10107
10881
|
backlinks: ["graph:report"],
|
|
10108
10882
|
schemaHash: input.schemaHash,
|
|
10109
10883
|
sourceHashes: {},
|
|
10884
|
+
sourceSemanticHashes: {},
|
|
10110
10885
|
relatedPageIds: uniqueStrings2(["graph:report", ...communityPageIds]),
|
|
10111
10886
|
relatedNodeIds: input.community.nodeIds,
|
|
10112
10887
|
relatedSourceIds,
|
|
@@ -10143,7 +10918,8 @@ function buildProjectsIndex(projectPages, schemaHash, metadata) {
|
|
|
10143
10918
|
managed_by: metadata.managedBy,
|
|
10144
10919
|
backlinks: [],
|
|
10145
10920
|
schema_hash: schemaHash,
|
|
10146
|
-
source_hashes: {}
|
|
10921
|
+
source_hashes: {},
|
|
10922
|
+
source_semantic_hashes: {}
|
|
10147
10923
|
}
|
|
10148
10924
|
);
|
|
10149
10925
|
}
|
|
@@ -10195,7 +10971,8 @@ function buildProjectIndex(input) {
|
|
|
10195
10971
|
managed_by: input.metadata.managedBy,
|
|
10196
10972
|
backlinks: [],
|
|
10197
10973
|
schema_hash: input.schemaHash,
|
|
10198
|
-
source_hashes: {}
|
|
10974
|
+
source_hashes: {},
|
|
10975
|
+
source_semantic_hashes: {}
|
|
10199
10976
|
}
|
|
10200
10977
|
);
|
|
10201
10978
|
}
|
|
@@ -10226,6 +11003,7 @@ function buildOutputPage(input) {
|
|
|
10226
11003
|
backlinks,
|
|
10227
11004
|
schema_hash: input.schemaHash,
|
|
10228
11005
|
source_hashes: {},
|
|
11006
|
+
source_semantic_hashes: {},
|
|
10229
11007
|
related_page_ids: relatedPageIds,
|
|
10230
11008
|
related_node_ids: relatedNodeIds,
|
|
10231
11009
|
related_source_ids: relatedSourceIds,
|
|
@@ -10250,6 +11028,7 @@ function buildOutputPage(input) {
|
|
|
10250
11028
|
backlinks,
|
|
10251
11029
|
schemaHash: input.schemaHash,
|
|
10252
11030
|
sourceHashes: {},
|
|
11031
|
+
sourceSemanticHashes: {},
|
|
10253
11032
|
relatedPageIds,
|
|
10254
11033
|
relatedNodeIds,
|
|
10255
11034
|
relatedSourceIds,
|
|
@@ -10352,6 +11131,7 @@ function buildExploreHubPage(input) {
|
|
|
10352
11131
|
backlinks,
|
|
10353
11132
|
schema_hash: input.schemaHash,
|
|
10354
11133
|
source_hashes: {},
|
|
11134
|
+
source_semantic_hashes: {},
|
|
10355
11135
|
related_page_ids: relatedPageIds,
|
|
10356
11136
|
related_node_ids: relatedNodeIds,
|
|
10357
11137
|
related_source_ids: relatedSourceIds,
|
|
@@ -10376,6 +11156,7 @@ function buildExploreHubPage(input) {
|
|
|
10376
11156
|
backlinks,
|
|
10377
11157
|
schemaHash: input.schemaHash,
|
|
10378
11158
|
sourceHashes: {},
|
|
11159
|
+
sourceSemanticHashes: {},
|
|
10379
11160
|
relatedPageIds,
|
|
10380
11161
|
relatedNodeIds,
|
|
10381
11162
|
relatedSourceIds,
|
|
@@ -10674,6 +11455,9 @@ function normalizeSourceHashes(value) {
|
|
|
10674
11455
|
Object.entries(value).filter((entry) => typeof entry[0] === "string" && typeof entry[1] === "string")
|
|
10675
11456
|
);
|
|
10676
11457
|
}
|
|
11458
|
+
/**
 * Normalizes a stored `source_semantic_hashes` frontmatter value.
 *
 * Semantic hashes use the same representation as content hashes, so this
 * simply delegates to `normalizeSourceHashes`.
 *
 * @param value Raw frontmatter value.
 * @returns Whatever `normalizeSourceHashes` returns for `value`.
 */
function normalizeSourceSemanticHashes(value) {
  const normalized = normalizeSourceHashes(value);
  return normalized;
}
|
|
10677
11461
|
function normalizePageStatus(value, fallback = "active") {
|
|
10678
11462
|
return value === "draft" || value === "candidate" || value === "active" || value === "archived" ? value : fallback;
|
|
10679
11463
|
}
|
|
@@ -10802,6 +11586,7 @@ function parseStoredPage(relativePath, content, defaults = {}) {
|
|
|
10802
11586
|
backlinks,
|
|
10803
11587
|
schemaHash: typeof parsed.data.schema_hash === "string" ? parsed.data.schema_hash : "",
|
|
10804
11588
|
sourceHashes: normalizeSourceHashes(parsed.data.source_hashes),
|
|
11589
|
+
sourceSemanticHashes: normalizeSourceSemanticHashes(parsed.data.source_semantic_hashes),
|
|
10805
11590
|
relatedPageIds,
|
|
10806
11591
|
relatedNodeIds,
|
|
10807
11592
|
relatedSourceIds,
|
|
@@ -10855,6 +11640,7 @@ async function loadInsightPages(wikiDir) {
|
|
|
10855
11640
|
backlinks,
|
|
10856
11641
|
schemaHash: typeof parsed.data.schema_hash === "string" ? parsed.data.schema_hash : "",
|
|
10857
11642
|
sourceHashes: normalizeSourceHashes(parsed.data.source_hashes),
|
|
11643
|
+
sourceSemanticHashes: normalizeSourceSemanticHashes(parsed.data.source_semantic_hashes),
|
|
10858
11644
|
relatedPageIds,
|
|
10859
11645
|
relatedNodeIds,
|
|
10860
11646
|
relatedSourceIds,
|
|
@@ -10955,6 +11741,7 @@ async function loadSavedOutputPages(wikiDir) {
|
|
|
10955
11741
|
backlinks,
|
|
10956
11742
|
schemaHash: typeof parsed.data.schema_hash === "string" ? parsed.data.schema_hash : "",
|
|
10957
11743
|
sourceHashes: normalizeSourceHashes(parsed.data.source_hashes),
|
|
11744
|
+
sourceSemanticHashes: normalizeSourceSemanticHashes(parsed.data.source_semantic_hashes),
|
|
10958
11745
|
relatedPageIds,
|
|
10959
11746
|
relatedNodeIds,
|
|
10960
11747
|
relatedSourceIds,
|
|
@@ -12431,11 +13218,13 @@ function aggregateItems(analyses, kind) {
|
|
|
12431
13218
|
name: item.name,
|
|
12432
13219
|
descriptions: [],
|
|
12433
13220
|
sourceAnalyses: [],
|
|
12434
|
-
sourceHashes: {}
|
|
13221
|
+
sourceHashes: {},
|
|
13222
|
+
sourceSemanticHashes: {}
|
|
12435
13223
|
};
|
|
12436
13224
|
existing.descriptions.push(item.description);
|
|
12437
13225
|
existing.sourceAnalyses.push(analysis);
|
|
12438
13226
|
existing.sourceHashes[analysis.sourceId] = analysis.sourceHash;
|
|
13227
|
+
existing.sourceSemanticHashes[analysis.sourceId] = analysis.semanticHash;
|
|
12439
13228
|
grouped.set(key, existing);
|
|
12440
13229
|
}
|
|
12441
13230
|
}
|
|
@@ -12457,6 +13246,7 @@ function emptyGraphPage(input) {
|
|
|
12457
13246
|
backlinks: [],
|
|
12458
13247
|
schemaHash: input.schemaHash,
|
|
12459
13248
|
sourceHashes: input.sourceHashes,
|
|
13249
|
+
sourceSemanticHashes: input.sourceSemanticHashes ?? {},
|
|
12460
13250
|
relatedPageIds: [],
|
|
12461
13251
|
relatedNodeIds: [],
|
|
12462
13252
|
relatedSourceIds: [],
|
|
@@ -12621,6 +13411,7 @@ async function syncVaultArtifacts(rootDir, input) {
|
|
|
12621
13411
|
nodeIds: [analysis.code.moduleId, ...analysis.code.symbols.map((symbol) => symbol.id)],
|
|
12622
13412
|
schemaHash: sourceSchemaHash,
|
|
12623
13413
|
sourceHashes: { [manifest.sourceId]: manifest.contentHash },
|
|
13414
|
+
sourceSemanticHashes: { [manifest.sourceId]: manifest.semanticHash },
|
|
12624
13415
|
confidence: 1
|
|
12625
13416
|
}) : null;
|
|
12626
13417
|
const preview = emptyGraphPage({
|
|
@@ -12639,6 +13430,7 @@ async function syncVaultArtifacts(rootDir, input) {
|
|
|
12639
13430
|
],
|
|
12640
13431
|
schemaHash: sourceSchemaHash,
|
|
12641
13432
|
sourceHashes: { [manifest.sourceId]: manifest.contentHash },
|
|
13433
|
+
sourceSemanticHashes: { [manifest.sourceId]: manifest.semanticHash },
|
|
12642
13434
|
confidence: 1
|
|
12643
13435
|
});
|
|
12644
13436
|
const sourceRecord = await buildManagedGraphPage(
|
|
@@ -12755,6 +13547,7 @@ async function syncVaultArtifacts(rootDir, input) {
|
|
|
12755
13547
|
aggregate.descriptions,
|
|
12756
13548
|
aggregate.sourceAnalyses,
|
|
12757
13549
|
aggregate.sourceHashes,
|
|
13550
|
+
aggregate.sourceSemanticHashes,
|
|
12758
13551
|
schemaHash,
|
|
12759
13552
|
metadata,
|
|
12760
13553
|
relativePath,
|
|
@@ -13002,6 +13795,7 @@ async function syncVaultArtifacts(rootDir, input) {
|
|
|
13002
13795
|
projectConfigHash: projectConfigHash(config),
|
|
13003
13796
|
analyses: Object.fromEntries(input.analyses.map((analysis) => [analysis.sourceId, analysisSignature(analysis)])),
|
|
13004
13797
|
sourceHashes: Object.fromEntries(input.manifests.map((manifest) => [manifest.sourceId, manifest.contentHash])),
|
|
13798
|
+
sourceSemanticHashes: Object.fromEntries(input.manifests.map((manifest) => [manifest.sourceId, manifest.semanticHash])),
|
|
13005
13799
|
sourceProjects: input.sourceProjects,
|
|
13006
13800
|
outputHashes: input.outputHashes,
|
|
13007
13801
|
insightHashes: input.insightHashes,
|
|
@@ -13467,6 +14261,7 @@ function emptyCompileState() {
|
|
|
13467
14261
|
projectConfigHash: "",
|
|
13468
14262
|
analyses: {},
|
|
13469
14263
|
sourceHashes: {},
|
|
14264
|
+
sourceSemanticHashes: {},
|
|
13470
14265
|
sourceProjects: {},
|
|
13471
14266
|
outputHashes: {},
|
|
13472
14267
|
insightHashes: {},
|
|
@@ -13896,7 +14691,8 @@ async function initVault(rootDir, options = {}) {
|
|
|
13896
14691
|
managed_by: "human",
|
|
13897
14692
|
backlinks: [],
|
|
13898
14693
|
schema_hash: "",
|
|
13899
|
-
source_hashes: {}
|
|
14694
|
+
source_hashes: {},
|
|
14695
|
+
source_semantic_hashes: {}
|
|
13900
14696
|
}
|
|
13901
14697
|
)
|
|
13902
14698
|
);
|
|
@@ -13919,7 +14715,8 @@ async function initVault(rootDir, options = {}) {
|
|
|
13919
14715
|
managed_by: "system",
|
|
13920
14716
|
backlinks: [],
|
|
13921
14717
|
schema_hash: "",
|
|
13922
|
-
source_hashes: {}
|
|
14718
|
+
source_hashes: {},
|
|
14719
|
+
source_semantic_hashes: {}
|
|
13923
14720
|
})
|
|
13924
14721
|
);
|
|
13925
14722
|
await writeFileIfChanged(
|
|
@@ -13941,7 +14738,8 @@ async function initVault(rootDir, options = {}) {
|
|
|
13941
14738
|
managed_by: "system",
|
|
13942
14739
|
backlinks: [],
|
|
13943
14740
|
schema_hash: "",
|
|
13944
|
-
source_hashes: {}
|
|
14741
|
+
source_hashes: {},
|
|
14742
|
+
source_semantic_hashes: {}
|
|
13945
14743
|
})
|
|
13946
14744
|
);
|
|
13947
14745
|
if (options.obsidian) {
|
|
@@ -13982,7 +14780,7 @@ async function compileVault(rootDir, options = {}) {
|
|
|
13982
14780
|
);
|
|
13983
14781
|
const nextProjectConfigHash = projectConfigHash(config);
|
|
13984
14782
|
const projectConfigChanged = !previousState || previousState.projectConfigHash !== nextProjectConfigHash;
|
|
13985
|
-
const previousSourceHashes = previousState?.sourceHashes ?? {};
|
|
14783
|
+
const previousSourceHashes = previousState?.sourceSemanticHashes ?? previousState?.sourceHashes ?? {};
|
|
13986
14784
|
const previousAnalyses = previousState?.analyses ?? {};
|
|
13987
14785
|
const previousSourceProjects = previousState?.sourceProjects ?? {};
|
|
13988
14786
|
const previousOutputHashes = previousState?.outputHashes ?? {};
|
|
@@ -13997,7 +14795,7 @@ async function compileVault(rootDir, options = {}) {
|
|
|
13997
14795
|
const dirty = [];
|
|
13998
14796
|
const clean = [];
|
|
13999
14797
|
for (const manifest of manifests) {
|
|
14000
|
-
const hashChanged = previousSourceHashes[manifest.sourceId] !== manifest.
|
|
14798
|
+
const hashChanged = previousSourceHashes[manifest.sourceId] !== manifest.semanticHash;
|
|
14001
14799
|
const noAnalysis = !previousAnalyses[manifest.sourceId];
|
|
14002
14800
|
const projectId = sourceProjects[manifest.sourceId] ?? null;
|
|
14003
14801
|
const projectChanged = (previousSourceProjects[manifest.sourceId] ?? null) !== projectId;
|
|
@@ -14707,9 +15505,11 @@ function structuralLintFindings(_rootDir, paths, graph, schemas, manifests, sour
|
|
|
14707
15505
|
relatedPageIds: [page.id]
|
|
14708
15506
|
});
|
|
14709
15507
|
}
|
|
14710
|
-
|
|
15508
|
+
const freshnessHashes = Object.keys(page.sourceSemanticHashes).length ? page.sourceSemanticHashes : page.sourceHashes;
|
|
15509
|
+
for (const [sourceId, knownHash] of Object.entries(freshnessHashes)) {
|
|
14711
15510
|
const manifest = manifestMap.get(sourceId);
|
|
14712
|
-
|
|
15511
|
+
const manifestHash = manifest?.semanticHash ?? manifest?.contentHash;
|
|
15512
|
+
if (manifestHash && manifestHash !== knownHash) {
|
|
14713
15513
|
findings.push({
|
|
14714
15514
|
severity: "warning",
|
|
14715
15515
|
code: "stale_page",
|
|
@@ -14848,7 +15648,7 @@ async function bootstrapDemo(rootDir, input) {
|
|
|
14848
15648
|
}
|
|
14849
15649
|
|
|
14850
15650
|
// src/mcp.ts
|
|
14851
|
-
var SERVER_VERSION = "0.
|
|
15651
|
+
// Version string for the MCP server defined in this module.
// NOTE(review): presumably passed to the McpServer constructor in createMcpServer; the usage is outside this view, confirm.
var SERVER_VERSION = "0.3.0";
|
|
14852
15652
|
async function createMcpServer(rootDir) {
|
|
14853
15653
|
const server = new McpServer({
|
|
14854
15654
|
name: "swarmvault",
|
|
@@ -15026,8 +15826,8 @@ async function createMcpServer(rootDir) {
|
|
|
15026
15826
|
}
|
|
15027
15827
|
},
|
|
15028
15828
|
async ({ input }) => {
|
|
15029
|
-
const
|
|
15030
|
-
return asToolText(
|
|
15829
|
+
const result = await ingestInputDetailed(rootDir, input);
|
|
15830
|
+
return asToolText(result);
|
|
15031
15831
|
}
|
|
15032
15832
|
);
|
|
15033
15833
|
server.registerTool(
|
|
@@ -15831,12 +16631,11 @@ async function syncCrawlSource(rootDir, entry, options) {
|
|
|
15831
16631
|
let updatedCount = 0;
|
|
15832
16632
|
for (const pageUrl of crawl.pages) {
|
|
15833
16633
|
const persisted = await ingestInputDetailed(rootDir, pageUrl);
|
|
15834
|
-
currentSourceIds.push(persisted.manifest.sourceId);
|
|
15835
|
-
|
|
15836
|
-
|
|
15837
|
-
|
|
15838
|
-
|
|
15839
|
-
}
|
|
16634
|
+
currentSourceIds.push(...persisted.created.map((manifest) => manifest.sourceId));
|
|
16635
|
+
currentSourceIds.push(...persisted.updated.map((manifest) => manifest.sourceId));
|
|
16636
|
+
currentSourceIds.push(...persisted.unchanged.map((manifest) => manifest.sourceId));
|
|
16637
|
+
importedCount += persisted.created.length;
|
|
16638
|
+
updatedCount += persisted.updated.length;
|
|
15840
16639
|
}
|
|
15841
16640
|
let removedCount = 0;
|
|
15842
16641
|
for (const sourceId of previousSourceIds) {
|
|
@@ -16221,6 +17020,124 @@ import { promisify } from "util";
|
|
|
16221
17020
|
import matter10 from "gray-matter";
|
|
16222
17021
|
import mime2 from "mime-types";
|
|
16223
17022
|
|
|
17023
|
+
// src/graph-presentation.ts
|
|
17024
|
+
// Default node-count threshold for buildViewerGraphArtifact: a graph with at most
// this many nodes is returned unsampled, in "full" presentation mode.
var OVERVIEW_THRESHOLD = 5e3;
|
|
17025
|
+
// Default cap on how many nodes sampleGraphNodes selects; pinned report nodes
// are still kept once the cap has been reached.
var OVERVIEW_NODE_BUDGET = 1500;
|
|
17026
|
+
/**
 * Builds the sort key used to rank a node for overview sampling.
 *
 * Key slots, in order: pinned flag (0 when pinned, 1 otherwise), negated
 * degree, negated bridge score, label, id. Negation makes larger degrees and
 * bridge scores sort first under an ascending comparison.
 *
 * @param {object} node Graph node; reads `id`, `label`, `degree`, `bridgeScore`.
 * @param {Set<string>} pinnedNodeIds Ids that must rank ahead of all others.
 * @returns {Array} Five-slot tuple suitable for `compareTuples`.
 */
function nodePriority(node, pinnedNodeIds) {
  const pinnedRank = pinnedNodeIds.has(node.id) ? 0 : 1;
  const degreeRank = -(node.degree ?? 0);
  const bridgeRank = -(node.bridgeScore ?? 0);
  return [pinnedRank, degreeRank, bridgeRank, node.label, node.id];
}
|
|
17029
|
+
/**
 * Lexicographically compares two sort-key tuples.
 *
 * Slots are compared left to right. Two numbers are compared by subtraction;
 * any other pair is compared as strings with `localeCompare`, treating
 * `null`/`undefined` as the empty string. The first slot that yields a real
 * ordering decides the result.
 *
 * @param {Array} left First tuple.
 * @param {Array} right Second tuple.
 * @returns {number} Negative when `left` sorts first, positive when `right`
 *   sorts first, 0 when no slot distinguishes them.
 */
function compareTuples(left, right) {
  const length = Math.max(left.length, right.length);
  for (let index = 0; index < length; index += 1) {
    const leftValue = left[index];
    const rightValue = right[index];
    if (leftValue === rightValue) {
      continue;
    }
    const order = typeof leftValue === "number" && typeof rightValue === "number"
      ? leftValue - rightValue
      : String(leftValue ?? "").localeCompare(String(rightValue ?? ""));
    // Values that are not `===` can still compare as a tie (undefined vs "",
    // canonically equivalent strings, NaN arithmetic). Keep going so the later
    // tie-breaker slots still get a say instead of reporting equality early.
    if (order !== 0 && !Number.isNaN(order)) {
      return order;
    }
  }
  return 0;
}
|
|
17044
|
+
/**
 * Keeps only the hyperedges that still connect something after sampling.
 *
 * @param {Array<{nodeIds: string[]}>} hyperedges Candidate hyperedges.
 * @param {Set<string>} sampledNodeIds Ids of the nodes that were kept.
 * @returns {Array} The hyperedges with at least two entries of `nodeIds`
 *   present in `sampledNodeIds`, in their original order.
 */
function survivingHyperedges(hyperedges, sampledNodeIds) {
  const countSampled = (nodeIds) => nodeIds.reduce((count, nodeId) => sampledNodeIds.has(nodeId) ? count + 1 : count, 0);
  return hyperedges.filter((hyperedge) => countSampled(hyperedge.nodeIds) >= 2);
}
|
|
17047
|
+
/**
 * Collects the node ids a graph report highlights, so sampling can pin them.
 *
 * Gathers `nodeId` from `report.godNodes` and `report.bridgeNodes`, plus both
 * endpoints of every entry in `report.surprisingConnections`.
 *
 * @param {object|null|undefined} report Graph report, if one is available.
 * @returns {Set<string>} Highlighted node ids; empty when there is no report.
 */
function pinnedNodeIdsForReport(report) {
  if (!report) {
    return new Set();
  }
  // Treat a missing list as empty so a partially populated report does not
  // throw while building the overview.
  const godNodes = report.godNodes ?? [];
  const bridgeNodes = report.bridgeNodes ?? [];
  const surprisingConnections = report.surprisingConnections ?? [];
  return new Set([
    ...godNodes.map((node) => node.nodeId),
    ...bridgeNodes.map((node) => node.nodeId),
    ...surprisingConnections.flatMap((connection) => [connection.sourceNodeId, connection.targetNodeId])
  ]);
}
|
|
17057
|
+
/**
 * Chooses which node ids to show in a size-limited overview of the graph.
 *
 * Selection order:
 *  1. every report-pinned node that exists in the graph;
 *  2. communities, ranked by first-party member count, then member count,
 *     then label and id; within a community, nodes in `nodePriority` order;
 *  3. if the budget is still not reached, all graph nodes in `nodePriority`
 *     order.
 * Non-pinned nodes stop being added once `nodeBudget` ids are selected, so the
 * result can exceed the budget only through pinned nodes.
 *
 * @param {object} graph Graph with `nodes` and optional `communities`.
 * @param {object|null|undefined} report Graph report used to derive pinned nodes.
 * @param {number} [nodeBudget=OVERVIEW_NODE_BUDGET] Target number of selected nodes.
 * @returns {Set<string>} Selected node ids, in selection order.
 */
function sampleGraphNodes(graph, report, nodeBudget = OVERVIEW_NODE_BUDGET) {
  const pinned = pinnedNodeIdsForReport(report);
  const nodeById2 = new Map(graph.nodes.map((node) => [node.id, node]));
  const selected = new Set([...pinned].filter((nodeId) => nodeById2.has(nodeId)));
  const byPriority = (left, right) => compareTuples(nodePriority(left, pinned), nodePriority(right, pinned));
  // Resolve each community's members and sort key once, rather than
  // re-deriving both inside the sort comparator on every comparison.
  const rankedCommunities = (graph.communities ?? []).map((community) => {
    const members = community.nodeIds.map((nodeId) => nodeById2.get(nodeId)).filter((node) => Boolean(node));
    const firstPartyCount = members.filter((node) => node.sourceClass === "first_party").length;
    return {
      members,
      sortKey: [-firstPartyCount, -members.length, community.label, community.id]
    };
  }).sort((left, right) => compareTuples(left.sortKey, right.sortKey));
  for (const { members } of rankedCommunities) {
    // `members` is a private array, so sorting it in place is safe.
    for (const node of members.sort(byPriority)) {
      if (selected.size >= nodeBudget && !pinned.has(node.id)) {
        break;
      }
      selected.add(node.id);
    }
    if (selected.size >= nodeBudget) {
      break;
    }
  }
  if (selected.size < nodeBudget) {
    // Communities did not fill the budget: top up from the whole graph.
    for (const node of [...graph.nodes].sort(byPriority)) {
      if (selected.size >= nodeBudget && !pinned.has(node.id)) {
        break;
      }
      selected.add(node.id);
    }
  }
  return selected;
}
|
|
17093
|
+
/**
 * Produces the graph payload handed to the viewer, annotated with a
 * `presentation` summary.
 *
 * When `options.full` is truthy or the graph has at most `threshold` nodes,
 * the graph is returned as-is in "full" mode. Otherwise an "overview" is
 * built: nodes are sampled via `sampleGraphNodes`, edges are kept only when
 * both endpoints were sampled, hyperedges are kept via `survivingHyperedges`,
 * and communities are trimmed to their sampled members (empty ones dropped).
 *
 * @param {object} graph - Graph with `nodes`, `edges` and optional
 *   `hyperedges` / `communities`.
 * @param {object} [options]
 * @param {number} [options.threshold=OVERVIEW_THRESHOLD] - Largest node count
 *   still rendered in full.
 * @param {number} [options.nodeBudget=OVERVIEW_NODE_BUDGET] - Node budget
 *   passed to the sampler.
 * @param {boolean} [options.full] - Force "full" mode regardless of size.
 * @param {object|null} [options.report] - Graph report forwarded to the sampler.
 * @returns {object} A shallow copy of `graph` plus a `presentation` object.
 */
function buildViewerGraphArtifact(graph, options = {}) {
  const threshold = options.threshold ?? OVERVIEW_THRESHOLD;
  const nodeBudget = options.nodeBudget ?? OVERVIEW_NODE_BUDGET;
  const totalCommunities = graph.communities?.length ?? 0;

  // Shared builder so both modes emit the same fields in the same order.
  const describePresentation = (mode, displayedNodes, displayedEdges, displayedCommunities) => ({
    mode,
    threshold,
    nodeBudget,
    totalNodes: graph.nodes.length,
    displayedNodes,
    totalEdges: graph.edges.length,
    displayedEdges,
    totalCommunities,
    displayedCommunities
  });

  const showEverything = options.full || graph.nodes.length <= threshold;
  if (showEverything) {
    return {
      ...graph,
      presentation: describePresentation("full", graph.nodes.length, graph.edges.length, totalCommunities)
    };
  }

  const sampledNodeIds = sampleGraphNodes(graph, options.report, nodeBudget);
  const isSampled = (nodeId) => sampledNodeIds.has(nodeId);
  const nodes = graph.nodes.filter((node) => isSampled(node.id));
  const edges = graph.edges.filter((edge) => isSampled(edge.source) && isSampled(edge.target));
  const hyperedges = survivingHyperedges(graph.hyperedges ?? [], sampledNodeIds);
  const communities = (graph.communities ?? []).flatMap((community) => {
    const nodeIds = community.nodeIds.filter((nodeId) => isSampled(nodeId));
    return nodeIds.length > 0 ? [{ ...community, nodeIds }] : [];
  });

  return {
    ...graph,
    nodes,
    edges,
    hyperedges,
    communities,
    presentation: describePresentation("overview", nodes.length, edges.length, communities.length)
  };
}
|
|
17140
|
+
|
|
16224
17141
|
// src/watch.ts
|
|
16225
17142
|
import path26 from "path";
|
|
16226
17143
|
import process3 from "process";
|
|
@@ -16686,7 +17603,7 @@ async function ensureViewerDist(viewerDistDir) {
|
|
|
16686
17603
|
await execFileAsync("pnpm", ["build"], { cwd: viewerProjectDir });
|
|
16687
17604
|
}
|
|
16688
17605
|
}
|
|
16689
|
-
async function startGraphServer(rootDir, port) {
|
|
17606
|
+
async function startGraphServer(rootDir, port, options = {}) {
|
|
16690
17607
|
const { config, paths } = await loadVaultConfig(rootDir);
|
|
16691
17608
|
const effectivePort = port ?? config.viewer.port;
|
|
16692
17609
|
await ensureViewerDist(paths.viewerDistDir);
|
|
@@ -16698,8 +17615,16 @@ async function startGraphServer(rootDir, port) {
|
|
|
16698
17615
|
response.end(JSON.stringify({ error: "Graph artifact not found. Run `swarmvault compile` first." }));
|
|
16699
17616
|
return;
|
|
16700
17617
|
}
|
|
17618
|
+
const graph = await readJsonFile(paths.graphPath);
|
|
17619
|
+
if (!graph) {
|
|
17620
|
+
response.writeHead(404, { "content-type": "application/json" });
|
|
17621
|
+
response.end(JSON.stringify({ error: "Graph artifact not found. Run `swarmvault compile` first." }));
|
|
17622
|
+
return;
|
|
17623
|
+
}
|
|
17624
|
+
const reportPath = path27.join(paths.wikiDir, "graph", "report.json");
|
|
17625
|
+
const report = await readJsonFile(reportPath) ?? null;
|
|
16701
17626
|
response.writeHead(200, { "content-type": "application/json" });
|
|
16702
|
-
response.end(
|
|
17627
|
+
response.end(JSON.stringify(buildViewerGraphArtifact(graph, { report, full: options.full ?? false })));
|
|
16703
17628
|
return;
|
|
16704
17629
|
}
|
|
16705
17630
|
if (url.pathname === "/api/graph/query") {
|
|
@@ -16875,7 +17800,7 @@ async function startGraphServer(rootDir, port) {
|
|
|
16875
17800
|
}
|
|
16876
17801
|
};
|
|
16877
17802
|
}
|
|
16878
|
-
async function exportGraphHtml(rootDir, outputPath) {
|
|
17803
|
+
async function exportGraphHtml(rootDir, outputPath, options = {}) {
|
|
16879
17804
|
const { paths } = await loadVaultConfig(rootDir);
|
|
16880
17805
|
const graph = await readJsonFile(paths.graphPath);
|
|
16881
17806
|
if (!graph) {
|
|
@@ -16919,7 +17844,11 @@ async function exportGraphHtml(rootDir, outputPath) {
|
|
|
16919
17844
|
const script = await fs22.readFile(scriptPath, "utf8");
|
|
16920
17845
|
const style = stylePath && await fileExists(stylePath) ? await fs22.readFile(stylePath, "utf8") : "";
|
|
16921
17846
|
const report = await readJsonFile(path27.join(paths.wikiDir, "graph", "report.json"));
|
|
16922
|
-
const embeddedData = JSON.stringify(
|
|
17847
|
+
const embeddedData = JSON.stringify(
|
|
17848
|
+
{ graph: buildViewerGraphArtifact(graph, { report, full: options.full ?? false }), pages: pages.filter(Boolean), report },
|
|
17849
|
+
null,
|
|
17850
|
+
2
|
|
17851
|
+
).replace(/</g, "\\u003c");
|
|
16923
17852
|
const html = [
|
|
16924
17853
|
"<!doctype html>",
|
|
16925
17854
|
'<html lang="en">',
|
|
@@ -16968,6 +17897,7 @@ export {
|
|
|
16968
17897
|
importInbox,
|
|
16969
17898
|
ingestDirectory,
|
|
16970
17899
|
ingestInput,
|
|
17900
|
+
ingestInputDetailed,
|
|
16971
17901
|
initVault,
|
|
16972
17902
|
initWorkspace,
|
|
16973
17903
|
installAgent,
|