@swarmvaultai/engine 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/index.d.ts +56 -4
- package/dist/index.js +2133 -147
- package/package.json +8 -1
package/dist/index.js
CHANGED
|
@@ -1729,7 +1729,7 @@ import matter3 from "gray-matter";
|
|
|
1729
1729
|
import ignore from "ignore";
|
|
1730
1730
|
import { JSDOM as JSDOM2 } from "jsdom";
|
|
1731
1731
|
import mime from "mime-types";
|
|
1732
|
-
import
|
|
1732
|
+
import TurndownService2 from "turndown";
|
|
1733
1733
|
|
|
1734
1734
|
// src/code-analysis.ts
|
|
1735
1735
|
import fs6 from "fs/promises";
|
|
@@ -4504,8 +4504,11 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
|
|
|
4504
4504
|
import fs7 from "fs/promises";
|
|
4505
4505
|
import os from "os";
|
|
4506
4506
|
import path7 from "path";
|
|
4507
|
+
import { Readable } from "stream";
|
|
4508
|
+
import { parse as parseCsvSync } from "csv-parse/sync";
|
|
4507
4509
|
import { strFromU8, unzipSync } from "fflate";
|
|
4508
4510
|
import { JSDOM } from "jsdom";
|
|
4511
|
+
import TurndownService from "turndown";
|
|
4509
4512
|
import { z } from "zod";
|
|
4510
4513
|
var imageVisionExtractionSchema = z.object({
|
|
4511
4514
|
title: z.string().min(1).nullable().optional(),
|
|
@@ -4685,7 +4688,7 @@ function normalizePdfMetadata(raw) {
|
|
|
4685
4688
|
function normalizeDocumentText(raw) {
|
|
4686
4689
|
return raw.replace(/\r\n/g, "\n").split(/\n{2,}/).map((section) => normalizeWhitespace(section)).filter(Boolean).join("\n\n").trim();
|
|
4687
4690
|
}
|
|
4688
|
-
function
|
|
4691
|
+
function parseOfficeCoreMetadata(bytes) {
|
|
4689
4692
|
try {
|
|
4690
4693
|
const archive = unzipSync(new Uint8Array(bytes));
|
|
4691
4694
|
const coreXml = archive["docProps/core.xml"];
|
|
@@ -4725,6 +4728,122 @@ function parseDocxCoreMetadata(bytes) {
|
|
|
4725
4728
|
return void 0;
|
|
4726
4729
|
}
|
|
4727
4730
|
}
|
|
4731
|
+
/**
 * Decode raw bytes as UTF-8 text and drop a leading byte-order mark.
 *
 * @param {Buffer} bytes - Raw content; decoded with `toString("utf8")`.
 * @returns {string} The decoded text without a leading U+FEFF character.
 */
function decodeTextBytes(bytes) {
  const BYTE_ORDER_MARK = 0xfeff;
  const decoded = bytes.toString("utf8");
  if (decoded.charCodeAt(0) === BYTE_ORDER_MARK) {
    return decoded.slice(1);
  }
  return decoded;
}
|
|
4735
|
+
/**
 * Turn an arbitrary cell value into a whitespace-normalized string.
 *
 * `null` and `undefined` become the empty string before normalization.
 *
 * @param {unknown} value - Raw cell value.
 * @returns {string} Result of `normalizeWhitespace` (defined elsewhere in this file).
 */
function normalizeTableCell(value) {
  const raw = value ?? "";
  return normalizeWhitespace(String(raw));
}
|
|
4738
|
+
/**
 * Report whether a cell value reads as a finite number.
 *
 * The value is stringified and trimmed first, because `Number("   ")` is `0`
 * and a blank cell must not be classified as numeric.
 *
 * @param {unknown} value - Cell value, normally an already-normalized string.
 * @returns {boolean} `true` when the trimmed text is non-empty and converts to a finite number.
 */
function isNumericCell(value) {
  const text = String(value ?? "").trim();
  return text.length > 0 && Number.isFinite(Number(text));
}
|
|
4741
|
+
/**
 * Split a table into a header row and body rows.
 *
 * The first row is treated as a header when the table has more than one row
 * and the first row has at least one non-empty cell, no duplicate non-empty
 * cells, and at least half of its non-empty cells are non-numeric. Otherwise
 * synthetic `column_N` headers are generated for the widest row and every row
 * is kept as body.
 *
 * @param {string[][]} rows - Table rows of already-normalized cell strings.
 * @returns {{ headers: string[], bodyRows: string[][] }} Header labels and remaining rows.
 */
function detectHeaderRow(rows) {
  if (!rows.length) {
    return { headers: [], bodyRows: [] };
  }
  const firstRow = rows[0] ?? [];
  const nonEmpty = firstRow.filter(Boolean);
  const unique = new Set(nonEmpty);
  const nonNumeric = nonEmpty.filter((value) => !isNumericCell(value));
  const looksLikeHeader =
    nonEmpty.length > 0 &&
    unique.size === nonEmpty.length &&
    nonNumeric.length >= Math.ceil(nonEmpty.length / 2) &&
    rows.length > 1;
  if (looksLikeHeader) {
    return {
      // Blank header cells still need a usable label.
      headers: firstRow.map((value, index) => value || `column_${index + 1}`),
      bodyRows: rows.slice(1)
    };
  }
  // A plain loop instead of Math.max(...rows.map(...)): spreading one argument
  // per row throws a RangeError once the table has a few hundred thousand rows.
  let columnCount = 0;
  for (const row of rows) {
    if (row.length > columnCount) {
      columnCount = row.length;
    }
  }
  return {
    headers: Array.from({ length: columnCount }, (_, index) => `column_${index + 1}`),
    bodyRows: rows
  };
}
|
|
4762
|
+
/**
 * Build one markdown bullet per column that has a recognisable profile.
 *
 * A column whose non-empty cells are all numeric yields `- <header>: numeric`.
 * Otherwise, a column with at most six distinct non-empty values yields a
 * `low-cardinality` bullet listing them. Columns with no non-empty cells, or
 * with more distinct values, produce nothing.
 *
 * @param {string[]} headers - Column labels; the index selects the cell in each row.
 * @param {unknown[][]} rows - Body rows.
 * @returns {string[]} Hint bullets in header order.
 */
function columnHints(headers, rows) {
  const hints = [];
  headers.forEach((header, columnIndex) => {
    const cells = [];
    for (const row of rows) {
      const cell = normalizeTableCell(row[columnIndex] ?? "");
      if (cell) {
        cells.push(cell);
      }
    }
    if (cells.length === 0) {
      return;
    }
    const distinct = [...new Set(cells)];
    if (cells.every((cell) => isNumericCell(cell))) {
      hints.push(`- ${header}: numeric`);
      return;
    }
    // At most six distinct values, so the full list is always short enough to print.
    if (distinct.length <= 6) {
      hints.push(`- ${header}: low-cardinality (${distinct.join(", ")})`);
    }
  });
  return hints;
}
|
|
4778
|
+
/**
 * Render headers and the first `rowLimit` rows as markdown table lines.
 *
 * Pipe characters inside headers and cells are escaped as `\|`; an unescaped
 * pipe would otherwise be read as a cell boundary and shift every following
 * column. Rows are padded or cut to the header width.
 *
 * @param {string[]} headers - Column labels; an empty list yields a placeholder line.
 * @param {unknown[][]} rows - Body rows.
 * @param {number} [rowLimit=20] - Maximum number of body rows to render.
 * @returns {string[]} Table lines (header, separator, body), or a single placeholder line.
 */
function markdownTable(headers, rows, rowLimit = 20) {
  if (!headers.length) {
    return ["No tabular preview available."];
  }
  const escapePipes = (text) => String(text).replace(/\|/g, "\\|");
  const width = headers.length;
  const lines = [
    `| ${headers.map(escapePipes).join(" | ")} |`,
    `| ${headers.map(() => "---").join(" | ")} |`
  ];
  for (const row of rows.slice(0, rowLimit)) {
    const cells = Array.from({ length: width }, (_, index) => escapePipes(normalizeTableCell(row[index] ?? "")));
    lines.push(`| ${cells.join(" | ")} |`);
  }
  return lines;
}
|
|
4790
|
+
/**
 * Read one archive entry as text.
 *
 * @param {Record<string, Uint8Array>} archive - Map of entry path to bytes, as returned by `unzipSync`.
 * @param {string} entryPath - Path of the entry inside the archive.
 * @returns {string | undefined} Text decoded with `strFromU8`, or `undefined` when the entry is missing.
 */
function zipEntryText(archive, entryPath) {
  const bytes = archive[entryPath];
  if (!bytes) {
    return void 0;
  }
  return strFromU8(bytes);
}
|
|
4794
|
+
/**
 * Parse an XML string into a DOM document using JSDOM's `text/xml` mode.
 *
 * @param {string} xml - XML source.
 * @returns {Document} The document of the JSDOM window.
 */
function parseXmlDocument(xml) {
  const dom = new JSDOM(xml, { contentType: "text/xml" });
  return dom.window.document;
}
|
|
4797
|
+
/**
 * Return the directory part of a `/`-separated archive path.
 *
 * @param {string} value - Entry path inside the archive.
 * @returns {string} Everything before the last `/`, or `""` when there is none.
 */
function zipDirname(value) {
  const lastSlash = value.lastIndexOf("/");
  if (lastSlash < 0) {
    return "";
  }
  return value.slice(0, lastSlash);
}
|
|
4801
|
+
/**
 * Resolve a relationship or manifest target to an archive entry path.
 *
 * A target starting with `/` is taken as relative to the archive root and is
 * only stripped of its leading slashes and normalized. Previously it was
 * joined onto the base directory, turning `/ppt/slides/slide1.xml` into
 * `ppt/ppt/slides/slide1.xml`. Any other target is resolved against the
 * directory of `basePath`.
 *
 * @param {string} basePath - Archive path of the part that declares the target.
 * @param {string} target - Target as written in the relationship or manifest.
 * @returns {string} Normalized POSIX-style entry path.
 */
function resolveZipTarget(basePath, target) {
  if (target.startsWith("/")) {
    return path7.posix.normalize(target.replace(/^\/+/, ""));
  }
  // Same directory rule as zipDirname: text before the last "/", or "" if none.
  const lastSlash = basePath.lastIndexOf("/");
  const baseDir = lastSlash === -1 ? "" : basePath.slice(0, lastSlash);
  return path7.posix.normalize(path7.posix.join(baseDir, target));
}
|
|
4804
|
+
/**
 * Index the `Relationship` elements of a `.rels` document by their `Id`.
 *
 * Elements missing an `Id` or a `Target` are ignored. Each target is resolved
 * with `resolveZipTarget` against `basePath`; a missing `Type` becomes `""`.
 *
 * @param {string} xml - Contents of the relationships part.
 * @param {string} basePath - Archive path of the part the relationships belong to.
 * @returns {Map<string, { target: string, type: string }>} Relationship id to resolved target and type.
 */
function relationshipTargets(xml, basePath) {
  const relationships = new Map();
  const elements = Array.from(parseXmlDocument(xml).getElementsByTagName("*"));
  elements
    .filter((element) => element.localName === "Relationship")
    .forEach((element) => {
      const id = element.getAttribute("Id")?.trim();
      const target = element.getAttribute("Target")?.trim();
      const type = element.getAttribute("Type")?.trim() ?? "";
      if (id && target) {
        relationships.set(id, { target: resolveZipTarget(basePath, target), type });
      }
    });
  return relationships;
}
|
|
4821
|
+
/**
 * Collect the normalized text of every element with the given local name.
 *
 * Matching is on `localName`, so the namespace prefix is ignored. Elements
 * whose normalized text is empty are left out.
 *
 * @param {string} xml - XML source.
 * @param {string} localName - Element local name to match, e.g. `"t"`.
 * @returns {string[]} Non-empty texts in document order.
 */
function xmlTextNodes(xml, localName) {
  const elements = Array.from(parseXmlDocument(xml).getElementsByTagName("*"));
  return elements
    .filter((element) => element.localName === localName)
    .map((element) => normalizeWhitespace(element.textContent ?? ""))
    .filter((text) => Boolean(text));
}
|
|
4835
|
+
/**
 * Return the text of the first `h1`, `h2` or `h3` in an HTML string.
 *
 * @param {string} html - HTML source.
 * @returns {string | undefined} Normalized heading text, or `undefined` when there is none or it is empty.
 */
function firstHtmlHeading(html) {
  const { document } = new JSDOM(html).window;
  const headingText = document.querySelector("h1, h2, h3")?.textContent ?? "";
  const normalized = normalizeWhitespace(headingText);
  if (!normalized) {
    return void 0;
  }
  return normalized;
}
|
|
4841
|
+
/**
 * Convert an HTML string to markdown with Turndown.
 *
 * The input is parsed with JSDOM and only the body's inner HTML is converted;
 * the raw input is used when the parsed document has no body. Headings are
 * emitted in ATX style and code blocks are fenced.
 *
 * @param {string} html - HTML source.
 * @returns {string} Trimmed markdown.
 */
function htmlToMarkdown(html) {
  const parsed = new JSDOM(html);
  const converter = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
  const { body } = parsed.window.document;
  const fragment = body?.innerHTML ?? html;
  const markdown = converter.turndown(fragment);
  return markdown.trim();
}
|
|
4728
4847
|
async function extractPdfText(input) {
|
|
4729
4848
|
try {
|
|
4730
4849
|
const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
@@ -4765,39 +4884,793 @@ async function extractPdfText(input) {
|
|
|
4765
4884
|
};
|
|
4766
4885
|
} catch (error) {
|
|
4767
4886
|
return {
|
|
4768
|
-
artifact: {
|
|
4769
|
-
...extractionMetadata("pdf", input.mimeType, "pdf_text"),
|
|
4770
|
-
warnings: [`PDF text extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
4771
|
-
}
|
|
4887
|
+
artifact: {
|
|
4888
|
+
...extractionMetadata("pdf", input.mimeType, "pdf_text"),
|
|
4889
|
+
warnings: [`PDF text extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
4890
|
+
}
|
|
4891
|
+
};
|
|
4892
|
+
}
|
|
4893
|
+
}
|
|
4894
|
+
/**
 * Extract plain text and core metadata from a DOCX file using mammoth.
 *
 * Converter messages are normalized, truncated to 240 characters and reported
 * as warnings. An extra warning is added when no text comes out. Any thrown
 * error is turned into a warning-only artifact instead of propagating.
 *
 * @param {{ bytes: Buffer, mimeType: string }} input - File bytes and MIME type.
 * @returns {Promise<{ extractedText?: string, artifact: object }>} Extracted text (omitted when empty) and artifact.
 */
async function extractDocxText(input) {
  try {
    const mammoth = await import("mammoth");
    const conversion = await mammoth.extractRawText({ buffer: input.bytes });
    const text = normalizeDocumentText(conversion.value);
    const warnings = [];
    for (const entry of conversion.messages) {
      const cleaned = normalizeWhitespace(entry.message);
      if (cleaned) {
        warnings.push(truncate(cleaned, 240));
      }
    }
    const base = extractionMetadata("docx", input.mimeType, "docx_text");
    const metadata = parseOfficeCoreMetadata(input.bytes);
    if (!text) {
      warnings.push("DOCX text extraction completed but produced no extractable text.");
    }
    return {
      extractedText: text || void 0,
      artifact: {
        ...base,
        metadata,
        warnings: warnings.length ? warnings : void 0
      }
    };
  } catch (error) {
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      artifact: {
        ...extractionMetadata("docx", input.mimeType, "docx_text"),
        warnings: [`DOCX text extraction failed: ${reason}`]
      }
    };
  }
}
|
|
4923
|
+
/**
 * Summarise a CSV or TSV file as markdown: format, row and column counts,
 * headers, per-column hints and a table preview.
 *
 * The file is treated as TSV when its name ends in `.tsv` or its MIME type
 * contains `tab-separated`. The TSV delimiter is written as the `"\t"` escape
 * so it cannot be mistaken for, or degraded into, a space. Only `null` entries
 * are dropped when the output is assembled, so the `""` entries survive as the
 * blank separator lines they were added for.
 *
 * Any thrown error is turned into a warning-only artifact.
 *
 * @param {{ bytes: Buffer, mimeType: string, fileName?: string }} input - File bytes, MIME type and optional name.
 * @returns {Promise<{ title?: string, extractedText?: string, artifact: object }>} Summary text and artifact.
 */
async function extractCsvText(input) {
  try {
    const rawText = decodeTextBytes(input.bytes);
    const isTsv = Boolean(input.fileName?.toLowerCase().endsWith(".tsv") || input.mimeType.includes("tab-separated"));
    const delimiter = isTsv ? "\t" : ",";
    const parsed = parseCsvSync(rawText, {
      delimiter,
      relax_column_count: true,
      skip_empty_lines: true,
      trim: true
    });
    const rows = parsed.map((row) => row.map((value) => normalizeTableCell(value)));
    const { headers, bodyRows } = detectHeaderRow(rows);
    const hintLines = columnHints(headers, bodyRows);
    const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
    const extractedText = [
      title ? `# ${title}` : null,
      `Format: ${isTsv ? "TSV" : "CSV"}`,
      `Rows: ${bodyRows.length}`,
      `Columns: ${headers.length}`,
      headers.length ? `Headers: ${headers.join(", ")}` : null,
      "",
      hintLines.length ? "## Column Hints" : null,
      hintLines.length ? hintLines.join("\n") : null,
      hintLines.length ? "" : null,
      "## Preview",
      ...markdownTable(headers, bodyRows)
    ].filter((item) => item !== null).join("\n").trim();
    const artifact = {
      ...extractionMetadata("csv", input.mimeType, "csv_text"),
      metadata: {
        format: isTsv ? "tsv" : "csv",
        row_count: String(bodyRows.length),
        column_count: String(headers.length),
        headers: headers.join(", ")
      }
    };
    return {
      title,
      extractedText,
      artifact
    };
  } catch (error) {
    return {
      artifact: {
        ...extractionMetadata("csv", input.mimeType, "csv_text"),
        warnings: [`CSV extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
      }
    };
  }
}
|
|
4973
|
+
/**
 * Summarise an XLSX workbook as markdown with a table preview per sheet.
 *
 * Only the first 10 sheets are previewed; a warning is attached when the
 * workbook has more. The title is the workbook's `Title` property, falling
 * back to the file name without its extension. Only `null` entries are
 * dropped when the output is assembled, so the `""` entries survive as the
 * blank lines separating sheet sections.
 *
 * Any thrown error is turned into a warning-only artifact.
 *
 * @param {{ bytes: Buffer, mimeType: string, fileName?: string }} input - File bytes, MIME type and optional name.
 * @returns {Promise<{ title?: string, extractedText?: string, artifact: object }>} Summary text and artifact.
 */
async function extractXlsxText(input) {
  try {
    const XLSX = await import("xlsx");
    const workbook = XLSX.read(input.bytes, { type: "buffer", cellFormula: false, cellHTML: false, cellStyles: false });
    const allSheetNames = workbook.SheetNames;
    const sheetNames = allSheetNames.slice(0, 10);
    const sheetSections = [];
    const metadata = {
      sheet_count: String(allSheetNames.length),
      sheet_names: allSheetNames.join(", ")
    };
    for (const sheetName of sheetNames) {
      const sheet = workbook.Sheets[sheetName];
      if (!sheet) {
        continue;
      }
      // header: 1 returns each row as an array of cells rather than an object.
      const rows = XLSX.utils.sheet_to_json(sheet, {
        header: 1,
        raw: false,
        defval: ""
      }).map((row) => row.map((value) => normalizeTableCell(value)));
      const { headers, bodyRows } = detectHeaderRow(rows);
      sheetSections.push(`## Sheet: ${sheetName}`);
      sheetSections.push(`Rows: ${bodyRows.length}`);
      sheetSections.push(`Columns: ${headers.length}`);
      sheetSections.push(...markdownTable(headers, bodyRows));
      sheetSections.push("");
    }
    const title = normalizeWhitespace(String(workbook.Props?.Title ?? "")) || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
    const extractedText = [
      title ? `# ${title}` : null,
      `Sheets: ${allSheetNames.length}`,
      allSheetNames.length ? `Sheet Names: ${allSheetNames.join(", ")}` : null,
      "",
      ...sheetSections
    ].filter((item) => item !== null).join("\n").trim();
    const warnings = allSheetNames.length > sheetNames.length ? ["Workbook preview truncated to the first 10 sheets."] : void 0;
    return {
      title,
      extractedText,
      artifact: {
        ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
        metadata,
        warnings
      }
    };
  } catch (error) {
    return {
      artifact: {
        ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
        warnings: [`XLSX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
      }
    };
  }
}
|
|
5028
|
+
/**
 * Extract slide text and speaker notes from a PPTX file as markdown.
 *
 * Slides are taken in the order of the `sldId` elements in
 * `ppt/presentation.xml` and resolved through the presentation relationships.
 * At most 60 slides are extracted. The truncation warning is based on the
 * number of slides that actually resolved, so an unresolvable slide
 * relationship no longer reports a truncation that did not happen. Only
 * `null` entries are dropped when the output is assembled, so the `""`
 * entries survive as the blank lines separating slides.
 *
 * Any thrown error, including a missing presentation part, is turned into a
 * warning-only artifact.
 *
 * @param {{ bytes: Buffer, mimeType: string, fileName?: string }} input - File bytes, MIME type and optional name.
 * @returns {Promise<{ title?: string, extractedText?: string, artifact: object }>} Slide text and artifact.
 */
async function extractPptxText(input) {
  try {
    const archive = unzipSync(new Uint8Array(input.bytes));
    const presentationXml = zipEntryText(archive, "ppt/presentation.xml");
    if (!presentationXml) {
      throw new Error("Missing ppt/presentation.xml");
    }
    const relsXml = zipEntryText(archive, "ppt/_rels/presentation.xml.rels");
    if (!relsXml) {
      throw new Error("Missing ppt/_rels/presentation.xml.rels");
    }
    const rels = relationshipTargets(relsXml, "ppt/presentation.xml");
    const presentation = parseXmlDocument(presentationXml);
    const resolvedTargets = Array.from(presentation.getElementsByTagName("*"))
      .filter((node) => node.localName === "sldId")
      .map((node) => node.getAttribute("r:id")?.trim())
      .filter((value) => Boolean(value))
      .map((relationshipId) => rels.get(relationshipId)?.target)
      .filter((value) => Boolean(value));
    const slideTargets = resolvedTargets.slice(0, 60);
    const slideSections = [];
    for (let index = 0; index < slideTargets.length; index += 1) {
      const slidePath = slideTargets[index];
      const slideXml = zipEntryText(archive, slidePath);
      if (!slideXml) {
        continue;
      }
      const slideTexts = xmlTextNodes(slideXml, "t");
      // The first text run stands in for the slide title.
      const slideTitle = slideTexts[0] ?? `Slide ${index + 1}`;
      slideSections.push(`## Slide ${index + 1}: ${slideTitle}`);
      if (slideTexts.length) {
        slideSections.push(slideTexts.join("\n"));
      }
      const slideRelsPath = `${zipDirname(slidePath)}/_rels/${path7.posix.basename(slidePath)}.rels`;
      const slideRelsXml = zipEntryText(archive, slideRelsPath);
      if (slideRelsXml) {
        const slideRels = relationshipTargets(slideRelsXml, slidePath);
        const notesTarget = [...slideRels.values()].find((entry) => entry.type.endsWith("/notesSlide"))?.target;
        if (notesTarget) {
          const notesXml = zipEntryText(archive, notesTarget);
          const noteTexts = notesXml ? xmlTextNodes(notesXml, "t") : [];
          if (noteTexts.length) {
            slideSections.push("Notes:");
            slideSections.push(noteTexts.join("\n"));
          }
        }
      }
      slideSections.push("");
    }
    const metadata = parseOfficeCoreMetadata(input.bytes);
    const title = metadata?.title || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
    const extractedText = [title ? `# ${title}` : null, `Slides: ${slideTargets.length}`, "", ...slideSections]
      .filter((item) => item !== null)
      .join("\n")
      .trim();
    return {
      title,
      extractedText,
      artifact: {
        ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
        metadata: {
          ...metadata ?? {},
          slide_count: String(slideTargets.length)
        },
        warnings: resolvedTargets.length > slideTargets.length ? ["Slide extraction truncated to the first 60 slides."] : void 0
      }
    };
  } catch (error) {
    return {
      artifact: {
        ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
        warnings: [`PPTX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
      }
    };
  }
}
|
|
5095
|
+
/**
 * Extract the chapters of an EPUB as markdown, in spine order.
 *
 * The package document is located through `META-INF/container.xml`. Spine
 * items are skipped when they are not HTML/XHTML, are the `nav` document,
 * cannot be read, convert to empty markdown, or are titled
 * "Table of Contents".
 *
 * Manifest hrefs are URL references, so a chapter is looked up by its literal
 * href first, then without a `#fragment`, then percent-decoded. Previously
 * only the literal href was tried, so a file stored as `chapter 1.xhtml` but
 * referenced as `chapter%201.xhtml` was silently dropped.
 *
 * Any thrown error is turned into an empty chapter list with a warning.
 *
 * @param {{ bytes: Buffer, fileName?: string }} input - File bytes and optional name.
 * @returns {Promise<{ title?: string, author?: string, chapters: object[], warnings?: string[] }>} Book title, author and chapters.
 */
async function extractEpubChapters(input) {
  try {
    const archive = unzipSync(new Uint8Array(input.bytes));
    const containerXml = zipEntryText(archive, "META-INF/container.xml");
    if (!containerXml) {
      throw new Error("Missing META-INF/container.xml");
    }
    const container = parseXmlDocument(containerXml);
    const rootfile = Array.from(container.getElementsByTagName("*")).find((node) => node.localName === "rootfile");
    const packagePath = rootfile?.getAttribute("full-path")?.trim();
    if (!packagePath) {
      throw new Error("EPUB container did not declare a package document.");
    }
    const packageXml = zipEntryText(archive, packagePath);
    if (!packageXml) {
      throw new Error(`Missing EPUB package document: ${packagePath}`);
    }
    const packageDocument = parseXmlDocument(packageXml);
    const manifestEntries = new Map(
      Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "item").map(
        (node) => [
          node.getAttribute("id")?.trim() ?? "",
          {
            href: node.getAttribute("href")?.trim() ?? "",
            mediaType: node.getAttribute("media-type")?.trim() ?? "",
            properties: node.getAttribute("properties")?.trim() ?? ""
          }
        ]
      ).filter(([id, item]) => Boolean(id && item.href))
    );
    const spineIds = Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "itemref").map((node) => node.getAttribute("idref")?.trim()).filter((value) => Boolean(value));
    const bookTitle = xmlTextNodes(packageXml, "title")[0] || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
    const author = xmlTextNodes(packageXml, "creator")[0];
    // Try the literal href, then progressively more URL-aware forms of it.
    const readManifestEntry = (href) => {
      const withoutFragment = href.split("#")[0];
      const candidates = [href, withoutFragment];
      try {
        candidates.push(decodeURIComponent(withoutFragment));
      } catch {
        // Malformed percent-escape: only the literal forms can be tried.
      }
      for (const candidate of new Set(candidates)) {
        if (!candidate) {
          continue;
        }
        const text = zipEntryText(archive, resolveZipTarget(packagePath, candidate));
        if (text) {
          return text;
        }
      }
      return void 0;
    };
    const chapters = [];
    for (const spineId of spineIds) {
      const item = manifestEntries.get(spineId);
      // "xhtml" contains "html", so a single check covers both media types.
      if (!item || !item.mediaType.includes("html")) {
        continue;
      }
      if (item.properties.split(/\s+/).includes("nav")) {
        continue;
      }
      const html = readManifestEntry(item.href);
      if (!html) {
        continue;
      }
      const markdown = htmlToMarkdown(html);
      if (!markdown) {
        continue;
      }
      const chapterTitle = firstHtmlHeading(html) || markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() || item.href;
      const normalizedTitle = normalizeWhitespace(chapterTitle);
      if (!normalizedTitle || /^table of contents$/i.test(normalizedTitle)) {
        continue;
      }
      chapters.push({
        partKey: item.href,
        title: normalizedTitle,
        markdown,
        metadata: {
          book_title: bookTitle ?? "",
          chapter_title: normalizedTitle,
          author: author ?? ""
        }
      });
    }
    return {
      title: bookTitle,
      author,
      chapters,
      warnings: chapters.length ? void 0 : ["EPUB extraction completed but found no chapter-like spine entries."]
    };
  } catch (error) {
    return {
      chapters: [],
      warnings: [`EPUB extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
    };
  }
}
|
|
5175
|
+
/**
 * Format a millisecond offset as `HH:MM:SS.mmm`.
 *
 * Negative values clamp to zero and fractions of a millisecond are dropped.
 * A value that does not convert to a finite number is treated as zero;
 * previously it produced `NaN:NaN:NaN.NaN`. Hours are not capped, so offsets
 * of 100 hours or more print with more than two hour digits.
 *
 * @param {number} value - Offset in milliseconds.
 * @returns {string} Zero-padded timestamp.
 */
function timestampFromMs(value) {
  const numeric = Number(value);
  const totalMs = Number.isFinite(numeric) ? Math.max(0, Math.floor(numeric)) : 0;
  const totalSeconds = Math.floor(totalMs / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const milliseconds = totalMs % 1e3;
  const pad = (part, width) => String(part).padStart(width, "0");
  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)}.${pad(milliseconds, 3)}`;
}
|
|
5186
|
+
/**
 * Join values into a comma-separated list after normalizing and de-duplicating them.
 *
 * Values that normalize to an empty string are dropped; first-seen order is kept.
 *
 * @param {string[]} values - Raw values.
 * @returns {string | undefined} `", "`-joined list, or `undefined` when nothing remains.
 */
function normalizeDelimitedList(values) {
  const seen = new Set();
  for (const value of values) {
    const cleaned = normalizeWhitespace(value);
    if (cleaned) {
      seen.add(cleaned);
    }
  }
  if (seen.size === 0) {
    return void 0;
  }
  return [...seen].join(", ");
}
|
|
5190
|
+
/**
 * Convert a `Date` or a date string to an ISO 8601 timestamp.
 *
 * @param {unknown} value - A `Date`, a non-blank string accepted by `new Date`, or anything else.
 * @returns {string | undefined} `toISOString()` output, or `undefined` for invalid or unsupported input.
 */
function normalizeIsoDate(value) {
  let candidate;
  if (value instanceof Date) {
    candidate = value;
  } else if (typeof value === "string" && value.trim()) {
    candidate = new Date(value);
  } else {
    return void 0;
  }
  // An invalid Date has a NaN time value and cannot be serialised.
  return Number.isFinite(candidate.getTime()) ? candidate.toISOString() : void 0;
}
|
|
5202
|
+
/**
 * List a display label for every mailbox in a parsed address header.
 *
 * Each mailbox contributes its normalized `name`, or its `address` when the
 * name is missing or blank. The fallback uses `||` rather than `??`: a mailbox
 * without a display name carries `name: ""`, which `??` kept, so the mailbox
 * was dropped from the result. An array of address objects (one per repeated
 * header) is flattened.
 *
 * @param {unknown} value - Address object with a `value` array, an array of such objects, or anything else.
 * @returns {string[]} Non-empty labels in header order; `[]` for unsupported input.
 */
function addressNames(value) {
  if (Array.isArray(value)) {
    return value.flatMap((item) => addressNames(item));
  }
  if (!value || typeof value !== "object" || !("value" in value) || !Array.isArray(value.value)) {
    return [];
  }
  return value.value
    .map((entry) => normalizeWhitespace(entry?.name ?? "") || normalizeWhitespace(entry?.address ?? ""))
    .filter(Boolean);
}
|
|
5208
|
+
/**
 * Render a parsed address header as a comma-separated list of labels.
 *
 * @param {unknown} value - Parsed address header, passed to `addressNames`.
 * @returns {string | undefined} Joined labels, or `undefined` when there are none.
 */
function addressList(value) {
  const labels = addressNames(value);
  return normalizeDelimitedList(labels);
}
|
|
5211
|
+
/**
 * Pick an identifier shared by all messages of the same email thread.
 *
 * Preference order: the first `References` entry, then the first
 * `In-Reply-To` entry, then the message's own `Message-ID`. The previous
 * order put `Message-ID` first, so every message that had one became its own
 * conversation and the reply headers were never consulted.
 *
 * NOTE(review): this changes the id for replies. Confirm that downstream
 * consumers group by `conversation_id` and do not expect it to equal
 * `message_id`.
 *
 * @param {{ messageId?: string, inReplyTo?: string | string[], references?: string | string[] }} parsed - Parsed email.
 * @returns {string | undefined} Normalized identifier, or `undefined` when no header yields one.
 */
function emailConversationId(parsed) {
  const firstOf = (value) => {
    const first = Array.isArray(value) ? value[0] : typeof value === "string" ? value : "";
    return normalizeWhitespace(first ?? "");
  };
  return firstOf(parsed.references) || firstOf(parsed.inReplyTo) || normalizeWhitespace(parsed.messageId ?? "") || void 0;
}
|
|
5215
|
+
/**
 * Produce the message body as normalized text.
 *
 * The plain-text part wins when it is non-empty after normalization.
 * Otherwise a non-blank HTML part is converted to markdown. With neither, the
 * result is `""`.
 *
 * @param {{ text?: string, html?: unknown }} parsed - Parsed email.
 * @returns {string} Body text, possibly empty.
 */
function emailBodyMarkdown(parsed) {
  const plain = normalizeDocumentText(parsed.text ?? "");
  if (plain) {
    return plain;
  }
  const { html } = parsed;
  const hasHtml = typeof html === "string" && html.trim();
  return hasHtml ? normalizeDocumentText(htmlToMarkdown(html)) : "";
}
|
|
5225
|
+
/**
 * Convert a parsed email into a title, a conversation id, a flat metadata
 * record and a markdown rendering.
 *
 * Metadata keys and markdown header lines appear only for values that are
 * present. The markdown always has a `## Message` section, with a placeholder
 * sentence when no body could be extracted.
 *
 * @param {object} parsed - Parsed email; reads subject, from, to, cc, date, messageId, attachments and the body fields.
 * @param {string} fallbackTitle - Title used when the subject is empty.
 * @returns {{ title: string, conversationId?: string, metadata: Record<string, string>, markdown: string }} Normalized email.
 */
function normalizeParsedEmail(parsed, fallbackTitle) {
  const title = normalizeWhitespace(parsed.subject ?? "") || fallbackTitle;
  const sender = addressList(parsed.from);
  const recipients = addressList(parsed.to);
  const cc = addressList(parsed.cc);
  const occurredAt = normalizeIsoDate(parsed.date);
  const participants = normalizeDelimitedList([
    ...addressNames(parsed.from),
    ...addressNames(parsed.to),
    ...addressNames(parsed.cc)
  ]);
  const conversationId = emailConversationId(parsed);
  const body = emailBodyMarkdown(parsed);
  const attachmentCount = Array.isArray(parsed.attachments) ? parsed.attachments.length : 0;
  const messageId = normalizeWhitespace(parsed.messageId ?? "");

  // Keys are added in a fixed order and only when the value is truthy.
  const metadata = {};
  const record = (key, value) => {
    if (value) {
      metadata[key] = value;
    }
  };
  record("occurred_at", occurredAt);
  record("sender", sender);
  record("recipients", recipients);
  record("cc", cc);
  record("participants", participants);
  record("conversation_id", conversationId);
  record("message_id", messageId);
  record("attachment_count", attachmentCount ? String(attachmentCount) : "");

  const headerLines = [];
  if (occurredAt) {
    headerLines.push(`Date: ${occurredAt}`);
  }
  if (sender) {
    headerLines.push(`From: ${sender}`);
  }
  if (recipients) {
    headerLines.push(`To: ${recipients}`);
  }
  if (cc) {
    headerLines.push(`CC: ${cc}`);
  }
  if (conversationId) {
    headerLines.push(`Conversation ID: ${conversationId}`);
  }
  if (attachmentCount) {
    headerLines.push(`Attachments: ${attachmentCount}`);
  }

  const markdownLines = [
    `# ${title}`,
    "",
    ...headerLines,
    "",
    "## Message",
    "",
    body || "No readable body content was extracted from this email.",
    ""
  ];
  return {
    title,
    conversationId,
    metadata,
    markdown: markdownLines.join("\n")
  };
}
|
|
5265
|
+
/**
 * List a label for each calendar attendee.
 *
 * Accepts one attendee or an array. Each object attendee contributes its
 * `params.CN`, or failing that its `val` / `value`; non-object entries and
 * empty labels are dropped.
 *
 * @param {unknown} value - Attendee property value(s).
 * @returns {string[]} Non-empty attendee labels in input order.
 */
function calendarAttendees(value) {
  if (!value) {
    return [];
  }
  const attendees = Array.isArray(value) ? value : [value];
  const labels = [];
  for (const attendee of attendees) {
    if (!attendee || typeof attendee !== "object") {
      continue;
    }
    const commonName = normalizeWhitespace(String(attendee.params?.CN ?? ""));
    const address = normalizeWhitespace(String(attendee.val ?? attendee.value ?? ""));
    const label = commonName || address;
    if (label) {
      labels.push(label);
    }
  }
  return labels;
}
|
|
5280
|
+
/**
 * Look up a Slack user id, falling back to the id itself.
 *
 * @param {string} input - Slack user id.
 * @param {Map<string, string>} usersById - User id to display label.
 * @returns {string} Mapped label, or `input` when the map has no non-nullish entry for it.
 */
function slackFormatSpeakerId(input, usersById) {
  const label = usersById.get(input);
  return label ?? input;
}
|
|
5283
|
+
/**
 * Rewrite Slack message markup as readable text.
 *
 * - `<@U123>` and `<@U123|label>` become `@name`. The name comes from
 *   `usersById` when the id is known, otherwise from the inline label,
 *   otherwise the raw id. The labelled form was previously left untouched.
 * - `<#C123|channel>` becomes `#channel`.
 * - `<https://url|text>` becomes `text (https://url)`.
 * - `<https://url>` becomes `https://url`.
 *
 * @param {string} text - Raw Slack message text.
 * @param {Map<string, string>} usersById - User id to display label.
 * @returns {string} Whitespace-normalized text.
 */
function slackNormalizeText(text, usersById) {
  return normalizeWhitespace(
    text
      .replace(/<@([A-Z0-9]+)(?:\|([^>]*))?>/g, (_, userId, label) => {
        const resolved = slackFormatSpeakerId(userId, usersById);
        // Prefer the directory entry; the inline label is only a fallback.
        return `@${resolved !== userId ? resolved : label || userId}`;
      })
      .replace(/<#[A-Z0-9]+\|([^>]+)>/g, "#$1")
      .replace(/<(https?:\/\/[^>|]+)\|([^>]+)>/g, "$2 ($1)")
      .replace(/<(https?:\/\/[^>]+)>/g, "$1")
  );
}
|
|
5288
|
+
/**
 * Convert a Slack `ts` value to an ISO 8601 timestamp.
 *
 * `ts2` is read as seconds since the Unix epoch. When it is not a positive
 * finite number, midnight UTC of `fallbackDate` is used instead.
 *
 * @param {string | number | undefined} ts2 - Slack message timestamp.
 * @param {string} fallbackDate - Date in `YYYY-MM-DD` form, interpolated into `<date>T00:00:00.000Z`.
 * @returns {string} ISO timestamp.
 */
function slackMessageTimestamp(ts2, fallbackDate) {
  const epochSeconds = Number(ts2);
  const usable = Number.isFinite(epochSeconds) && epochSeconds > 0;
  const moment = usable ? new Date(epochSeconds * 1e3) : new Date(`${fallbackDate}T00:00:00.000Z`);
  return moment.toISOString();
}
|
|
5295
|
+
/**
 * Split mailbox bytes into one buffer per message using `node-mbox`.
 *
 * Despite the name, the input goes to `MboxStream`, not to a zip reader. The
 * promise resolves with the collected messages on the stream's `finish` or
 * `end` event, whichever fires first, and rejects on `error`.
 *
 * @param {Buffer} bytes - Raw mailbox contents.
 * @returns {Promise<Buffer[]>} Message buffers in stream order.
 */
async function loadZipMessageBuffers(bytes) {
  const { MboxStream } = await import("node-mbox");
  const mboxStream = MboxStream(Readable.from([bytes]));
  const collected = [];
  return await new Promise((resolve, reject) => {
    const settle = () => resolve(collected);
    mboxStream.on("data", (chunk) => {
      collected.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });
    mboxStream.on("error", reject);
    mboxStream.on("finish", settle);
    mboxStream.on("end", settle);
  });
}
|
|
5308
|
+
/**
 * Decode every entry of an unzipped archive to text.
 *
 * @param {Record<string, Uint8Array>} archive - Map of entry path to bytes.
 * @returns {Map<string, string>} Entry path to text decoded with `strFromU8`; falsy entries are skipped.
 */
function archiveEntriesAsText(archive) {
  const texts = new Map();
  for (const [entryPath, bytes] of Object.entries(archive)) {
    if (bytes) {
      texts.set(entryPath, strFromU8(bytes));
    }
  }
  return texts;
}
|
|
5313
|
+
/**
 * Decide whether a set of archive entry paths looks like a Slack export.
 *
 * Requires both a top-level index file (`channels.json`, `groups.json`,
 * `dms.json` or `mpims.json`) and at least one `<folder>/YYYY-MM-DD.json`
 * file directly inside a top-level folder.
 *
 * @param {Iterable<string>} entries - Archive entry paths.
 * @returns {boolean} `true` when both kinds of entry are present.
 */
function looksLikeSlackEntries(entries) {
  const indexFiles = new Set(["channels.json", "groups.json", "dms.json", "mpims.json"]);
  const dayFilePattern = /^[^/]+\/\d{4}-\d{2}-\d{2}\.json$/i;
  let sawIndex = false;
  let sawDayFile = false;
  for (const entry of entries) {
    if (!sawIndex && indexFiles.has(entry)) {
      sawIndex = true;
    }
    if (!sawDayFile && dayFilePattern.test(entry)) {
      sawDayFile = true;
    }
  }
  return sawIndex && sawDayFile;
}
|
|
5321
|
+
/**
 * Build a lookup of Slack conversations from a parsed index file.
 *
 * Non-object items and items without a name are skipped. Members come from
 * the item's `members` array, or from its single `user` field when there is
 * no array, and are mapped through `slackFormatSpeakerId`.
 *
 * @param {unknown} raw - Parsed JSON of an index file; anything but an array yields an empty map.
 * @param {Map<string, string>} usersById - User id to display label.
 * @returns {Map<string, { id: string, title: string, members: string[] }>} Conversations keyed by normalized name.
 */
function slackEntriesFromChannelIndex(raw, usersById) {
  const conversations = new Map();
  if (!Array.isArray(raw)) {
    return conversations;
  }
  raw
    .filter((item) => item && typeof item === "object")
    .forEach((channel) => {
      const id = normalizeWhitespace(channel.id ?? "");
      const title = normalizeWhitespace(channel.name ?? "");
      if (!title) {
        return;
      }
      let memberIds = [];
      if (Array.isArray(channel.members)) {
        memberIds = channel.members;
      } else if (channel.user) {
        memberIds = [channel.user];
      }
      const members = memberIds.map((member) => slackFormatSpeakerId(member, usersById)).filter(Boolean);
      conversations.set(title, { id, title, members });
    });
  return conversations;
}
|
|
5341
|
+
/**
 * Render an SRT or WebVTT subtitle file as a markdown transcript.
 *
 * Cues with empty text are dropped and line breaks inside a cue collapse to
 * single spaces. The format is reported as WebVTT when the file name ends in
 * `.vtt` and as SRT otherwise. Only `null` entries are dropped when the
 * output is assembled, so the `""` entries survive as the blank lines around
 * the `## Transcript` heading.
 *
 * Any thrown error is turned into a warning-only artifact.
 *
 * @param {{ bytes: Buffer, mimeType: string, fileName?: string }} input - File bytes, MIME type and optional name.
 * @returns {Promise<{ title?: string, extractedText?: string, artifact: object }>} Transcript text and artifact.
 */
async function extractTranscriptText(input) {
  try {
    const { parseSync } = await import("subtitle");
    const rawText = decodeTextBytes(input.bytes);
    const cues = parseSync(rawText).filter((node) => node.type === "cue" && node.data).map((node) => ({
      start: Math.max(0, node.data?.start ?? 0),
      end: Math.max(0, node.data?.end ?? 0),
      text: normalizeWhitespace((node.data?.text ?? "").replace(/\s*\n+\s*/g, " "))
    })).filter((cue) => cue.text);
    const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
    const isVtt = Boolean(input.fileName?.toLowerCase().endsWith(".vtt"));
    const extractedText = [
      title ? `# ${title}` : null,
      `Format: ${isVtt ? "WebVTT" : "SRT"}`,
      `Segments: ${cues.length}`,
      ...cues.length ? [`Start: ${timestampFromMs(cues[0].start)}`, `End: ${timestampFromMs(cues[cues.length - 1].end)}`] : [],
      "",
      "## Transcript",
      "",
      ...cues.length ? cues.map((cue) => `- [${timestampFromMs(cue.start)} - ${timestampFromMs(cue.end)}] ${cue.text}`) : ["- No transcript segments were extracted."],
      ""
    ].filter((item) => item !== null).join("\n");
    return {
      title,
      extractedText,
      artifact: {
        ...extractionMetadata("transcript", input.mimeType, "transcript_text"),
        metadata: {
          format: isVtt ? "vtt" : "srt",
          segment_count: String(cues.length),
          ...cues.length ? { started_at: timestampFromMs(cues[0].start), ended_at: timestampFromMs(cues[cues.length - 1].end) } : {}
        }
      }
    };
  } catch (error) {
    return {
      artifact: {
        ...extractionMetadata("transcript", input.mimeType, "transcript_text"),
        warnings: [`Transcript extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
      }
    };
  }
}
|
|
5383
|
+
/**
 * Parses one email payload with mailparser and renders it through
 * `normalizeParsedEmail`.
 *
 * @param {{ bytes: *, fileName?: string, mimeType: string }} input - Raw message
 *   bytes plus the optional file name (used for the fallback title) and MIME type.
 * @returns {Promise<object>} On success `{ title, extractedText, artifact }`;
 *   when anything in the parsing path throws, only `{ artifact }` carrying a
 *   single "Email extraction failed" warning.
 */
async function extractEmailText(input) {
  try {
    // Loaded lazily so the dependency is only resolved when an email is ingested.
    const { simpleParser } = await import("mailparser");
    let fallbackTitle = "Email";
    if (input.fileName) {
      // File name without its extension.
      fallbackTitle = path7.basename(input.fileName, path7.extname(input.fileName));
    }
    const message = await simpleParser(input.bytes);
    const normalized = normalizeParsedEmail(message, fallbackTitle);
    return {
      title: normalized.title,
      extractedText: normalized.markdown,
      artifact: {
        ...extractionMetadata("email", input.mimeType, "email_text"),
        metadata: normalized.metadata
      }
    };
  } catch (error) {
    const failedArtifact = extractionMetadata("email", input.mimeType, "email_text");
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      artifact: {
        ...failedArtifact,
        warnings: [`Email extraction failed: ${reason}`]
      }
    };
  }
}
|
|
5406
|
+
/**
 * Splits a mailbox payload into individual messages and normalizes each one.
 *
 * @param {{ bytes: *, fileName?: string }} input - Mailbox bytes and optional
 *   file name (its base name, without extension, becomes the mailbox title).
 * @returns {Promise<object>} `{ title, messages, warnings }` on success, where
 *   `warnings` is undefined unless no message was extracted; `{ messages: [],
 *   warnings }` when any step throws.
 */
async function extractMboxMessages(input) {
  try {
    const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : "Mailbox";
    const { simpleParser } = await import("mailparser");
    // NOTE(review): the helper name mentions "zip" although this function handles
    // mailbox input; confirm it is the intended message splitter.
    const messages = await loadZipMessageBuffers(input.bytes);
    const extracted = [];
    let ordinal = 0;
    // Messages are parsed one at a time, in mailbox order.
    for (const rawMessage of messages) {
      ordinal += 1;
      const parsed = await simpleParser(rawMessage);
      const normalized = normalizeParsedEmail(parsed, `Message ${ordinal}`);
      // Fall back to the 1-based position when the message carries no conversation id.
      const conversationId = normalized.conversationId || `${ordinal}`;
      const metadata = {
        ...normalized.metadata,
        container_title: title,
        mailbox_title: title,
        part_index: String(ordinal),
        part_count: String(messages.length)
      };
      extracted.push({
        // The position suffix keeps keys distinct within one conversation.
        partKey: `${conversationId}-${ordinal}`,
        title: normalized.title,
        markdown: normalized.markdown,
        metadata
      });
    }
    const warnings = extracted.length ? void 0 : ["Mailbox extraction completed but found no readable messages."];
    return { title, messages: extracted, warnings };
  } catch (error) {
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      messages: [],
      warnings: [`Mailbox extraction failed: ${reason}`]
    };
  }
}
|
|
5441
|
+
/**
 * Parses an iCalendar payload and renders every VEVENT entry as a markdown part.
 *
 * @param {{ bytes: *, fileName?: string }} input - Calendar bytes and optional
 *   file name (its base name, without extension, becomes the calendar title).
 * @returns {Promise<object>} `{ title, events, warnings }` on success, where
 *   `warnings` is undefined unless no VEVENT was found; `{ events: [], warnings }`
 *   when any step throws.
 */
async function extractCalendarEvents(input) {
  try {
    const ical = await import("node-ical");
    const calendarTitle = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : "Calendar";
    const parsed = ical.default.sync.parseICS(decodeTextBytes(input.bytes));
    // Only VEVENT components are rendered; every other parsed entry is ignored.
    const calendarItems = Object.values(parsed).filter(
      (item) => item && typeof item === "object" && item.type === "VEVENT"
    );
    const events = [];
    for (const event of calendarItems) {
      const title = normalizeWhitespace(event.summary ?? "") || "Calendar Event";
      const occurredAt = normalizeIsoDate(event.start);
      const endsAt = normalizeIsoDate(event.end);
      let organizer;
      if (event.organizer) {
        // Prefer the CN parameter, then the raw organizer value.
        organizer = normalizeWhitespace(String(event.organizer.params?.CN ?? event.organizer.val ?? ""));
      }
      const attendees = calendarAttendees(event.attendees);
      const participants = normalizeDelimitedList([organizer ?? "", ...attendees]);
      const location = normalizeWhitespace(event.location ?? "") || void 0;
      const description = normalizeDocumentText(event.description ?? "");
      // Without a UID, derive an id from the title plus the start time or the
      // 1-based position of the event in the output.
      const conversationId = normalizeWhitespace(event.uid ?? "") || `${title}-${occurredAt ?? events.length + 1}`;

      // Optional keys are only present when they have a value.
      const metadata = { container_title: calendarTitle };
      if (occurredAt) {
        metadata.occurred_at = occurredAt;
      }
      if (endsAt) {
        metadata.ends_at = endsAt;
      }
      if (organizer) {
        metadata.organizer = organizer;
      }
      if (location) {
        metadata.location = location;
      }
      if (participants) {
        metadata.participants = participants;
      }
      metadata.conversation_id = conversationId;

      const summaryLines = [];
      if (occurredAt) {
        summaryLines.push(`Start: ${occurredAt}`);
      }
      if (endsAt) {
        summaryLines.push(`End: ${endsAt}`);
      }
      if (organizer) {
        summaryLines.push(`Organizer: ${organizer}`);
      }
      if (attendees.length) {
        summaryLines.push(`Attendees: ${attendees.join(", ")}`);
      }
      if (location) {
        summaryLines.push(`Location: ${location}`);
      }
      if (conversationId) {
        summaryLines.push(`Event ID: ${conversationId}`);
      }
      const markdown = [
        `# ${title}`,
        "",
        ...summaryLines,
        "",
        "## Description",
        "",
        description || "No event description was provided.",
        ""
      ].join("\n");

      events.push({ partKey: conversationId, title, metadata, markdown });
    }
    const warnings = events.length ? void 0 : ["Calendar extraction completed but found no VEVENT entries."];
    return { title: calendarTitle, events, warnings };
  } catch (error) {
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      events: [],
      warnings: [`Calendar extraction failed: ${reason}`]
    };
  }
}
|
|
5502
|
+
/**
 * Turns the text entries of a Slack export into one markdown conversation per
 * `<channel>/<YYYY-MM-DD>.json` file.
 *
 * @param {Map<string, string>} entries - Export-relative path -> file text.
 * @param {string} exportTitle - Title used for the workspace and container labels.
 * @returns {{ title: string, conversations: object[], warnings: (string[]|undefined) }}
 *   `warnings` is undefined unless no conversation was produced.
 *   JSON parse errors are not caught here and propagate to the caller.
 */
function parseSlackExportEntries(entries, exportTitle) {
  // users.json: user id -> best available display name.
  const usersById = /* @__PURE__ */ new Map();
  const usersJson = entries.get("users.json");
  if (usersJson) {
    for (const user of JSON.parse(usersJson)) {
      const id = normalizeWhitespace(user.id ?? "");
      const name = normalizeWhitespace(user.profile?.display_name ?? user.real_name ?? user.profile?.real_name ?? user.name ?? "");
      if (id && name) {
        usersById.set(id, name);
      }
    }
  }

  // Merge every conversation index file that is present; later files overwrite
  // earlier keys.
  const channelIndex = /* @__PURE__ */ new Map();
  for (const indexPath of ["channels.json", "groups.json", "dms.json", "mpims.json"]) {
    const rawIndex = entries.get(indexPath);
    if (rawIndex) {
      for (const [key, value] of slackEntriesFromChannelIndex(JSON.parse(rawIndex), usersById)) {
        channelIndex.set(key, value);
      }
    }
  }

  // Day files live exactly one directory deep and are named by date.
  const dayFilePattern = /^[^/]+\/\d{4}-\d{2}-\d{2}\.json$/i;
  const conversationPaths = [...entries.keys()]
    .filter((entryPath) => dayFilePattern.test(entryPath))
    .sort((left, right) => left.localeCompare(right));

  const conversations = [];
  for (const entryPath of conversationPaths) {
    const raw = entries.get(entryPath);
    const messages = raw ? JSON.parse(raw) : null;
    // Skip empty files and files whose top-level JSON value is not an array.
    if (!Array.isArray(messages)) {
      continue;
    }
    const [channelName, dateFile] = entryPath.split("/");
    const date = dateFile?.replace(/\.json$/i, "") ?? "";
    // Channels missing from the index fall back to their directory name.
    const channel = channelIndex.get(channelName ?? "") ?? {
      id: channelName ?? "",
      title: channelName ?? "channel",
      members: []
    };
    const participants = new Set(channel.members);
    const threadIds = /* @__PURE__ */ new Set();
    const lines = [];
    // Copy before sorting so the parsed array keeps its original order.
    const sortedMessages = [...messages].sort((left, right) => Number(left.ts ?? 0) - Number(right.ts ?? 0));
    let occurredAt;
    for (const message of sortedMessages) {
      const rawSpeaker = message.username ?? message.bot_profile?.name ?? (message.user ? slackFormatSpeakerId(message.user, usersById) : "");
      const speaker = normalizeWhitespace(rawSpeaker) || "unknown";
      participants.add(speaker);
      const messageTime = slackMessageTimestamp(message.ts, date);
      // The first message in sorted order defines when the conversation occurred.
      occurredAt ??= messageTime;
      const attachmentLines = Array.isArray(message.files)
        ? message.files
            .map((file) => normalizeWhitespace(file.title ?? file.name ?? ""))
            .filter(Boolean)
            .map((label) => `Attachment: ${label}`)
        : [];
      const normalizedText = slackNormalizeText([message.text ?? "", ...attachmentLines].join("\n"), usersById);
      // Only messages pointing at a different thread root count towards threads.
      if (message.thread_ts && message.thread_ts !== message.ts) {
        threadIds.add(message.thread_ts);
      }
      const threadTag = message.thread_ts ? ` {thread:${message.thread_ts}}` : "";
      const idTag = message.ts ? ` {id:${message.ts}}` : "";
      const body = normalizedText || normalizeWhitespace(message.subtype ?? "") || "[no text]";
      lines.push(`- [${messageTime}] ${speaker}${threadTag}${idTag}: ${body}`);
    }

    const participantsList = normalizeDelimitedList([...participants]);
    const conversationId = `${channel.id || channel.title}:${date}`;

    // Optional keys are only present when they have a value.
    const metadata = {
      workspace_title: exportTitle,
      channel: channel.title
    };
    if (channel.id) {
      metadata.channel_id = channel.id;
    }
    if (occurredAt) {
      metadata.occurred_at = occurredAt;
    }
    if (participantsList) {
      metadata.participants = participantsList;
    }
    metadata.container_title = `${exportTitle} / #${channel.title}`;
    metadata.conversation_id = conversationId;
    metadata.date = date;
    metadata.message_count = String(sortedMessages.length);
    metadata.thread_count = String(threadIds.size);

    const markdown = [
      `# #${channel.title} - ${date}`,
      "",
      `Workspace: ${exportTitle}`,
      `Messages: ${sortedMessages.length}`,
      `Threads: ${threadIds.size}`,
      ...(participantsList ? [`Participants: ${participantsList}`] : []),
      "",
      "## Messages",
      "",
      ...(lines.length ? lines : ["- No messages were extracted."]),
      ""
    ].join("\n");

    conversations.push({
      partKey: `${channel.title}-${date}`,
      title: `#${channel.title} - ${date}`,
      metadata,
      markdown
    });
  }

  return {
    title: exportTitle,
    conversations,
    warnings: conversations.length ? void 0 : ["Slack export parsing completed but found no channel day files."]
  };
}
|
|
5608
|
+
/**
 * Reports whether a byte payload is a zip archive whose entry names look like
 * a Slack export.
 *
 * @param {*} bytes - Archive bytes; wrapped in a `Uint8Array` before unzipping.
 * @returns {boolean} The verdict of `looksLikeSlackEntries`, or `false` when
 *   unzipping or the check throws.
 */
function isSlackExportArchive(bytes) {
  let verdict;
  try {
    const entryNames = Object.keys(unzipSync(new Uint8Array(bytes)));
    verdict = looksLikeSlackEntries(entryNames);
  } catch {
    // Best-effort probe: anything unreadable is simply "not a Slack export".
    verdict = false;
  }
  return verdict;
}
|
|
5616
|
+
/**
 * Reports whether a directory looks like an unpacked Slack export: it must
 * contain at least one conversation index file and at least one sub-directory
 * holding a `YYYY-MM-DD.json` day file.
 *
 * @param {string} directoryPath - Directory to probe.
 * @returns {Promise<boolean>} `false` for unreadable or empty directories;
 *   filesystem errors are treated as "nothing there".
 */
async function isSlackExportDirectory(directoryPath) {
  const topLevelNames = await fs7.readdir(directoryPath).catch(() => []);
  if (!topLevelNames.length) {
    return false;
  }
  const knownNames = new Set(topLevelNames);
  const indexFiles = ["channels.json", "groups.json", "dms.json", "mpims.json"];
  if (!indexFiles.some((name) => knownNames.has(name))) {
    return false;
  }
  const dayFilePattern = /^\d{4}-\d{2}-\d{2}\.json$/i;
  for (const name of topLevelNames) {
    const candidate = path7.join(directoryPath, name);
    const info = await fs7.stat(candidate).catch(() => null);
    if (info?.isDirectory()) {
      const channelFiles = await fs7.readdir(candidate).catch(() => []);
      // One dated day file in any channel directory is enough.
      if (channelFiles.some((fileName) => dayFilePattern.test(fileName))) {
        return true;
      }
    }
  }
  return false;
}
|
|
5639
|
+
/**
 * Unzips a Slack export archive and parses its text entries into conversations.
 *
 * @param {{ bytes: *, fileName?: string }} input - Archive bytes and optional
 *   file name (its base name, without extension, becomes the export title).
 * @returns {Promise<object>} The result of `parseSlackExportEntries`, or
 *   `{ conversations: [], warnings }` when unzipping or parsing throws.
 */
async function extractSlackExportArchive(input) {
  try {
    const archive = unzipSync(new Uint8Array(input.bytes));
    let title = "Slack Export";
    if (input.fileName) {
      title = path7.basename(input.fileName, path7.extname(input.fileName));
    }
    const textEntries = archiveEntriesAsText(archive);
    return parseSlackExportEntries(textEntries, title);
  } catch (error) {
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      conversations: [],
      warnings: [`Slack export extraction failed: ${reason}`]
    };
  }
}
|
|
4775
|
-
async function
|
|
5651
|
+
/**
 * Walks an unpacked Slack export directory, reads every file as UTF-8 text and
 * parses the result into conversations.
 *
 * @param {string} directoryPath - Root directory of the unpacked export.
 * @returns {Promise<object>} The result of `parseSlackExportEntries`, or
 *   `{ conversations: [], warnings }` when reading or parsing throws.
 */
async function extractSlackExportDirectory(directoryPath) {
  const title = path7.basename(directoryPath) || "Slack Export";
  try {
    const entries = /* @__PURE__ */ new Map();
    // Breadth-first walk over the export tree.
    const queue = [directoryPath];
    while (queue.length > 0) {
      const current = queue.shift();
      const children = await fs7.readdir(current, { withFileTypes: true });
      for (const child of children) {
        const absoluteChild = path7.join(current, child.name);
        if (child.isDirectory()) {
          queue.push(absoluteChild);
          continue;
        }
        // Compute the relative path with the platform's own rules first and only
        // then switch to forward slashes. Feeding a native `directoryPath` into
        // `path.posix.relative` mixes separator styles on Windows and yields keys
        // that never match the "<channel>/<date>.json" layout the parser expects.
        const relativeChild = path7.relative(directoryPath, absoluteChild).split(path7.sep).join(path7.posix.sep);
        entries.set(relativeChild, await fs7.readFile(absoluteChild, "utf8"));
      }
    }
    return parseSlackExportEntries(entries, title);
  } catch (error) {
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      conversations: [],
      warnings: [`Slack export extraction failed: ${reason}`]
    };
  }
}
|
|
@@ -5230,21 +6103,42 @@ function inferKind(mimeType, filePath) {
|
|
|
5230
6103
|
if (isRstFilePath(filePath)) {
|
|
5231
6104
|
return "text";
|
|
5232
6105
|
}
|
|
6106
|
+
if (isTranscriptFilePath(filePath) || mimeType === "application/x-subrip" || mimeType === "text/vtt") {
|
|
6107
|
+
return "transcript";
|
|
6108
|
+
}
|
|
5233
6109
|
if (mimeType.includes("markdown")) {
|
|
5234
6110
|
return "markdown";
|
|
5235
6111
|
}
|
|
5236
6112
|
if (mimeType.includes("html")) {
|
|
5237
6113
|
return "html";
|
|
5238
6114
|
}
|
|
5239
|
-
if (mimeType.startsWith("text/")) {
|
|
5240
|
-
return "text";
|
|
5241
|
-
}
|
|
5242
6115
|
if (mimeType === "application/pdf" || filePath.toLowerCase().endsWith(".pdf")) {
|
|
5243
6116
|
return "pdf";
|
|
5244
6117
|
}
|
|
5245
6118
|
if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || filePath.toLowerCase().endsWith(".docx")) {
|
|
5246
6119
|
return "docx";
|
|
5247
6120
|
}
|
|
6121
|
+
if (isEmailFilePath(filePath) || mimeType === "message/rfc822" || mimeType === "application/mbox") {
|
|
6122
|
+
return "email";
|
|
6123
|
+
}
|
|
6124
|
+
if (isCalendarFilePath(filePath) || mimeType === "text/calendar") {
|
|
6125
|
+
return "calendar";
|
|
6126
|
+
}
|
|
6127
|
+
if (mimeType === "application/epub+zip" || filePath.toLowerCase().endsWith(".epub")) {
|
|
6128
|
+
return "epub";
|
|
6129
|
+
}
|
|
6130
|
+
if (mimeType === "text/csv" || mimeType === "text/tab-separated-values" || filePath.toLowerCase().endsWith(".csv") || filePath.toLowerCase().endsWith(".tsv")) {
|
|
6131
|
+
return "csv";
|
|
6132
|
+
}
|
|
6133
|
+
if (mimeType.startsWith("text/")) {
|
|
6134
|
+
return "text";
|
|
6135
|
+
}
|
|
6136
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || filePath.toLowerCase().endsWith(".xlsx")) {
|
|
6137
|
+
return "xlsx";
|
|
6138
|
+
}
|
|
6139
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || filePath.toLowerCase().endsWith(".pptx")) {
|
|
6140
|
+
return "pptx";
|
|
6141
|
+
}
|
|
5248
6142
|
if (mimeType.startsWith("image/")) {
|
|
5249
6143
|
return "image";
|
|
5250
6144
|
}
|
|
@@ -5254,6 +6148,17 @@ function isRstFilePath(filePath) {
|
|
|
5254
6148
|
const extension = path12.extname(filePath).toLowerCase();
|
|
5255
6149
|
return extension === ".rst" || extension === ".rest";
|
|
5256
6150
|
}
|
|
6151
|
+
/**
 * Reports whether a path has a subtitle/transcript extension (`.srt` or
 * `.vtt`), compared case-insensitively.
 *
 * @param {string} filePath - Path or file name to inspect.
 * @returns {boolean}
 */
function isTranscriptFilePath(filePath) {
  return [".srt", ".vtt"].includes(path12.extname(filePath).toLowerCase());
}
|
|
6155
|
+
/**
 * Reports whether a path has an email extension (`.eml` or `.mbox`), compared
 * case-insensitively.
 *
 * @param {string} filePath - Path or file name to inspect.
 * @returns {boolean}
 */
function isEmailFilePath(filePath) {
  return [".eml", ".mbox"].includes(path12.extname(filePath).toLowerCase());
}
|
|
6159
|
+
/**
 * Reports whether a path has the iCalendar extension (`.ics`), compared
 * case-insensitively.
 *
 * @param {string} filePath - Path or file name to inspect.
 * @returns {boolean}
 */
function isCalendarFilePath(filePath) {
  const extension = path12.extname(filePath).toLowerCase();
  return extension === ".ics";
}
|
|
5257
6162
|
function titleFromText(fallback, content, filePath) {
|
|
5258
6163
|
if (filePath && isRstFilePath(filePath)) {
|
|
5259
6164
|
const rstTitle = titleFromRst(fallback, content);
|
|
@@ -5270,6 +6175,57 @@ function guessMimeType(target) {
|
|
|
5270
6175
|
}
|
|
5271
6176
|
return mime.lookup(target) || "application/octet-stream";
|
|
5272
6177
|
}
|
|
6178
|
+
/**
 * Builds the group id shared by all parts of one multi-part source: the
 * slugified title plus the first 8 characters of a hash of the origin.
 *
 * @param {{ title: string, originType: string, originalPath?: string, url?: string }} prepared
 *   The origin is the URL for `originType === "url"` and the original path
 *   otherwise; either falls back to the title when missing.
 * @returns {string}
 */
function sourceGroupIdFor(prepared) {
  let originKey;
  if (prepared.originType === "url") {
    originKey = prepared.url ?? prepared.title;
  } else {
    originKey = prepared.originalPath ?? prepared.title;
  }
  const slug = slugify(prepared.title);
  const originDigest = sha256(originKey).slice(0, 8);
  return `${slug}-${originDigest}`;
}
|
|
6182
|
+
/**
 * Expands one multi-part source (mailbox, calendar, chat export, ...) into one
 * finalized prepared input per part. Every part is stored as markdown and
 * shares the same source group id.
 *
 * @param {object} input - Container description: `title`, `originType`,
 *   `sourceKind`, `sourceClass`, `originalPath`, `repoRelativePath`, `url`,
 *   `mimeType`, `storedExtension`, `warnings`, `logDetails` and `parts`
 *   (each part exposing `partKey`, `title`, `markdown`, `metadata`).
 * @returns {object[]} The `finalizePreparedInput` result for each part, in order.
 */
function groupedPreparedInputsFor(input) {
  const { title, originType, originalPath, url } = input;
  const groupId = sourceGroupIdFor({ title, originType, originalPath, url });
  return input.parts.map((part, index) => {
    const ordinal = index + 1;
    const total = input.parts.length;
    // Built on demand so the artifact metadata and the details stay separate objects.
    const describePart = () => ({
      ...part.metadata,
      part_index: String(ordinal),
      part_count: String(total)
    });
    return finalizePreparedInput({
      title: `${input.title} - ${part.title}`,
      originType: input.originType,
      sourceKind: input.sourceKind,
      sourceClass: input.sourceClass,
      originalPath: input.originalPath,
      repoRelativePath: input.repoRelativePath,
      url: input.url,
      // The stored payload is the rendered markdown; the artifact below keeps
      // the container's own MIME type.
      mimeType: "text/markdown",
      storedExtension: input.storedExtension,
      payloadBytes: Buffer.from(part.markdown, "utf8"),
      extractedText: part.markdown,
      extractionArtifact: {
        extractor: `${input.sourceKind}_text`,
        sourceKind: input.sourceKind,
        mimeType: input.mimeType,
        producedAt: (/* @__PURE__ */ new Date()).toISOString(),
        metadata: describePart(),
        warnings: input.warnings
      },
      sourceGroupId: groupId,
      sourceGroupTitle: input.title,
      sourcePartKey: part.partKey,
      partIndex: ordinal,
      partCount: total,
      partTitle: part.title,
      details: describePart(),
      logDetails: input.logDetails
    });
  });
}
|
|
5273
6229
|
function rstAdornmentLine(line) {
|
|
5274
6230
|
const trimmed = line.trim();
|
|
5275
6231
|
if (trimmed.length < 3) {
|
|
@@ -5500,6 +6456,13 @@ async function findNearestGitRoot2(startPath) {
|
|
|
5500
6456
|
current = parent;
|
|
5501
6457
|
}
|
|
5502
6458
|
}
|
|
6459
|
+
/**
 * Picks the repository root to associate with an input path.
 *
 * @param {string} rootDir - Workspace root.
 * @param {string} inputPath - Path being ingested.
 * @param {string} fallbackRoot - Root to use when no suitable git root exists.
 * @returns {Promise<string>} The nearest git root of `inputPath`, except that
 *   `fallbackRoot` is returned when no git root is found, or when the input is
 *   inside the workspace while the detected git root is outside it.
 */
async function detectScopedRepoRoot(rootDir, inputPath, fallbackRoot) {
  const gitRoot = await findNearestGitRoot2(inputPath);
  if (!gitRoot) {
    return fallbackRoot;
  }
  // Do not let an enclosing repository outside the workspace claim a path that
  // lives inside the workspace.
  if (withinRoot(rootDir, inputPath) && !withinRoot(rootDir, gitRoot)) {
    return fallbackRoot;
  }
  return gitRoot;
}
|
|
5503
6466
|
function withinRoot(rootPath, targetPath) {
|
|
5504
6467
|
const relative = path12.relative(rootPath, targetPath);
|
|
5505
6468
|
return relative === "" || !relative.startsWith("..") && !path12.isAbsolute(relative);
|
|
@@ -5844,6 +6807,9 @@ function manifestMatchesOrigin(manifest, prepared) {
|
|
|
5844
6807
|
}
|
|
5845
6808
|
return Boolean(prepared.originalPath && manifest.originalPath && toPosix(manifest.originalPath) === toPosix(prepared.originalPath));
|
|
5846
6809
|
}
|
|
6810
|
+
/**
 * Reports whether a manifest refers to the same origin AND the same part of
 * that origin as a prepared input. A missing part key on either side is
 * treated as the empty string.
 *
 * @param {object} manifest - Stored manifest.
 * @param {object} prepared - Prepared input being persisted.
 * @returns {boolean}
 */
function manifestMatchesOriginPart(manifest, prepared) {
  const sameOrigin = manifestMatchesOrigin(manifest, prepared);
  if (!sameOrigin) {
    return sameOrigin;
  }
  const storedPartKey = manifest.sourcePartKey ?? "";
  const incomingPartKey = prepared.sourcePartKey ?? "";
  return storedPartKey === incomingPartKey;
}
|
|
5847
6813
|
function buildCompositeHash(payloadBytes, attachments = []) {
|
|
5848
6814
|
if (!attachments.length) {
|
|
5849
6815
|
return sha256(payloadBytes);
|
|
@@ -5941,7 +6907,7 @@ function extractMarkdownImageReferences(content, baseUrl) {
|
|
|
5941
6907
|
async function convertHtmlToMarkdown(html, url) {
|
|
5942
6908
|
const dom = new JSDOM2(html, { url });
|
|
5943
6909
|
const article = new Readability(dom.window.document).parse();
|
|
5944
|
-
const turndown = new
|
|
6910
|
+
const turndown = new TurndownService2({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
5945
6911
|
const body = article?.content ?? dom.window.document.body.innerHTML;
|
|
5946
6912
|
const markdown = turndown.turndown(body);
|
|
5947
6913
|
return {
|
|
@@ -5965,21 +6931,26 @@ async function readManifestByHash(manifestsDir, contentHash) {
|
|
|
5965
6931
|
}
|
|
5966
6932
|
return null;
|
|
5967
6933
|
}
|
|
5968
|
-
async function
|
|
6934
|
+
/**
 * Collects every stored manifest whose origin matches a prepared input.
 *
 * @param {string} manifestsDir - Directory holding `*.json` manifest files.
 * @param {object} prepared - Prepared input whose origin is looked up.
 * @returns {Promise<object[]>} Matching manifests, each with `semanticHash`
 *   defaulted to its `contentHash`. An unreadable directory yields `[]`.
 */
async function readManifestsByOrigin(manifestsDir, prepared) {
  const dirents = await fs11.readdir(manifestsDir, { withFileTypes: true }).catch(() => []);
  const matches = [];
  for (const dirent of dirents) {
    const isManifestFile = dirent.isFile() && dirent.name.endsWith(".json");
    if (!isManifestFile) {
      continue;
    }
    const manifest = await readJsonFile(path12.join(manifestsDir, dirent.name));
    if (!manifest || !manifestMatchesOrigin(manifest, prepared)) {
      continue;
    }
    // Manifests without a semantic hash fall back to their content hash.
    matches.push({
      ...manifest,
      semanticHash: manifest.semanticHash ?? manifest.contentHash
    });
  }
  return matches;
}
|
|
6951
|
+
/**
 * Finds the stored manifest for the exact origin and part of a prepared input.
 *
 * @param {string} manifestsDir - Directory holding manifest files.
 * @param {object} prepared - Prepared input being looked up.
 * @returns {Promise<object|null>} The first manifest matching both origin and
 *   part key, or `null` when there is none.
 */
async function readManifestByOrigin(manifestsDir, prepared) {
  const candidates = await readManifestsByOrigin(manifestsDir, prepared);
  for (const candidate of candidates) {
    if (manifestMatchesOriginPart(candidate, prepared)) {
      return candidate;
    }
  }
  return null;
}
|
|
5984
6955
|
async function loadGitignoreMatcher(repoRoot, enabled) {
|
|
5985
6956
|
if (!enabled) {
|
|
@@ -6046,7 +7017,13 @@ async function collectDirectoryFiles(rootDir, inputDir, repoRoot, options) {
|
|
|
6046
7017
|
continue;
|
|
6047
7018
|
}
|
|
6048
7019
|
const mimeType = guessMimeType(absolutePath);
|
|
6049
|
-
|
|
7020
|
+
let sourceKind = inferKind(mimeType, absolutePath);
|
|
7021
|
+
if (sourceKind === "binary" && path12.extname(absolutePath).toLowerCase() === ".zip") {
|
|
7022
|
+
const bytes = await fs11.readFile(absolutePath);
|
|
7023
|
+
if (isSlackExportArchive(bytes)) {
|
|
7024
|
+
sourceKind = "chat_export";
|
|
7025
|
+
}
|
|
7026
|
+
}
|
|
6050
7027
|
const sourceClass = sourceClassForRelativePath(relativePath, options);
|
|
6051
7028
|
if (!supportedDirectoryKind(sourceKind)) {
|
|
6052
7029
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: `unsupported_kind:${sourceKind}` });
|
|
@@ -6228,8 +7205,8 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6228
7205
|
const semanticHash = prepared.semanticHash ?? contentHash;
|
|
6229
7206
|
const extractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
|
|
6230
7207
|
const existingByOrigin = await readManifestByOrigin(paths.manifestsDir, prepared);
|
|
6231
|
-
const existingByHash = existingByOrigin ? null : await readManifestByHash(paths.manifestsDir, contentHash);
|
|
6232
|
-
if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath) {
|
|
7208
|
+
const existingByHash = existingByOrigin || prepared.sourcePartKey ? null : await readManifestByHash(paths.manifestsDir, contentHash);
|
|
7209
|
+
if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath && existingByOrigin.sourceGroupId === prepared.sourceGroupId && existingByOrigin.sourceGroupTitle === prepared.sourceGroupTitle && existingByOrigin.sourcePartKey === prepared.sourcePartKey && existingByOrigin.partIndex === prepared.partIndex && existingByOrigin.partCount === prepared.partCount && existingByOrigin.partTitle === prepared.partTitle && JSON.stringify(existingByOrigin.details ?? {}) === JSON.stringify(prepared.details ?? {})) {
|
|
6233
7210
|
return { manifest: existingByOrigin, isNew: false, wasUpdated: false };
|
|
6234
7211
|
}
|
|
6235
7212
|
if (existingByHash) {
|
|
@@ -6288,6 +7265,13 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6288
7265
|
mimeType: prepared.mimeType,
|
|
6289
7266
|
contentHash,
|
|
6290
7267
|
semanticHash,
|
|
7268
|
+
sourceGroupId: prepared.sourceGroupId,
|
|
7269
|
+
sourceGroupTitle: prepared.sourceGroupTitle,
|
|
7270
|
+
sourcePartKey: prepared.sourcePartKey,
|
|
7271
|
+
partIndex: prepared.partIndex,
|
|
7272
|
+
partCount: prepared.partCount,
|
|
7273
|
+
partTitle: prepared.partTitle,
|
|
7274
|
+
details: prepared.details,
|
|
6291
7275
|
createdAt: previous?.createdAt ?? now,
|
|
6292
7276
|
updatedAt: now,
|
|
6293
7277
|
attachments: manifestAttachments.length ? manifestAttachments : void 0
|
|
@@ -6309,6 +7293,42 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6309
7293
|
}
|
|
6310
7294
|
return { manifest, isNew: !previous, wasUpdated: Boolean(previous) };
|
|
6311
7295
|
}
|
|
7296
|
+
/**
 * Persists every prepared part of one input and removes stored manifests of
 * the same origin that are no longer produced.
 *
 * @param {string} rootDir - Workspace root.
 * @param {*} input - The original input reference, echoed back in the result.
 * @param {object[]} preparedInputs - Prepared parts; the first one is used to
 *   look up the manifests previously stored for this origin.
 * @param {object} paths - Workspace paths; `manifestsDir` is read here.
 * @returns {Promise<object>} `{ input, scannedCount, created, updated,
 *   unchanged, removed, skipped }`; `skipped` is always empty.
 */
async function persistPreparedInputs(rootDir, input, preparedInputs, paths) {
  const [template] = preparedInputs;
  // Snapshot the manifests stored for this origin before anything is written.
  const previouslyStored = template ? await readManifestsByOrigin(paths.manifestsDir, template) : [];
  const created = [];
  const updated = [];
  const unchanged = [];
  const seenSourceIds = /* @__PURE__ */ new Set();
  for (const prepared of preparedInputs) {
    const outcome = await persistPreparedInput(rootDir, prepared, paths);
    let bucket = unchanged;
    if (outcome.isNew) {
      bucket = created;
    } else if (outcome.wasUpdated) {
      bucket = updated;
    }
    bucket.push(outcome.manifest);
    seenSourceIds.add(outcome.manifest.sourceId);
  }
  // Anything stored earlier for this origin but not produced this time is stale.
  const removed = [];
  for (const stale of previouslyStored.filter((manifest) => !seenSourceIds.has(manifest.sourceId))) {
    await removeManifestArtifacts(rootDir, stale, paths);
    removed.push(stale);
  }
  return {
    input,
    scannedCount: preparedInputs.length,
    created,
    updated,
    unchanged,
    removed,
    skipped: []
  };
}
|
|
6312
7332
|
async function removeManifestArtifacts(rootDir, manifest, paths) {
|
|
6313
7333
|
await fs11.rm(path12.join(paths.manifestsDir, `${manifest.sourceId}.json`), { force: true });
|
|
6314
7334
|
await fs11.rm(path12.resolve(rootDir, manifest.storedPath), { force: true });
|
|
@@ -6335,10 +7355,10 @@ function repoSyncWorkspaceIgnorePaths(rootDir, paths, repoRoot) {
|
|
|
6335
7355
|
return candidates.map((candidate) => path12.resolve(candidate)).filter((candidate, index, items) => items.indexOf(candidate) === index).filter((candidate) => withinRoot(repoRoot, candidate));
|
|
6336
7356
|
}
|
|
6337
7357
|
/**
 * Reports whether a stored manifest already reflects a prepared input, so the
 * input does not need to be persisted again.
 *
 * @param {object} manifest - Stored manifest.
 * @param {object} prepared - Prepared input.
 * @param {string} contentHash - Content hash computed for `prepared`.
 * @returns {boolean} `true` only when the hashes, the descriptive fields, the
 *   grouping/part fields and the serialized `details` are all identical.
 */
function preparedMatchesManifest(manifest, prepared, contentHash) {
  if (manifest.contentHash !== contentHash) {
    return false;
  }
  // The extraction hash is only recomputed when the prepared input lacks one.
  const expectedExtractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
  if (manifest.extractionHash !== expectedExtractionHash) {
    return false;
  }
  // A prepared input without a semantic hash is compared via its content hash.
  if (manifest.semanticHash !== (prepared.semanticHash ?? contentHash)) {
    return false;
  }
  const mirroredFields = [
    "title",
    "sourceKind",
    "sourceType",
    "sourceClass",
    "language",
    "mimeType",
    "repoRelativePath",
    "sourceGroupId",
    "sourceGroupTitle",
    "sourcePartKey",
    "partIndex",
    "partCount",
    "partTitle"
  ];
  if (mirroredFields.some((field) => manifest[field] !== prepared[field])) {
    return false;
  }
  // Details are compared by their JSON serialization, so key order matters.
  return JSON.stringify(manifest.details ?? {}) === JSON.stringify(prepared.details ?? {});
}
|
|
6340
7360
|
/**
 * Reports whether watch mode should defer the semantic refresh for a source
 * of the given kind. Only the kinds listed below are deferred.
 *
 * @param {string} sourceKind - Source kind of a manifest or prepared input.
 * @returns {boolean}
 */
function shouldDeferWatchSemanticRefresh(sourceKind) {
  switch (sourceKind) {
    case "markdown":
    case "text":
    case "html":
    case "pdf":
    case "docx":
    case "epub":
    case "csv":
    case "xlsx":
    case "pptx":
    case "transcript":
    case "chat_export":
    case "email":
    case "calendar":
    case "image":
      return true;
    default:
      return false;
  }
}
|
|
6343
7363
|
function pendingSemanticRefreshId(changeType, repoRoot, relativePath) {
|
|
6344
7364
|
return `pending:${changeType}:${sha256(`${toPosix(repoRoot)}:${relativePath}`).slice(0, 12)}`;
|
|
@@ -6404,13 +7424,16 @@ async function syncTrackedRepos(rootDir, options, repoRoots) {
|
|
|
6404
7424
|
const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
|
|
6405
7425
|
for (const absolutePath of files) {
|
|
6406
7426
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6407
|
-
const
|
|
6408
|
-
|
|
6409
|
-
|
|
6410
|
-
|
|
6411
|
-
|
|
6412
|
-
|
|
6413
|
-
|
|
7427
|
+
const preparedInputs = await prepareFileInputs(
|
|
7428
|
+
rootDir,
|
|
7429
|
+
absolutePath,
|
|
7430
|
+
repoRoot,
|
|
7431
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
7432
|
+
);
|
|
7433
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
7434
|
+
imported.push(...result.created);
|
|
7435
|
+
updated.push(...result.updated);
|
|
7436
|
+
removed.push(...result.removed);
|
|
6414
7437
|
progress.tick();
|
|
6415
7438
|
}
|
|
6416
7439
|
progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
|
|
@@ -6469,9 +7492,6 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6469
7492
|
let scannedCount = 0;
|
|
6470
7493
|
for (const repoRoot of uniqueRoots) {
|
|
6471
7494
|
const repoManifests = manifestsByRepoRoot.get(repoRoot) ?? [];
|
|
6472
|
-
const manifestsByOriginalPath = new Map(
|
|
6473
|
-
repoManifests.filter((manifest) => manifest.originalPath).map((manifest) => [path12.resolve(manifest.originalPath), manifest])
|
|
6474
|
-
);
|
|
6475
7495
|
if (!await fileExists(repoRoot)) {
|
|
6476
7496
|
for (const manifest of repoManifests) {
|
|
6477
7497
|
if (shouldDeferWatchSemanticRefresh(manifest.sourceKind)) {
|
|
@@ -6507,38 +7527,50 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6507
7527
|
const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
|
|
6508
7528
|
for (const absolutePath of files) {
|
|
6509
7529
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6510
|
-
const
|
|
6511
|
-
|
|
6512
|
-
|
|
6513
|
-
|
|
6514
|
-
|
|
7530
|
+
const preparedInputs = await prepareFileInputs(
|
|
7531
|
+
rootDir,
|
|
7532
|
+
absolutePath,
|
|
7533
|
+
repoRoot,
|
|
7534
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
7535
|
+
);
|
|
7536
|
+
const firstPrepared = preparedInputs[0];
|
|
7537
|
+
if (firstPrepared && shouldDeferWatchSemanticRefresh(firstPrepared.sourceKind)) {
|
|
7538
|
+
const existing = repoManifests.filter(
|
|
7539
|
+
(manifest) => manifest.originalPath && path12.resolve(manifest.originalPath) === path12.resolve(absolutePath)
|
|
7540
|
+
);
|
|
7541
|
+
const existingByPartKey = new Map(existing.map((manifest) => [manifest.sourcePartKey ?? "__single__", manifest]));
|
|
7542
|
+
const changed = existing.length !== preparedInputs.length || preparedInputs.some((prepared) => {
|
|
7543
|
+
const match = existingByPartKey.get(prepared.sourcePartKey ?? "__single__");
|
|
7544
|
+
const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
|
|
7545
|
+
return !match || !preparedMatchesManifest(match, prepared, contentHash);
|
|
7546
|
+
}) || existing.some(
|
|
7547
|
+
(manifest) => !preparedInputs.some((prepared) => (prepared.sourcePartKey ?? "") === (manifest.sourcePartKey ?? ""))
|
|
7548
|
+
);
|
|
6515
7549
|
if (changed) {
|
|
6516
7550
|
pendingSemanticRefresh.push({
|
|
6517
7551
|
id: pendingSemanticRefreshId(
|
|
6518
|
-
existing ? "modified" : "added",
|
|
7552
|
+
existing.length ? "modified" : "added",
|
|
6519
7553
|
repoRoot,
|
|
6520
|
-
|
|
7554
|
+
firstPrepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
|
|
6521
7555
|
),
|
|
6522
7556
|
repoRoot,
|
|
6523
7557
|
path: toPosix(path12.relative(rootDir, absolutePath)),
|
|
6524
|
-
changeType: existing ? "modified" : "added",
|
|
7558
|
+
changeType: existing.length ? "modified" : "added",
|
|
6525
7559
|
detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6526
|
-
sourceId: existing?.sourceId,
|
|
6527
|
-
sourceKind:
|
|
7560
|
+
sourceId: existing[0]?.sourceId,
|
|
7561
|
+
sourceKind: firstPrepared.sourceKind
|
|
6528
7562
|
});
|
|
6529
|
-
|
|
6530
|
-
staleSourceIds.add(
|
|
7563
|
+
for (const manifest of existing) {
|
|
7564
|
+
staleSourceIds.add(manifest.sourceId);
|
|
6531
7565
|
}
|
|
6532
7566
|
}
|
|
6533
7567
|
progress.tick();
|
|
6534
7568
|
continue;
|
|
6535
7569
|
}
|
|
6536
|
-
const result = await
|
|
6537
|
-
|
|
6538
|
-
|
|
6539
|
-
|
|
6540
|
-
updated.push(result.manifest);
|
|
6541
|
-
}
|
|
7570
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
7571
|
+
imported.push(...result.created);
|
|
7572
|
+
updated.push(...result.updated);
|
|
7573
|
+
removed.push(...result.removed);
|
|
6542
7574
|
progress.tick();
|
|
6543
7575
|
}
|
|
6544
7576
|
progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
|
|
@@ -6592,8 +7624,25 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6592
7624
|
staleSourceIds: [...staleSourceIds]
|
|
6593
7625
|
};
|
|
6594
7626
|
}
|
|
6595
|
-
async function
|
|
7627
|
+
async function prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
6596
7628
|
const payloadBytes = await fs11.readFile(absoluteInput);
|
|
7629
|
+
if (path12.extname(absoluteInput).toLowerCase() === ".zip" && isSlackExportArchive(payloadBytes)) {
|
|
7630
|
+
const slackExport = await extractSlackExportArchive({ mimeType: "application/zip", bytes: payloadBytes, fileName: absoluteInput });
|
|
7631
|
+
if (slackExport.conversations.length) {
|
|
7632
|
+
return groupedPreparedInputsFor({
|
|
7633
|
+
title: slackExport.title?.trim() || path12.basename(absoluteInput, path12.extname(absoluteInput)),
|
|
7634
|
+
originType: "file",
|
|
7635
|
+
sourceKind: "chat_export",
|
|
7636
|
+
sourceClass,
|
|
7637
|
+
originalPath: toPosix(absoluteInput),
|
|
7638
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7639
|
+
mimeType: "application/zip",
|
|
7640
|
+
storedExtension: ".md",
|
|
7641
|
+
warnings: slackExport.warnings,
|
|
7642
|
+
parts: slackExport.conversations
|
|
7643
|
+
});
|
|
7644
|
+
}
|
|
7645
|
+
}
|
|
6597
7646
|
const mimeType = guessMimeType(absoluteInput);
|
|
6598
7647
|
const sourceKind = inferKind(mimeType, absoluteInput);
|
|
6599
7648
|
const language = inferCodeLanguage(absoluteInput, mimeType);
|
|
@@ -6623,6 +7672,118 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6623
7672
|
title = extracted.artifact.metadata?.title?.trim() || title;
|
|
6624
7673
|
extractedText = extracted.extractedText;
|
|
6625
7674
|
extractionArtifact = extracted.artifact;
|
|
7675
|
+
} else if (sourceKind === "transcript") {
|
|
7676
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7677
|
+
const extracted = await extractTranscriptText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7678
|
+
title = extracted.title?.trim() || title;
|
|
7679
|
+
extractedText = extracted.extractedText;
|
|
7680
|
+
extractionArtifact = extracted.artifact;
|
|
7681
|
+
} else if (sourceKind === "email" && path12.extname(absoluteInput).toLowerCase() === ".eml") {
|
|
7682
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7683
|
+
const extracted = await extractEmailText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7684
|
+
title = extracted.title?.trim() || title;
|
|
7685
|
+
extractedText = extracted.extractedText;
|
|
7686
|
+
extractionArtifact = extracted.artifact;
|
|
7687
|
+
} else if (sourceKind === "email" && path12.extname(absoluteInput).toLowerCase() === ".mbox") {
|
|
7688
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7689
|
+
const extracted = await extractMboxMessages({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7690
|
+
title = extracted.title?.trim() || title;
|
|
7691
|
+
if (extracted.messages.length) {
|
|
7692
|
+
return groupedPreparedInputsFor({
|
|
7693
|
+
title,
|
|
7694
|
+
originType: "file",
|
|
7695
|
+
sourceKind: "email",
|
|
7696
|
+
sourceClass,
|
|
7697
|
+
originalPath: toPosix(absoluteInput),
|
|
7698
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7699
|
+
mimeType,
|
|
7700
|
+
storedExtension: ".md",
|
|
7701
|
+
warnings: extracted.warnings,
|
|
7702
|
+
parts: extracted.messages
|
|
7703
|
+
});
|
|
7704
|
+
}
|
|
7705
|
+
extractionArtifact = {
|
|
7706
|
+
extractor: "email_text",
|
|
7707
|
+
sourceKind: "email",
|
|
7708
|
+
mimeType,
|
|
7709
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7710
|
+
warnings: extracted.warnings ?? ["Mailbox extraction completed but produced no readable messages."]
|
|
7711
|
+
};
|
|
7712
|
+
} else if (sourceKind === "calendar") {
|
|
7713
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7714
|
+
const extracted = await extractCalendarEvents({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7715
|
+
title = extracted.title?.trim() || title;
|
|
7716
|
+
if (extracted.events.length) {
|
|
7717
|
+
return groupedPreparedInputsFor({
|
|
7718
|
+
title,
|
|
7719
|
+
originType: "file",
|
|
7720
|
+
sourceKind: "calendar",
|
|
7721
|
+
sourceClass,
|
|
7722
|
+
originalPath: toPosix(absoluteInput),
|
|
7723
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7724
|
+
mimeType,
|
|
7725
|
+
storedExtension: ".md",
|
|
7726
|
+
warnings: extracted.warnings,
|
|
7727
|
+
parts: extracted.events
|
|
7728
|
+
});
|
|
7729
|
+
}
|
|
7730
|
+
extractionArtifact = {
|
|
7731
|
+
extractor: "calendar_text",
|
|
7732
|
+
sourceKind: "calendar",
|
|
7733
|
+
mimeType,
|
|
7734
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7735
|
+
warnings: extracted.warnings ?? ["Calendar extraction completed but found no events."]
|
|
7736
|
+
};
|
|
7737
|
+
} else if (sourceKind === "csv") {
|
|
7738
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7739
|
+
const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7740
|
+
title = extracted.title?.trim() || title;
|
|
7741
|
+
extractedText = extracted.extractedText;
|
|
7742
|
+
extractionArtifact = extracted.artifact;
|
|
7743
|
+
} else if (sourceKind === "xlsx") {
|
|
7744
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7745
|
+
const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7746
|
+
title = extracted.title?.trim() || title;
|
|
7747
|
+
extractedText = extracted.extractedText;
|
|
7748
|
+
extractionArtifact = extracted.artifact;
|
|
7749
|
+
} else if (sourceKind === "pptx") {
|
|
7750
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7751
|
+
const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7752
|
+
title = extracted.title?.trim() || title;
|
|
7753
|
+
extractedText = extracted.extractedText;
|
|
7754
|
+
extractionArtifact = extracted.artifact;
|
|
7755
|
+
} else if (sourceKind === "epub") {
|
|
7756
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7757
|
+
const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7758
|
+
title = extracted.title?.trim() || title;
|
|
7759
|
+
if (extracted.chapters.length) {
|
|
7760
|
+
return groupedPreparedInputsFor({
|
|
7761
|
+
title,
|
|
7762
|
+
originType: "file",
|
|
7763
|
+
sourceKind: "epub",
|
|
7764
|
+
sourceClass,
|
|
7765
|
+
originalPath: toPosix(absoluteInput),
|
|
7766
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7767
|
+
mimeType,
|
|
7768
|
+
storedExtension: ".md",
|
|
7769
|
+
warnings: extracted.warnings,
|
|
7770
|
+
parts: extracted.chapters.map((chapter) => ({
|
|
7771
|
+
...chapter,
|
|
7772
|
+
metadata: {
|
|
7773
|
+
...chapter.metadata,
|
|
7774
|
+
...extracted.author ? { author: extracted.author } : {}
|
|
7775
|
+
}
|
|
7776
|
+
}))
|
|
7777
|
+
});
|
|
7778
|
+
}
|
|
7779
|
+
extractedText = void 0;
|
|
7780
|
+
extractionArtifact = {
|
|
7781
|
+
extractor: "epub_text",
|
|
7782
|
+
sourceKind: "epub",
|
|
7783
|
+
mimeType,
|
|
7784
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7785
|
+
warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
|
|
7786
|
+
};
|
|
6626
7787
|
} else if (sourceKind === "image") {
|
|
6627
7788
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
6628
7789
|
const extracted = await extractImageWithVision(rootDir, {
|
|
@@ -6636,23 +7797,33 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6636
7797
|
} else {
|
|
6637
7798
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
6638
7799
|
}
|
|
6639
|
-
return
|
|
6640
|
-
|
|
6641
|
-
|
|
6642
|
-
|
|
6643
|
-
|
|
6644
|
-
|
|
6645
|
-
|
|
6646
|
-
|
|
6647
|
-
|
|
6648
|
-
|
|
6649
|
-
|
|
6650
|
-
|
|
6651
|
-
|
|
6652
|
-
|
|
6653
|
-
|
|
7800
|
+
return [
|
|
7801
|
+
finalizePreparedInput({
|
|
7802
|
+
title,
|
|
7803
|
+
originType: "file",
|
|
7804
|
+
sourceKind,
|
|
7805
|
+
sourceClass,
|
|
7806
|
+
language,
|
|
7807
|
+
originalPath: toPosix(absoluteInput),
|
|
7808
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7809
|
+
mimeType,
|
|
7810
|
+
storedExtension,
|
|
7811
|
+
payloadBytes,
|
|
7812
|
+
extractedText,
|
|
7813
|
+
extractionArtifact,
|
|
7814
|
+
extractionHash: buildExtractionHash(extractedText, extractionArtifact),
|
|
7815
|
+
details: extractionArtifact?.metadata
|
|
7816
|
+
})
|
|
7817
|
+
];
|
|
6654
7818
|
}
|
|
6655
|
-
async function
|
|
7819
|
+
async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
7820
|
+
const prepared = await prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass);
|
|
7821
|
+
if (!prepared.length) {
|
|
7822
|
+
throw new Error(`No ingestable sources were extracted from ${absoluteInput}.`);
|
|
7823
|
+
}
|
|
7824
|
+
return prepared[0];
|
|
7825
|
+
}
|
|
7826
|
+
async function prepareUrlInputs(rootDir, input, options) {
|
|
6656
7827
|
await validateUrlSafety(input);
|
|
6657
7828
|
const response = await fetch(input);
|
|
6658
7829
|
if (!response.ok) {
|
|
@@ -6661,6 +7832,25 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6661
7832
|
const finalUrl = normalizeOriginUrl(response.url || input);
|
|
6662
7833
|
const inputUrl = new URL(finalUrl);
|
|
6663
7834
|
const originalPayloadBytes = Buffer.from(await response.arrayBuffer());
|
|
7835
|
+
if (path12.extname(inputUrl.pathname).toLowerCase() === ".zip" && isSlackExportArchive(originalPayloadBytes)) {
|
|
7836
|
+
const slackExport = await extractSlackExportArchive({
|
|
7837
|
+
mimeType: "application/zip",
|
|
7838
|
+
bytes: originalPayloadBytes,
|
|
7839
|
+
fileName: inputUrl.pathname
|
|
7840
|
+
});
|
|
7841
|
+
if (slackExport.conversations.length) {
|
|
7842
|
+
return groupedPreparedInputsFor({
|
|
7843
|
+
title: slackExport.title?.trim() || inputUrl.hostname,
|
|
7844
|
+
originType: "url",
|
|
7845
|
+
sourceKind: "chat_export",
|
|
7846
|
+
url: finalUrl,
|
|
7847
|
+
mimeType: "application/zip",
|
|
7848
|
+
storedExtension: ".md",
|
|
7849
|
+
warnings: slackExport.warnings,
|
|
7850
|
+
parts: slackExport.conversations
|
|
7851
|
+
});
|
|
7852
|
+
}
|
|
7853
|
+
}
|
|
6664
7854
|
let payloadBytes = originalPayloadBytes;
|
|
6665
7855
|
let mimeType = resolveUrlMimeType(input, response);
|
|
6666
7856
|
let sourceKind = inferKind(mimeType, inputUrl.pathname);
|
|
@@ -6747,6 +7937,104 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6747
7937
|
title = extracted.artifact.metadata?.title?.trim() || title;
|
|
6748
7938
|
extractedText = extracted.extractedText;
|
|
6749
7939
|
extractionArtifact = extracted.artifact;
|
|
7940
|
+
} else if (sourceKind === "transcript") {
|
|
7941
|
+
const extracted = await extractTranscriptText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7942
|
+
title = extracted.title?.trim() || title;
|
|
7943
|
+
extractedText = extracted.extractedText;
|
|
7944
|
+
extractionArtifact = extracted.artifact;
|
|
7945
|
+
} else if (sourceKind === "email" && path12.extname(inputUrl.pathname).toLowerCase() === ".eml") {
|
|
7946
|
+
const extracted = await extractEmailText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7947
|
+
title = extracted.title?.trim() || title;
|
|
7948
|
+
extractedText = extracted.extractedText;
|
|
7949
|
+
extractionArtifact = extracted.artifact;
|
|
7950
|
+
} else if (sourceKind === "email" && path12.extname(inputUrl.pathname).toLowerCase() === ".mbox") {
|
|
7951
|
+
const extracted = await extractMboxMessages({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7952
|
+
title = extracted.title?.trim() || title;
|
|
7953
|
+
if (extracted.messages.length) {
|
|
7954
|
+
return groupedPreparedInputsFor({
|
|
7955
|
+
title,
|
|
7956
|
+
originType: "url",
|
|
7957
|
+
sourceKind: "email",
|
|
7958
|
+
url: finalUrl,
|
|
7959
|
+
mimeType,
|
|
7960
|
+
storedExtension: ".md",
|
|
7961
|
+
warnings: extracted.warnings,
|
|
7962
|
+
parts: extracted.messages
|
|
7963
|
+
});
|
|
7964
|
+
}
|
|
7965
|
+
extractionArtifact = {
|
|
7966
|
+
extractor: "email_text",
|
|
7967
|
+
sourceKind: "email",
|
|
7968
|
+
mimeType,
|
|
7969
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7970
|
+
warnings: extracted.warnings ?? ["Mailbox extraction completed but produced no readable messages."]
|
|
7971
|
+
};
|
|
7972
|
+
} else if (sourceKind === "calendar") {
|
|
7973
|
+
const extracted = await extractCalendarEvents({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7974
|
+
title = extracted.title?.trim() || title;
|
|
7975
|
+
if (extracted.events.length) {
|
|
7976
|
+
return groupedPreparedInputsFor({
|
|
7977
|
+
title,
|
|
7978
|
+
originType: "url",
|
|
7979
|
+
sourceKind: "calendar",
|
|
7980
|
+
url: finalUrl,
|
|
7981
|
+
mimeType,
|
|
7982
|
+
storedExtension: ".md",
|
|
7983
|
+
warnings: extracted.warnings,
|
|
7984
|
+
parts: extracted.events
|
|
7985
|
+
});
|
|
7986
|
+
}
|
|
7987
|
+
extractionArtifact = {
|
|
7988
|
+
extractor: "calendar_text",
|
|
7989
|
+
sourceKind: "calendar",
|
|
7990
|
+
mimeType,
|
|
7991
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7992
|
+
warnings: extracted.warnings ?? ["Calendar extraction completed but found no events."]
|
|
7993
|
+
};
|
|
7994
|
+
} else if (sourceKind === "csv") {
|
|
7995
|
+
const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7996
|
+
title = extracted.title?.trim() || title;
|
|
7997
|
+
extractedText = extracted.extractedText;
|
|
7998
|
+
extractionArtifact = extracted.artifact;
|
|
7999
|
+
} else if (sourceKind === "xlsx") {
|
|
8000
|
+
const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
8001
|
+
title = extracted.title?.trim() || title;
|
|
8002
|
+
extractedText = extracted.extractedText;
|
|
8003
|
+
extractionArtifact = extracted.artifact;
|
|
8004
|
+
} else if (sourceKind === "pptx") {
|
|
8005
|
+
const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
8006
|
+
title = extracted.title?.trim() || title;
|
|
8007
|
+
extractedText = extracted.extractedText;
|
|
8008
|
+
extractionArtifact = extracted.artifact;
|
|
8009
|
+
} else if (sourceKind === "epub") {
|
|
8010
|
+
const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
8011
|
+
title = extracted.title?.trim() || title;
|
|
8012
|
+
if (extracted.chapters.length) {
|
|
8013
|
+
return groupedPreparedInputsFor({
|
|
8014
|
+
title,
|
|
8015
|
+
originType: "url",
|
|
8016
|
+
sourceKind: "epub",
|
|
8017
|
+
url: finalUrl,
|
|
8018
|
+
mimeType,
|
|
8019
|
+
storedExtension: ".md",
|
|
8020
|
+
warnings: extracted.warnings,
|
|
8021
|
+
parts: extracted.chapters.map((chapter) => ({
|
|
8022
|
+
...chapter,
|
|
8023
|
+
metadata: {
|
|
8024
|
+
...chapter.metadata,
|
|
8025
|
+
...extracted.author ? { author: extracted.author } : {}
|
|
8026
|
+
}
|
|
8027
|
+
})),
|
|
8028
|
+
logDetails
|
|
8029
|
+
});
|
|
8030
|
+
}
|
|
8031
|
+
extractionArtifact = {
|
|
8032
|
+
extractor: "epub_text",
|
|
8033
|
+
sourceKind: "epub",
|
|
8034
|
+
mimeType,
|
|
8035
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8036
|
+
warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
|
|
8037
|
+
};
|
|
6750
8038
|
} else if (sourceKind === "image") {
|
|
6751
8039
|
const extracted = await extractImageWithVision(rootDir, {
|
|
6752
8040
|
title,
|
|
@@ -6758,22 +8046,32 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6758
8046
|
extractionArtifact = extracted.artifact;
|
|
6759
8047
|
}
|
|
6760
8048
|
}
|
|
6761
|
-
return
|
|
6762
|
-
|
|
6763
|
-
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
|
|
6767
|
-
|
|
6768
|
-
|
|
6769
|
-
|
|
6770
|
-
|
|
6771
|
-
|
|
6772
|
-
|
|
6773
|
-
|
|
6774
|
-
|
|
6775
|
-
|
|
6776
|
-
|
|
8049
|
+
return [
|
|
8050
|
+
finalizePreparedInput({
|
|
8051
|
+
title,
|
|
8052
|
+
originType: "url",
|
|
8053
|
+
sourceKind,
|
|
8054
|
+
language,
|
|
8055
|
+
url: finalUrl,
|
|
8056
|
+
mimeType,
|
|
8057
|
+
storedExtension,
|
|
8058
|
+
payloadBytes,
|
|
8059
|
+
extractedText,
|
|
8060
|
+
extractionArtifact,
|
|
8061
|
+
extractionHash: buildExtractionHash(extractedText, extractionArtifact),
|
|
8062
|
+
attachments,
|
|
8063
|
+
contentHash,
|
|
8064
|
+
details: extractionArtifact?.metadata,
|
|
8065
|
+
logDetails
|
|
8066
|
+
})
|
|
8067
|
+
];
|
|
8068
|
+
}
|
|
8069
|
+
async function prepareUrlInput(rootDir, input, options) {
|
|
8070
|
+
const prepared = await prepareUrlInputs(rootDir, input, options);
|
|
8071
|
+
if (!prepared.length) {
|
|
8072
|
+
throw new Error(`No ingestable sources were extracted from ${input}.`);
|
|
8073
|
+
}
|
|
8074
|
+
return prepared[0];
|
|
6777
8075
|
}
|
|
6778
8076
|
async function collectInboxAttachmentRefs(inputDir, files) {
|
|
6779
8077
|
const refsBySource = /* @__PURE__ */ new Map();
|
|
@@ -6905,18 +8203,38 @@ async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
|
|
|
6905
8203
|
};
|
|
6906
8204
|
}
|
|
6907
8205
|
function isSupportedInboxKind(sourceKind) {
|
|
6908
|
-
return [
|
|
8206
|
+
return [
|
|
8207
|
+
"markdown",
|
|
8208
|
+
"text",
|
|
8209
|
+
"html",
|
|
8210
|
+
"pdf",
|
|
8211
|
+
"docx",
|
|
8212
|
+
"epub",
|
|
8213
|
+
"csv",
|
|
8214
|
+
"xlsx",
|
|
8215
|
+
"pptx",
|
|
8216
|
+
"transcript",
|
|
8217
|
+
"chat_export",
|
|
8218
|
+
"email",
|
|
8219
|
+
"calendar",
|
|
8220
|
+
"image"
|
|
8221
|
+
].includes(sourceKind);
|
|
6909
8222
|
}
|
|
6910
8223
|
async function ingestInputDetailed(rootDir, input, options) {
|
|
6911
8224
|
const { paths } = await initWorkspace(rootDir);
|
|
6912
8225
|
const normalizedOptions = normalizeIngestOptions(options);
|
|
6913
8226
|
const absoluteInput = path12.resolve(rootDir, input);
|
|
6914
|
-
const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await
|
|
6915
|
-
const prepared = isHttpUrl(input) ? await
|
|
6916
|
-
return await
|
|
8227
|
+
const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await detectScopedRepoRoot(rootDir, absoluteInput, path12.dirname(absoluteInput));
|
|
8228
|
+
const prepared = isHttpUrl(input) ? await prepareUrlInputs(rootDir, input, normalizedOptions) : await prepareFileInputs(rootDir, absoluteInput, repoRoot);
|
|
8229
|
+
return await persistPreparedInputs(rootDir, input, prepared, paths);
|
|
6917
8230
|
}
|
|
6918
8231
|
async function ingestInput(rootDir, input, options) {
|
|
6919
|
-
|
|
8232
|
+
const result = await ingestInputDetailed(rootDir, input, options);
|
|
8233
|
+
const manifest = [...result.created, ...result.updated, ...result.unchanged][0];
|
|
8234
|
+
if (!manifest) {
|
|
8235
|
+
throw new Error(`No source manifests were created or updated for ${input}.`);
|
|
8236
|
+
}
|
|
8237
|
+
return manifest;
|
|
6920
8238
|
}
|
|
6921
8239
|
async function addInput(rootDir, input, options = {}) {
|
|
6922
8240
|
const { paths } = await initWorkspace(rootDir);
|
|
@@ -7004,23 +8322,59 @@ async function ingestDirectory(rootDir, inputDir, options) {
|
|
|
7004
8322
|
const { paths } = await initWorkspace(rootDir);
|
|
7005
8323
|
const normalizedOptions = await resolveRepoIngestOptions(rootDir, options);
|
|
7006
8324
|
const absoluteInputDir = path12.resolve(rootDir, inputDir);
|
|
7007
|
-
const repoRoot = normalizedOptions.repoRoot ?? await
|
|
8325
|
+
const repoRoot = normalizedOptions.repoRoot ?? await detectScopedRepoRoot(rootDir, absoluteInputDir, absoluteInputDir);
|
|
7008
8326
|
if (!await fileExists(absoluteInputDir)) {
|
|
7009
8327
|
throw new Error(`Directory not found: ${absoluteInputDir}`);
|
|
7010
8328
|
}
|
|
8329
|
+
if (await isSlackExportDirectory(absoluteInputDir)) {
|
|
8330
|
+
const extracted = await extractSlackExportDirectory(absoluteInputDir);
|
|
8331
|
+
const preparedInputs = groupedPreparedInputsFor({
|
|
8332
|
+
title: extracted.title?.trim() || path12.basename(absoluteInputDir),
|
|
8333
|
+
originType: "file",
|
|
8334
|
+
sourceKind: "chat_export",
|
|
8335
|
+
originalPath: toPosix(absoluteInputDir),
|
|
8336
|
+
mimeType: "application/json",
|
|
8337
|
+
storedExtension: ".md",
|
|
8338
|
+
warnings: extracted.warnings,
|
|
8339
|
+
parts: extracted.conversations
|
|
8340
|
+
});
|
|
8341
|
+
const result = await persistPreparedInputs(rootDir, absoluteInputDir, preparedInputs, paths);
|
|
8342
|
+
await appendLogEntry(rootDir, "ingest_directory", toPosix(path12.relative(rootDir, absoluteInputDir)) || ".", [
|
|
8343
|
+
`repo_root=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`,
|
|
8344
|
+
`scanned=${preparedInputs.length}`,
|
|
8345
|
+
`imported=${result.created.length}`,
|
|
8346
|
+
`updated=${result.updated.length}`,
|
|
8347
|
+
`skipped=${result.skipped.length}`
|
|
8348
|
+
]);
|
|
8349
|
+
return {
|
|
8350
|
+
inputDir: absoluteInputDir,
|
|
8351
|
+
repoRoot,
|
|
8352
|
+
scannedCount: preparedInputs.length,
|
|
8353
|
+
imported: result.created,
|
|
8354
|
+
updated: result.updated,
|
|
8355
|
+
skipped: result.skipped
|
|
8356
|
+
};
|
|
8357
|
+
}
|
|
7011
8358
|
const { files, skipped } = await collectDirectoryFiles(rootDir, absoluteInputDir, repoRoot, normalizedOptions);
|
|
7012
8359
|
const imported = [];
|
|
7013
8360
|
const updated = [];
|
|
7014
8361
|
const progress = createProgressReporter("ingest", files.length);
|
|
7015
8362
|
for (const absolutePath of files) {
|
|
7016
8363
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
7017
|
-
const
|
|
7018
|
-
|
|
7019
|
-
|
|
7020
|
-
|
|
7021
|
-
|
|
7022
|
-
|
|
7023
|
-
|
|
8364
|
+
const preparedInputs = await prepareFileInputs(
|
|
8365
|
+
rootDir,
|
|
8366
|
+
absolutePath,
|
|
8367
|
+
repoRoot,
|
|
8368
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
8369
|
+
);
|
|
8370
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
8371
|
+
if (result.created.length) {
|
|
8372
|
+
imported.push(...result.created);
|
|
8373
|
+
}
|
|
8374
|
+
if (result.updated.length) {
|
|
8375
|
+
updated.push(...result.updated);
|
|
8376
|
+
}
|
|
8377
|
+
if (!result.created.length && !result.updated.length && !result.removed.length) {
|
|
7024
8378
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
|
|
7025
8379
|
}
|
|
7026
8380
|
progress.tick();
|
|
@@ -7065,19 +8419,25 @@ async function importInbox(rootDir, inputDir) {
|
|
|
7065
8419
|
continue;
|
|
7066
8420
|
}
|
|
7067
8421
|
const mimeType = guessMimeType(absolutePath);
|
|
7068
|
-
|
|
8422
|
+
let sourceKind = inferKind(mimeType, absolutePath);
|
|
8423
|
+
if (sourceKind === "binary" && path12.extname(absolutePath).toLowerCase() === ".zip") {
|
|
8424
|
+
const bytes = await fs11.readFile(absolutePath);
|
|
8425
|
+
if (isSlackExportArchive(bytes)) {
|
|
8426
|
+
sourceKind = "chat_export";
|
|
8427
|
+
}
|
|
8428
|
+
}
|
|
7069
8429
|
if (!isSupportedInboxKind(sourceKind)) {
|
|
7070
8430
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: `unsupported_kind:${sourceKind}` });
|
|
7071
8431
|
continue;
|
|
7072
8432
|
}
|
|
7073
8433
|
const prepared = sourceKind === "markdown" && refsBySource.has(absolutePath) ? await prepareInboxMarkdownInput(absolutePath, refsBySource.get(absolutePath) ?? []) : sourceKind === "html" && refsBySource.has(absolutePath) ? await prepareInboxHtmlInput(absolutePath, refsBySource.get(absolutePath) ?? []) : await prepareFileInput(rootDir, absolutePath);
|
|
7074
|
-
const result = await
|
|
7075
|
-
if (!result.
|
|
8434
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, [prepared], paths);
|
|
8435
|
+
if (!result.created.length) {
|
|
7076
8436
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
|
|
7077
8437
|
continue;
|
|
7078
8438
|
}
|
|
7079
|
-
attachmentCount += result.manifest.attachments?.length ?? 0;
|
|
7080
|
-
imported.push(result.
|
|
8439
|
+
attachmentCount += result.created.reduce((total, manifest) => total + (manifest.attachments?.length ?? 0), 0);
|
|
8440
|
+
imported.push(...result.created);
|
|
7081
8441
|
}
|
|
7082
8442
|
await appendLogEntry(rootDir, "inbox_import", toPosix(path12.relative(rootDir, effectiveInputDir)) || ".", [
|
|
7083
8443
|
`scanned=${files.length}`,
|
|
@@ -9292,7 +10652,20 @@ function relatedOutputsSection(relatedOutputs) {
|
|
|
9292
10652
|
if (!relatedOutputs.length) {
|
|
9293
10653
|
return [];
|
|
9294
10654
|
}
|
|
9295
|
-
return ["## Related Outputs", "", ...relatedOutputs.map((page) => `- ${pageLink(page)}`), ""];
|
|
10655
|
+
return ["## Related Outputs", "", ...relatedOutputs.map((page) => `- ${pageLink(page)}`), ""];
|
|
10656
|
+
}
|
|
10657
|
+
function detailValue(manifest, key) {
|
|
10658
|
+
const value = manifest.details?.[key];
|
|
10659
|
+
const normalized = typeof value === "string" ? value.trim() : "";
|
|
10660
|
+
return normalized || void 0;
|
|
10661
|
+
}
|
|
10662
|
+
function detailList(manifest, key) {
|
|
10663
|
+
const value = detailValue(manifest, key);
|
|
10664
|
+
if (!value) {
|
|
10665
|
+
return void 0;
|
|
10666
|
+
}
|
|
10667
|
+
const items = value.split(",").map((item) => item.trim()).filter(Boolean);
|
|
10668
|
+
return items.length ? items : void 0;
|
|
9296
10669
|
}
|
|
9297
10670
|
function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutputs = [], modulePage, decorations) {
|
|
9298
10671
|
const relativePath = pagePathFor("source", manifest.sourceId);
|
|
@@ -9317,6 +10690,10 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
|
|
|
9317
10690
|
title: analysis.title,
|
|
9318
10691
|
...manifest.sourceType ? { source_type: manifest.sourceType } : {},
|
|
9319
10692
|
...manifest.sourceClass ? { source_class: manifest.sourceClass } : {},
|
|
10693
|
+
...detailValue(manifest, "occurred_at") ? { occurred_at: detailValue(manifest, "occurred_at") } : {},
|
|
10694
|
+
...detailList(manifest, "participants") ? { participants: detailList(manifest, "participants") } : {},
|
|
10695
|
+
...detailValue(manifest, "container_title") ? { container_title: detailValue(manifest, "container_title") } : {},
|
|
10696
|
+
...detailValue(manifest, "conversation_id") ? { conversation_id: detailValue(manifest, "conversation_id") } : {},
|
|
9320
10697
|
tags: decoratedTags(analysis.code ? ["source", "code"] : ["source"], decorations),
|
|
9321
10698
|
source_ids: [manifest.sourceId],
|
|
9322
10699
|
project_ids: decorations?.projectIds ?? [],
|
|
@@ -9336,9 +10713,19 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
|
|
|
9336
10713
|
`# ${analysis.title}`,
|
|
9337
10714
|
"",
|
|
9338
10715
|
`Source ID: \`${manifest.sourceId}\``,
|
|
10716
|
+
`Source Kind: \`${manifest.sourceKind}\``,
|
|
9339
10717
|
manifest.url ? `Source URL: ${manifest.url}` : `Source Path: \`${manifest.originalPath ?? manifest.storedPath}\``,
|
|
9340
10718
|
...manifest.sourceType ? [`Source Type: \`${manifest.sourceType}\``, ""] : [""],
|
|
9341
10719
|
...manifest.sourceClass ? [`Source Class: \`${manifest.sourceClass}\``, ""] : [],
|
|
10720
|
+
...manifest.sourceGroupTitle ? [`Source Group: ${manifest.sourceGroupTitle}`] : [],
|
|
10721
|
+
...manifest.partTitle ? [`Part: ${manifest.partIndex ?? "?"}/${manifest.partCount ?? "?"} - ${manifest.partTitle}`] : [],
|
|
10722
|
+
...manifest.details && Object.keys(manifest.details).length ? [
|
|
10723
|
+
"",
|
|
10724
|
+
"## Source Details",
|
|
10725
|
+
"",
|
|
10726
|
+
...Object.entries(manifest.details).map(([key, value]) => `- ${key.replace(/_/g, " ")}: ${value}`),
|
|
10727
|
+
""
|
|
10728
|
+
] : [],
|
|
9342
10729
|
"",
|
|
9343
10730
|
"## Summary",
|
|
9344
10731
|
"",
|
|
@@ -9639,6 +11026,9 @@ function buildIndexPage(pages, schemaHash, metadata, projectPages = []) {
|
|
|
9639
11026
|
const outputs = pages.filter((page) => page.kind === "output");
|
|
9640
11027
|
const insights = pages.filter((page) => page.kind === "insight");
|
|
9641
11028
|
const graphPages = pages.filter((page) => page.kind === "graph_report" || page.kind === "community_summary");
|
|
11029
|
+
const dashboards = pages.filter(
|
|
11030
|
+
(page) => page.kind === "index" && page.path.startsWith("dashboards/") && page.path !== "dashboards/index.md"
|
|
11031
|
+
);
|
|
9642
11032
|
return [
|
|
9643
11033
|
"---",
|
|
9644
11034
|
"page_id: index",
|
|
@@ -9684,6 +11074,10 @@ function buildIndexPage(pages, schemaHash, metadata, projectPages = []) {
|
|
|
9684
11074
|
"",
|
|
9685
11075
|
...outputs.length ? outputs.map((page) => `- [[${page.path.replace(/\.md$/, "")}|${page.title}]]`) : ["- No saved outputs yet."],
|
|
9686
11076
|
"",
|
|
11077
|
+
"## Dashboards",
|
|
11078
|
+
"",
|
|
11079
|
+
...dashboards.length ? dashboards.map((page) => `- [[${page.path.replace(/\.md$/, "")}|${page.title}]]`) : ["- No dashboards yet."],
|
|
11080
|
+
"",
|
|
9687
11081
|
"## Graph",
|
|
9688
11082
|
"",
|
|
9689
11083
|
...graphPages.length ? graphPages.map((page) => `- [[${page.path.replace(/\.md$/, "")}|${page.title}]]`) : ["- No graph reports yet."],
|
|
@@ -11160,15 +12554,37 @@ async function rebuildSearchIndex(dbPath, pages, wikiDir) {
|
|
|
11160
12554
|
const insertPage = db.prepare(
|
|
11161
12555
|
"INSERT INTO pages (id, path, title, body, kind, status, source_type, source_class, project_ids, project_key) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
|
11162
12556
|
);
|
|
12557
|
+
const rootDir = path21.dirname(wikiDir);
|
|
11163
12558
|
for (const page of pages) {
|
|
11164
12559
|
const absolutePath = path21.join(wikiDir, page.path);
|
|
11165
12560
|
const content = await fs17.readFile(absolutePath, "utf8");
|
|
11166
12561
|
const parsed = matter8(content);
|
|
12562
|
+
let body = parsed.content;
|
|
12563
|
+
const primarySourceId = Array.isArray(parsed.data.source_ids) && typeof parsed.data.source_ids[0] === "string" ? parsed.data.source_ids[0] : page.sourceIds[0];
|
|
12564
|
+
if ((page.kind === "source" || page.kind === "module") && primarySourceId) {
|
|
12565
|
+
try {
|
|
12566
|
+
const manifest = JSON.parse(
|
|
12567
|
+
await fs17.readFile(path21.join(rootDir, "state", "manifests", `${primarySourceId}.json`), "utf8")
|
|
12568
|
+
);
|
|
12569
|
+
const excerptPath = manifest.extractedTextPath ?? manifest.storedPath;
|
|
12570
|
+
if (excerptPath) {
|
|
12571
|
+
const excerpt = await fs17.readFile(path21.join(rootDir, excerptPath), "utf8");
|
|
12572
|
+
if (excerpt.trim()) {
|
|
12573
|
+
body = `${body}
|
|
12574
|
+
|
|
12575
|
+
## Source Excerpt
|
|
12576
|
+
|
|
12577
|
+
${excerpt.trim()}`.trim();
|
|
12578
|
+
}
|
|
12579
|
+
}
|
|
12580
|
+
} catch {
|
|
12581
|
+
}
|
|
12582
|
+
}
|
|
11167
12583
|
insertPage.run(
|
|
11168
12584
|
page.id,
|
|
11169
12585
|
page.path,
|
|
11170
12586
|
page.title,
|
|
11171
|
-
|
|
12587
|
+
body,
|
|
11172
12588
|
page.kind,
|
|
11173
12589
|
page.status,
|
|
11174
12590
|
typeof parsed.data.source_type === "string" ? parsed.data.source_type : "",
|
|
@@ -11229,7 +12645,25 @@ function searchPages(dbPath, query, limitOrOptions = 5) {
|
|
|
11229
12645
|
FROM page_search
|
|
11230
12646
|
JOIN pages ON pages.rowid = page_search.rowid
|
|
11231
12647
|
WHERE ${clauses.join(" AND ")}
|
|
11232
|
-
ORDER BY
|
|
12648
|
+
ORDER BY
|
|
12649
|
+
CASE pages.status
|
|
12650
|
+
WHEN 'active' THEN 0
|
|
12651
|
+
WHEN 'draft' THEN 1
|
|
12652
|
+
WHEN 'candidate' THEN 2
|
|
12653
|
+
ELSE 3
|
|
12654
|
+
END,
|
|
12655
|
+
CASE pages.kind
|
|
12656
|
+
WHEN 'source' THEN 0
|
|
12657
|
+
WHEN 'module' THEN 1
|
|
12658
|
+
WHEN 'output' THEN 2
|
|
12659
|
+
WHEN 'insight' THEN 3
|
|
12660
|
+
WHEN 'graph_report' THEN 4
|
|
12661
|
+
WHEN 'community_summary' THEN 5
|
|
12662
|
+
WHEN 'concept' THEN 6
|
|
12663
|
+
WHEN 'entity' THEN 7
|
|
12664
|
+
ELSE 8
|
|
12665
|
+
END,
|
|
12666
|
+
rank
|
|
11233
12667
|
LIMIT ?
|
|
11234
12668
|
`);
|
|
11235
12669
|
params.push(options.limit ?? 5);
|
|
@@ -11926,6 +13360,267 @@ async function buildManagedContent(absolutePath, defaults, build) {
|
|
|
11926
13360
|
}
|
|
11927
13361
|
return content;
|
|
11928
13362
|
}
|
|
13363
|
+
function manifestDetailValue(manifest, key) {
|
|
13364
|
+
const value = manifest.details?.[key];
|
|
13365
|
+
return typeof value === "string" && value.trim() ? value.trim() : void 0;
|
|
13366
|
+
}
|
|
13367
|
+
async function loadAnalysesBySourceIds(paths, sourceIds) {
|
|
13368
|
+
const analyses = await Promise.all(
|
|
13369
|
+
sourceIds.map(async (sourceId) => await readJsonFile(path22.join(paths.analysesDir, `${sourceId}.json`)))
|
|
13370
|
+
);
|
|
13371
|
+
return analyses.filter((analysis) => Boolean(analysis?.sourceId));
|
|
13372
|
+
}
|
|
13373
|
+
async function buildDashboardRecords(paths, graph, schemaHash, report) {
|
|
13374
|
+
const sourcePages = graph.pages.filter((page) => page.kind === "source");
|
|
13375
|
+
const reviewPages = graph.pages.filter((page) => page.kind === "output" && page.path.startsWith("outputs/source-reviews/"));
|
|
13376
|
+
const briefPages = graph.pages.filter((page) => page.kind === "output" && page.path.startsWith("outputs/source-briefs/"));
|
|
13377
|
+
const manifests = graph.sources;
|
|
13378
|
+
const manifestBySourceId = new Map(manifests.map((manifest) => [manifest.sourceId, manifest]));
|
|
13379
|
+
const timelineManifests = manifests.filter((manifest) => manifestDetailValue(manifest, "occurred_at")).sort((left, right) => (manifestDetailValue(right, "occurred_at") ?? "").localeCompare(manifestDetailValue(left, "occurred_at") ?? "")).slice(0, 25);
|
|
13380
|
+
const recentSourcePages = [...sourcePages].sort((left, right) => right.updatedAt.localeCompare(left.updatedAt)).slice(0, 20);
|
|
13381
|
+
const analyses = await loadAnalysesBySourceIds(paths, uniqueStrings3(sourcePages.flatMap((page) => page.sourceIds)));
|
|
13382
|
+
const openQuestions = uniqueStrings3(
|
|
13383
|
+
analyses.flatMap((analysis) => analysis.questions.map((question) => `${analysis.title}: ${question}`))
|
|
13384
|
+
).slice(0, 20);
|
|
13385
|
+
const dashboards = [
|
|
13386
|
+
{
|
|
13387
|
+
relativePath: "dashboards/index.md",
|
|
13388
|
+
title: "Dashboards",
|
|
13389
|
+
content: (metadata) => matter9.stringify(
|
|
13390
|
+
[
|
|
13391
|
+
"# Dashboards",
|
|
13392
|
+
"",
|
|
13393
|
+
"- [[dashboards/recent-sources|Recent Sources]]",
|
|
13394
|
+
"- [[dashboards/timeline|Timeline]]",
|
|
13395
|
+
"- [[dashboards/contradictions|Contradictions]]",
|
|
13396
|
+
"- [[dashboards/open-questions|Open Questions]]",
|
|
13397
|
+
"",
|
|
13398
|
+
"```dataview",
|
|
13399
|
+
"TABLE file.mtime AS updated",
|
|
13400
|
+
'FROM "dashboards"',
|
|
13401
|
+
'WHERE file.name != "index"',
|
|
13402
|
+
"SORT file.mtime desc",
|
|
13403
|
+
"```",
|
|
13404
|
+
""
|
|
13405
|
+
].join("\n"),
|
|
13406
|
+
{
|
|
13407
|
+
page_id: "dashboards:index",
|
|
13408
|
+
kind: "index",
|
|
13409
|
+
title: "Dashboards",
|
|
13410
|
+
tags: ["index", "dashboards"],
|
|
13411
|
+
source_ids: [],
|
|
13412
|
+
project_ids: [],
|
|
13413
|
+
node_ids: [],
|
|
13414
|
+
freshness: "fresh",
|
|
13415
|
+
status: metadata.status,
|
|
13416
|
+
confidence: 1,
|
|
13417
|
+
created_at: metadata.createdAt,
|
|
13418
|
+
updated_at: metadata.updatedAt,
|
|
13419
|
+
compiled_from: metadata.compiledFrom,
|
|
13420
|
+
managed_by: metadata.managedBy,
|
|
13421
|
+
backlinks: [],
|
|
13422
|
+
schema_hash: schemaHash,
|
|
13423
|
+
source_hashes: {},
|
|
13424
|
+
source_semantic_hashes: {}
|
|
13425
|
+
}
|
|
13426
|
+
)
|
|
13427
|
+
},
|
|
13428
|
+
{
|
|
13429
|
+
relativePath: "dashboards/recent-sources.md",
|
|
13430
|
+
title: "Recent Sources",
|
|
13431
|
+
content: (metadata) => matter9.stringify(
|
|
13432
|
+
[
|
|
13433
|
+
"# Recent Sources",
|
|
13434
|
+
"",
|
|
13435
|
+
...recentSourcePages.length ? recentSourcePages.map((page) => `- ${page.updatedAt}: [[${page.path.replace(/\.md$/, "")}|${page.title}]]`) : ["- No source pages yet."],
|
|
13436
|
+
"",
|
|
13437
|
+
"```dataview",
|
|
13438
|
+
"TABLE source_type, occurred_at, participants",
|
|
13439
|
+
'FROM "sources"',
|
|
13440
|
+
"SORT updated_at desc",
|
|
13441
|
+
"LIMIT 25",
|
|
13442
|
+
"```",
|
|
13443
|
+
""
|
|
13444
|
+
].join("\n"),
|
|
13445
|
+
{
|
|
13446
|
+
page_id: "dashboards:recent-sources",
|
|
13447
|
+
kind: "index",
|
|
13448
|
+
title: "Recent Sources",
|
|
13449
|
+
tags: ["index", "dashboard", "recent-sources"],
|
|
13450
|
+
source_ids: recentSourcePages.flatMap((page) => page.sourceIds),
|
|
13451
|
+
project_ids: [],
|
|
13452
|
+
node_ids: [],
|
|
13453
|
+
freshness: "fresh",
|
|
13454
|
+
status: metadata.status,
|
|
13455
|
+
confidence: 1,
|
|
13456
|
+
created_at: metadata.createdAt,
|
|
13457
|
+
updated_at: metadata.updatedAt,
|
|
13458
|
+
compiled_from: recentSourcePages.flatMap((page) => page.sourceIds),
|
|
13459
|
+
managed_by: metadata.managedBy,
|
|
13460
|
+
backlinks: [],
|
|
13461
|
+
schema_hash: schemaHash,
|
|
13462
|
+
source_hashes: {},
|
|
13463
|
+
source_semantic_hashes: {}
|
|
13464
|
+
}
|
|
13465
|
+
)
|
|
13466
|
+
},
|
|
13467
|
+
{
|
|
13468
|
+
relativePath: "dashboards/timeline.md",
|
|
13469
|
+
title: "Timeline",
|
|
13470
|
+
content: (metadata) => matter9.stringify(
|
|
13471
|
+
[
|
|
13472
|
+
"# Timeline",
|
|
13473
|
+
"",
|
|
13474
|
+
...timelineManifests.length ? timelineManifests.map((manifest) => {
|
|
13475
|
+
const occurredAt = manifestDetailValue(manifest, "occurred_at") ?? manifest.updatedAt;
|
|
13476
|
+
const sourcePage = sourcePages.find((page) => page.sourceIds.includes(manifest.sourceId));
|
|
13477
|
+
return `- ${occurredAt}: ${sourcePage ? `[[${sourcePage.path.replace(/\.md$/, "")}|${sourcePage.title}]]` : manifest.title}`;
|
|
13478
|
+
}) : ["- No timeline-aware sources yet."],
|
|
13479
|
+
"",
|
|
13480
|
+
"```dataview",
|
|
13481
|
+
"TABLE occurred_at, participants, container_title",
|
|
13482
|
+
'FROM "sources"',
|
|
13483
|
+
"WHERE occurred_at",
|
|
13484
|
+
"SORT occurred_at desc",
|
|
13485
|
+
"```",
|
|
13486
|
+
""
|
|
13487
|
+
].join("\n"),
|
|
13488
|
+
{
|
|
13489
|
+
page_id: "dashboards:timeline",
|
|
13490
|
+
kind: "index",
|
|
13491
|
+
title: "Timeline",
|
|
13492
|
+
tags: ["index", "dashboard", "timeline"],
|
|
13493
|
+
source_ids: timelineManifests.map((manifest) => manifest.sourceId),
|
|
13494
|
+
project_ids: [],
|
|
13495
|
+
node_ids: [],
|
|
13496
|
+
freshness: "fresh",
|
|
13497
|
+
status: metadata.status,
|
|
13498
|
+
confidence: 1,
|
|
13499
|
+
created_at: metadata.createdAt,
|
|
13500
|
+
updated_at: metadata.updatedAt,
|
|
13501
|
+
compiled_from: timelineManifests.map((manifest) => manifest.sourceId),
|
|
13502
|
+
managed_by: metadata.managedBy,
|
|
13503
|
+
backlinks: [],
|
|
13504
|
+
schema_hash: schemaHash,
|
|
13505
|
+
source_hashes: {},
|
|
13506
|
+
source_semantic_hashes: {}
|
|
13507
|
+
}
|
|
13508
|
+
)
|
|
13509
|
+
},
|
|
13510
|
+
{
|
|
13511
|
+
relativePath: "dashboards/contradictions.md",
|
|
13512
|
+
title: "Contradictions",
|
|
13513
|
+
content: (metadata) => matter9.stringify(
|
|
13514
|
+
[
|
|
13515
|
+
"# Contradictions",
|
|
13516
|
+
"",
|
|
13517
|
+
...report?.contradictions.length ? report.contradictions.map((contradiction) => {
|
|
13518
|
+
const left = manifestBySourceId.get(contradiction.sourceIdA)?.title ?? contradiction.sourceIdA;
|
|
13519
|
+
const right = manifestBySourceId.get(contradiction.sourceIdB)?.title ?? contradiction.sourceIdB;
|
|
13520
|
+
return `- ${left} / ${right}: ${contradiction.claimA} <> ${contradiction.claimB}`;
|
|
13521
|
+
}) : ["- No contradictions are currently flagged."],
|
|
13522
|
+
"",
|
|
13523
|
+
...reviewPages.length || briefPages.length ? [
|
|
13524
|
+
"## Related Reviews",
|
|
13525
|
+
"",
|
|
13526
|
+
...[...reviewPages, ...briefPages].slice(0, 12).map((page) => `- [[${page.path.replace(/\.md$/, "")}|${page.title}]]`),
|
|
13527
|
+
""
|
|
13528
|
+
] : [],
|
|
13529
|
+
"```dataview",
|
|
13530
|
+
'LIST FROM "outputs/source-reviews"',
|
|
13531
|
+
"SORT file.mtime desc",
|
|
13532
|
+
"```",
|
|
13533
|
+
""
|
|
13534
|
+
].join("\n"),
|
|
13535
|
+
{
|
|
13536
|
+
page_id: "dashboards:contradictions",
|
|
13537
|
+
kind: "index",
|
|
13538
|
+
title: "Contradictions",
|
|
13539
|
+
tags: ["index", "dashboard", "contradictions"],
|
|
13540
|
+
source_ids: report?.contradictions.flatMap((item) => [item.sourceIdA, item.sourceIdB]) ?? [],
|
|
13541
|
+
project_ids: [],
|
|
13542
|
+
node_ids: [],
|
|
13543
|
+
freshness: "fresh",
|
|
13544
|
+
status: metadata.status,
|
|
13545
|
+
confidence: 1,
|
|
13546
|
+
created_at: metadata.createdAt,
|
|
13547
|
+
updated_at: metadata.updatedAt,
|
|
13548
|
+
compiled_from: report?.contradictions.flatMap((item) => [item.sourceIdA, item.sourceIdB]) ?? [],
|
|
13549
|
+
managed_by: metadata.managedBy,
|
|
13550
|
+
backlinks: [],
|
|
13551
|
+
schema_hash: schemaHash,
|
|
13552
|
+
source_hashes: {},
|
|
13553
|
+
source_semantic_hashes: {}
|
|
13554
|
+
}
|
|
13555
|
+
)
|
|
13556
|
+
},
|
|
13557
|
+
{
|
|
13558
|
+
relativePath: "dashboards/open-questions.md",
|
|
13559
|
+
title: "Open Questions",
|
|
13560
|
+
content: (metadata) => matter9.stringify(
|
|
13561
|
+
[
|
|
13562
|
+
"# Open Questions",
|
|
13563
|
+
"",
|
|
13564
|
+
...openQuestions.length ? openQuestions.map((question) => `- ${question}`) : ["- No open questions are currently extracted."],
|
|
13565
|
+
"",
|
|
13566
|
+
"```dataview",
|
|
13567
|
+
'LIST FROM "outputs/source-briefs" OR "outputs/source-reviews"',
|
|
13568
|
+
"SORT file.mtime desc",
|
|
13569
|
+
"```",
|
|
13570
|
+
""
|
|
13571
|
+
].join("\n"),
|
|
13572
|
+
{
|
|
13573
|
+
page_id: "dashboards:open-questions",
|
|
13574
|
+
kind: "index",
|
|
13575
|
+
title: "Open Questions",
|
|
13576
|
+
tags: ["index", "dashboard", "open-questions"],
|
|
13577
|
+
source_ids: analyses.map((analysis) => analysis.sourceId),
|
|
13578
|
+
project_ids: [],
|
|
13579
|
+
node_ids: [],
|
|
13580
|
+
freshness: "fresh",
|
|
13581
|
+
status: metadata.status,
|
|
13582
|
+
confidence: 1,
|
|
13583
|
+
created_at: metadata.createdAt,
|
|
13584
|
+
updated_at: metadata.updatedAt,
|
|
13585
|
+
compiled_from: analyses.map((analysis) => analysis.sourceId),
|
|
13586
|
+
managed_by: metadata.managedBy,
|
|
13587
|
+
backlinks: [],
|
|
13588
|
+
schema_hash: schemaHash,
|
|
13589
|
+
source_hashes: {},
|
|
13590
|
+
source_semantic_hashes: {}
|
|
13591
|
+
}
|
|
13592
|
+
)
|
|
13593
|
+
}
|
|
13594
|
+
];
|
|
13595
|
+
const records = [];
|
|
13596
|
+
for (const dashboard of dashboards) {
|
|
13597
|
+
const absolutePath = path22.join(paths.wikiDir, dashboard.relativePath);
|
|
13598
|
+
const compiledFrom = dashboard.relativePath === "dashboards/recent-sources.md" ? recentSourcePages.flatMap((page) => page.sourceIds) : [];
|
|
13599
|
+
const content = await buildManagedContent(
|
|
13600
|
+
absolutePath,
|
|
13601
|
+
{
|
|
13602
|
+
managedBy: "system",
|
|
13603
|
+
compiledFrom
|
|
13604
|
+
},
|
|
13605
|
+
dashboard.content
|
|
13606
|
+
);
|
|
13607
|
+
records.push({
|
|
13608
|
+
page: emptyGraphPage({
|
|
13609
|
+
id: `dashboard:${dashboard.relativePath.replace(/\.md$/, "")}`,
|
|
13610
|
+
path: dashboard.relativePath,
|
|
13611
|
+
title: dashboard.title,
|
|
13612
|
+
kind: "index",
|
|
13613
|
+
sourceIds: compiledFrom,
|
|
13614
|
+
nodeIds: [],
|
|
13615
|
+
schemaHash,
|
|
13616
|
+
sourceHashes: {},
|
|
13617
|
+
confidence: 1
|
|
13618
|
+
}),
|
|
13619
|
+
content
|
|
13620
|
+
});
|
|
13621
|
+
}
|
|
13622
|
+
return records;
|
|
13623
|
+
}
|
|
11929
13624
|
function indexCompiledFrom(pages) {
|
|
11930
13625
|
return uniqueStrings3(pages.flatMap((page) => page.sourceIds));
|
|
11931
13626
|
}
|
|
@@ -12951,8 +14646,19 @@ async function syncVaultArtifacts(rootDir, input) {
|
|
|
12951
14646
|
input.previousState?.generatedAt,
|
|
12952
14647
|
contradictions
|
|
12953
14648
|
);
|
|
12954
|
-
|
|
12955
|
-
const
|
|
14649
|
+
const preliminaryPages = [...basePages, ...graphOrientation.records.map((record) => record.page)];
|
|
14650
|
+
const dashboardRecords = await buildDashboardRecords(
|
|
14651
|
+
paths,
|
|
14652
|
+
{
|
|
14653
|
+
...baseGraph,
|
|
14654
|
+
sources: input.manifests,
|
|
14655
|
+
pages: preliminaryPages
|
|
14656
|
+
},
|
|
14657
|
+
globalSchemaHash,
|
|
14658
|
+
graphOrientation.report
|
|
14659
|
+
);
|
|
14660
|
+
records.push(...graphOrientation.records, ...dashboardRecords);
|
|
14661
|
+
const allPages = uniqueBy([...preliminaryPages, ...dashboardRecords.map((record) => record.page)], (page) => page.id);
|
|
12956
14662
|
const graph = {
|
|
12957
14663
|
...baseGraph,
|
|
12958
14664
|
pages: allPages
|
|
@@ -13056,6 +14762,11 @@ async function syncVaultArtifacts(rootDir, input) {
|
|
|
13056
14762
|
["concepts/index.md", "concepts", activeConceptPages],
|
|
13057
14763
|
["entities/index.md", "entities", activeEntityPages],
|
|
13058
14764
|
["outputs/index.md", "outputs", allPages.filter((page) => page.kind === "output")],
|
|
14765
|
+
[
|
|
14766
|
+
"dashboards/index.md",
|
|
14767
|
+
"dashboards",
|
|
14768
|
+
allPages.filter((page) => page.kind === "index" && page.path.startsWith("dashboards/") && page.path !== "dashboards/index.md")
|
|
14769
|
+
],
|
|
13059
14770
|
["candidates/index.md", "candidates", candidatePages],
|
|
13060
14771
|
["graph/index.md", "graph", allPages.filter((page) => page.kind === "graph_report" || page.kind === "community_summary")]
|
|
13061
14772
|
]) {
|
|
@@ -13156,17 +14867,40 @@ async function refreshIndexesAndSearch(rootDir, pages) {
|
|
|
13156
14867
|
const compileState = await readJsonFile(paths.compileStatePath);
|
|
13157
14868
|
const globalSchemaHash = schemas.effective.global.hash;
|
|
13158
14869
|
const currentGraph = await readJsonFile(paths.graphPath);
|
|
13159
|
-
const
|
|
14870
|
+
const orientationPages = uniqueBy(
|
|
14871
|
+
pages.filter((page) => page.kind !== "graph_report" && page.kind !== "community_summary"),
|
|
14872
|
+
(page) => page.id
|
|
14873
|
+
);
|
|
14874
|
+
const basePages = uniqueBy(
|
|
14875
|
+
pages.filter(
|
|
14876
|
+
(page) => page.kind !== "graph_report" && page.kind !== "community_summary" && !(page.kind === "index" && page.path.startsWith("dashboards/"))
|
|
14877
|
+
),
|
|
14878
|
+
(page) => page.id
|
|
14879
|
+
);
|
|
13160
14880
|
const graphOrientation = currentGraph ? await buildGraphOrientationPages(
|
|
13161
14881
|
{
|
|
13162
14882
|
...currentGraph,
|
|
13163
|
-
pages:
|
|
14883
|
+
pages: orientationPages
|
|
13164
14884
|
},
|
|
13165
14885
|
paths,
|
|
13166
14886
|
globalSchemaHash,
|
|
13167
14887
|
compileState?.generatedAt
|
|
13168
14888
|
) : { records: [], report: null };
|
|
13169
|
-
const
|
|
14889
|
+
const dashboardRecords = currentGraph ? await buildDashboardRecords(
|
|
14890
|
+
paths,
|
|
14891
|
+
{
|
|
14892
|
+
...currentGraph,
|
|
14893
|
+
pages: [...basePages, ...graphOrientation.records.map((record) => record.page)]
|
|
14894
|
+
},
|
|
14895
|
+
globalSchemaHash,
|
|
14896
|
+
graphOrientation.report
|
|
14897
|
+
) : [];
|
|
14898
|
+
const pagesWithGraph = sortGraphPages(
|
|
14899
|
+
uniqueBy(
|
|
14900
|
+
[...basePages, ...graphOrientation.records.map((record) => record.page), ...dashboardRecords.map((record) => record.page)],
|
|
14901
|
+
(page) => page.id
|
|
14902
|
+
)
|
|
14903
|
+
);
|
|
13170
14904
|
if (currentGraph) {
|
|
13171
14905
|
await writeJsonFile(paths.graphPath, {
|
|
13172
14906
|
...currentGraph,
|
|
@@ -13194,6 +14928,7 @@ async function refreshIndexesAndSearch(rootDir, pages) {
|
|
|
13194
14928
|
ensureDir(path22.join(paths.wikiDir, "concepts")),
|
|
13195
14929
|
ensureDir(path22.join(paths.wikiDir, "entities")),
|
|
13196
14930
|
ensureDir(path22.join(paths.wikiDir, "outputs")),
|
|
14931
|
+
ensureDir(path22.join(paths.wikiDir, "dashboards")),
|
|
13197
14932
|
ensureDir(path22.join(paths.wikiDir, "graph")),
|
|
13198
14933
|
ensureDir(path22.join(paths.wikiDir, "graph", "communities")),
|
|
13199
14934
|
ensureDir(path22.join(paths.wikiDir, "projects")),
|
|
@@ -13256,6 +14991,11 @@ async function refreshIndexesAndSearch(rootDir, pages) {
|
|
|
13256
14991
|
["concepts/index.md", "concepts", pagesWithGraph.filter((page) => page.kind === "concept" && page.status !== "candidate")],
|
|
13257
14992
|
["entities/index.md", "entities", pagesWithGraph.filter((page) => page.kind === "entity" && page.status !== "candidate")],
|
|
13258
14993
|
["outputs/index.md", "outputs", pagesWithGraph.filter((page) => page.kind === "output")],
|
|
14994
|
+
[
|
|
14995
|
+
"dashboards/index.md",
|
|
14996
|
+
"dashboards",
|
|
14997
|
+
pagesWithGraph.filter((page) => page.kind === "index" && page.path.startsWith("dashboards/") && page.path !== "dashboards/index.md")
|
|
14998
|
+
],
|
|
13259
14999
|
["candidates/index.md", "candidates", pagesWithGraph.filter((page) => page.status === "candidate")],
|
|
13260
15000
|
["graph/index.md", "graph", pagesWithGraph.filter((page) => page.kind === "graph_report" || page.kind === "community_summary")]
|
|
13261
15001
|
]) {
|
|
@@ -13275,6 +15015,9 @@ async function refreshIndexesAndSearch(rootDir, pages) {
|
|
|
13275
15015
|
for (const record of graphOrientation.records) {
|
|
13276
15016
|
await writeFileIfChanged(path22.join(paths.wikiDir, record.page.path), record.content);
|
|
13277
15017
|
}
|
|
15018
|
+
for (const record of dashboardRecords) {
|
|
15019
|
+
await writeFileIfChanged(path22.join(paths.wikiDir, record.page.path), record.content);
|
|
15020
|
+
}
|
|
13278
15021
|
if (graphOrientation.report) {
|
|
13279
15022
|
await writeJsonFile(path22.join(paths.wikiDir, "graph", "report.json"), graphOrientation.report);
|
|
13280
15023
|
}
|
|
@@ -13291,6 +15034,11 @@ async function refreshIndexesAndSearch(rootDir, pages) {
|
|
|
13291
15034
|
await Promise.all(
|
|
13292
15035
|
existingGraphPages.filter((relativePath) => !allowedGraphPages.has(relativePath)).map((relativePath) => fs18.rm(path22.join(paths.wikiDir, relativePath), { force: true }))
|
|
13293
15036
|
);
|
|
15037
|
+
const existingDashboardPages = (await listFilesRecursive(path22.join(paths.wikiDir, "dashboards")).catch(() => [])).filter((absolutePath) => absolutePath.endsWith(".md")).map((absolutePath) => toPosix(path22.relative(paths.wikiDir, absolutePath)));
|
|
15038
|
+
const allowedDashboardPages = /* @__PURE__ */ new Set(["dashboards/index.md", ...dashboardRecords.map((record) => record.page.path)]);
|
|
15039
|
+
await Promise.all(
|
|
15040
|
+
existingDashboardPages.filter((relativePath) => !allowedDashboardPages.has(relativePath)).map((relativePath) => fs18.rm(path22.join(paths.wikiDir, relativePath), { force: true }))
|
|
15041
|
+
);
|
|
13294
15042
|
await rebuildSearchIndex(paths.searchDbPath, pagesWithGraph, paths.wikiDir);
|
|
13295
15043
|
}
|
|
13296
15044
|
async function prepareOutputPageSave(rootDir, input) {
|
|
@@ -13426,6 +15174,9 @@ async function stageOutputApprovalBundle(rootDir, stagedPages) {
|
|
|
13426
15174
|
});
|
|
13427
15175
|
return { approvalId, approvalDir };
|
|
13428
15176
|
}
|
|
15177
|
+
async function stageGeneratedOutputPages(rootDir, stagedPages) {
|
|
15178
|
+
return await stageOutputApprovalBundle(rootDir, stagedPages);
|
|
15179
|
+
}
|
|
13429
15180
|
async function executeQuery(rootDir, question, format) {
|
|
13430
15181
|
const { paths } = await loadVaultConfig(rootDir);
|
|
13431
15182
|
const schemas = await loadVaultSchemas(rootDir);
|
|
@@ -14767,7 +16518,17 @@ async function benchmarkVault(rootDir, options = {}) {
|
|
|
14767
16518
|
});
|
|
14768
16519
|
await writeJsonFile(paths.benchmarkPath, artifact);
|
|
14769
16520
|
await refreshIndexesAndSearch(rootDir, graph.pages);
|
|
14770
|
-
|
|
16521
|
+
const refreshedGraph = await readJsonFile(paths.graphPath) ?? graph;
|
|
16522
|
+
const refreshedHash = graphHash(refreshedGraph);
|
|
16523
|
+
if (artifact.graphHash === refreshedHash) {
|
|
16524
|
+
return artifact;
|
|
16525
|
+
}
|
|
16526
|
+
const refreshedArtifact = {
|
|
16527
|
+
...artifact,
|
|
16528
|
+
graphHash: refreshedHash
|
|
16529
|
+
};
|
|
16530
|
+
await writeJsonFile(paths.benchmarkPath, refreshedArtifact);
|
|
16531
|
+
return refreshedArtifact;
|
|
14771
16532
|
}
|
|
14772
16533
|
async function pathGraphVault(rootDir, from, to) {
|
|
14773
16534
|
const graph = await ensureCompiledGraph(rootDir);
|
|
@@ -14987,7 +16748,7 @@ async function bootstrapDemo(rootDir, input) {
|
|
|
14987
16748
|
}
|
|
14988
16749
|
|
|
14989
16750
|
// src/mcp.ts
|
|
14990
|
-
var SERVER_VERSION = "0.
|
|
16751
|
+
var SERVER_VERSION = "0.4.0";
|
|
14991
16752
|
async function createMcpServer(rootDir) {
|
|
14992
16753
|
const server = new McpServer({
|
|
14993
16754
|
name: "swarmvault",
|
|
@@ -15165,8 +16926,8 @@ async function createMcpServer(rootDir) {
|
|
|
15165
16926
|
}
|
|
15166
16927
|
},
|
|
15167
16928
|
async ({ input }) => {
|
|
15168
|
-
const
|
|
15169
|
-
return asToolText(
|
|
16929
|
+
const result = await ingestInputDetailed(rootDir, input);
|
|
16930
|
+
return asToolText(result);
|
|
15170
16931
|
}
|
|
15171
16932
|
);
|
|
15172
16933
|
server.registerTool(
|
|
@@ -15843,7 +17604,7 @@ function matchesManagedSourceSpec(existing, input) {
|
|
|
15843
17604
|
if (existing.kind !== input.kind) {
|
|
15844
17605
|
return false;
|
|
15845
17606
|
}
|
|
15846
|
-
if (input.kind === "directory") {
|
|
17607
|
+
if (input.kind === "directory" || input.kind === "file") {
|
|
15847
17608
|
return path25.resolve(existing.path ?? "") === path25.resolve(input.path);
|
|
15848
17609
|
}
|
|
15849
17610
|
return (existing.url ?? "") === input.url;
|
|
@@ -15855,10 +17616,15 @@ async function resolveManagedSourceInput(rootDir, input) {
|
|
|
15855
17616
|
if (!stat) {
|
|
15856
17617
|
throw new Error(`Source not found: ${input}`);
|
|
15857
17618
|
}
|
|
17619
|
+
if (stat.isFile()) {
|
|
17620
|
+
return {
|
|
17621
|
+
kind: "file",
|
|
17622
|
+
path: absoluteInput,
|
|
17623
|
+
title: path25.basename(absoluteInput, path25.extname(absoluteInput)) || absoluteInput
|
|
17624
|
+
};
|
|
17625
|
+
}
|
|
15858
17626
|
if (!stat.isDirectory()) {
|
|
15859
|
-
throw new Error(
|
|
15860
|
-
"`swarmvault source add` supports directories, public GitHub repo root URLs, and docs hubs. Use `swarmvault ingest` for single files."
|
|
15861
|
-
);
|
|
17627
|
+
throw new Error("`swarmvault source add` supports local files, directories, public GitHub repo root URLs, and docs hubs.");
|
|
15862
17628
|
}
|
|
15863
17629
|
const detectedRepoRoot = await findNearestGitRoot3(absoluteInput);
|
|
15864
17630
|
const repoRoot = detectedRepoRoot && !(withinRoot2(rootDir, absoluteInput) && !withinRoot2(rootDir, detectedRepoRoot)) ? detectedRepoRoot : absoluteInput;
|
|
@@ -15891,6 +17657,10 @@ async function resolveManagedSourceInput(rootDir, input) {
|
|
|
15891
17657
|
function directorySourceIdsFor(manifests, inputPath) {
|
|
15892
17658
|
return manifests.filter((manifest) => manifest.originalPath && withinRoot2(path25.resolve(inputPath), path25.resolve(manifest.originalPath))).map((manifest) => manifest.sourceId).sort((left, right) => left.localeCompare(right));
|
|
15893
17659
|
}
|
|
17660
|
+
function fileSourceIdsFor(manifests, inputPath) {
|
|
17661
|
+
const absoluteInput = path25.resolve(inputPath);
|
|
17662
|
+
return manifests.filter((manifest) => manifest.originalPath && path25.resolve(manifest.originalPath) === absoluteInput).map((manifest) => manifest.sourceId).sort((left, right) => left.localeCompare(right));
|
|
17663
|
+
}
|
|
15894
17664
|
async function syncDirectorySource(rootDir, inputPath, repoRoot) {
|
|
15895
17665
|
const manifestsBefore = await listManifests(rootDir);
|
|
15896
17666
|
const previousInScope = manifestsBefore.filter(
|
|
@@ -15924,6 +17694,22 @@ async function syncDirectorySource(rootDir, inputPath, repoRoot) {
|
|
|
15924
17694
|
changed: result.imported.length + result.updated.length + removed.length > 0
|
|
15925
17695
|
};
|
|
15926
17696
|
}
|
|
17697
|
+
async function syncFileSource(rootDir, inputPath) {
|
|
17698
|
+
const result = await ingestInputDetailed(rootDir, inputPath);
|
|
17699
|
+
const manifestsAfter = await listManifests(rootDir);
|
|
17700
|
+
return {
|
|
17701
|
+
title: path25.basename(inputPath, path25.extname(inputPath)) || inputPath,
|
|
17702
|
+
sourceIds: fileSourceIdsFor(manifestsAfter, inputPath),
|
|
17703
|
+
counts: {
|
|
17704
|
+
scannedCount: result.scannedCount,
|
|
17705
|
+
importedCount: result.created.length,
|
|
17706
|
+
updatedCount: result.updated.length,
|
|
17707
|
+
removedCount: result.removed.length,
|
|
17708
|
+
skippedCount: result.skipped.length
|
|
17709
|
+
},
|
|
17710
|
+
changed: result.created.length + result.updated.length + result.removed.length > 0
|
|
17711
|
+
};
|
|
17712
|
+
}
|
|
15927
17713
|
async function runGitCommand(cwd, args) {
|
|
15928
17714
|
await new Promise((resolve, reject) => {
|
|
15929
17715
|
const child = spawn2("git", args, {
|
|
@@ -15970,12 +17756,11 @@ async function syncCrawlSource(rootDir, entry, options) {
|
|
|
15970
17756
|
let updatedCount = 0;
|
|
15971
17757
|
for (const pageUrl of crawl.pages) {
|
|
15972
17758
|
const persisted = await ingestInputDetailed(rootDir, pageUrl);
|
|
15973
|
-
currentSourceIds.push(persisted.manifest.sourceId);
|
|
15974
|
-
|
|
15975
|
-
|
|
15976
|
-
|
|
15977
|
-
|
|
15978
|
-
}
|
|
17759
|
+
currentSourceIds.push(...persisted.created.map((manifest) => manifest.sourceId));
|
|
17760
|
+
currentSourceIds.push(...persisted.updated.map((manifest) => manifest.sourceId));
|
|
17761
|
+
currentSourceIds.push(...persisted.unchanged.map((manifest) => manifest.sourceId));
|
|
17762
|
+
importedCount += persisted.created.length;
|
|
17763
|
+
updatedCount += persisted.updated.length;
|
|
15979
17764
|
}
|
|
15980
17765
|
let removedCount = 0;
|
|
15981
17766
|
for (const sourceId of previousSourceIds) {
|
|
@@ -16019,6 +17804,22 @@ async function syncManagedSource(rootDir, entry, options) {
|
|
|
16019
17804
|
};
|
|
16020
17805
|
}
|
|
16021
17806
|
sync = await syncDirectorySource(rootDir, entry.path, entry.repoRoot);
|
|
17807
|
+
} else if (entry.kind === "file") {
|
|
17808
|
+
if (!entry.path) {
|
|
17809
|
+
throw new Error(`Managed source ${entry.id} is missing its file path.`);
|
|
17810
|
+
}
|
|
17811
|
+
if (!await fileExists(entry.path)) {
|
|
17812
|
+
return {
|
|
17813
|
+
...entry,
|
|
17814
|
+
status: "missing",
|
|
17815
|
+
updatedAt: now,
|
|
17816
|
+
lastSyncAt: now,
|
|
17817
|
+
lastSyncStatus: "error",
|
|
17818
|
+
lastError: `File not found: ${entry.path}`,
|
|
17819
|
+
changed: false
|
|
17820
|
+
};
|
|
17821
|
+
}
|
|
17822
|
+
sync = await syncFileSource(rootDir, entry.path);
|
|
16022
17823
|
} else if (entry.kind === "github_repo") {
|
|
16023
17824
|
sync = await syncGitHubRepoSource(rootDir, entry);
|
|
16024
17825
|
} else {
|
|
@@ -16237,6 +18038,179 @@ async function generateBriefsForSources(rootDir, sources) {
|
|
|
16237
18038
|
}
|
|
16238
18039
|
return briefPaths;
|
|
16239
18040
|
}
|
|
18041
|
+
/**
 * Renders the provider-free markdown review for a source scope.
 *
 * Sections are emitted in a fixed order: What This Source Contains, Likely
 * Canonical Pages To Update, Important Topics And Entities, Contradictions To
 * Inspect, Open Questions, Suggested Next Steps. A section with nothing to
 * list gets a fixed placeholder line instead.
 *
 * @param {object} input
 * @param {{title: string, sourceIds: string[]}} input.scope - Scope under review.
 * @param {Array<{kind: string, path: string, title: string}>} input.sourcePages - Pages linked to the scope.
 * @param {Array<object>} input.analyses - Analyses exposing `title`, `summary`,
 *   `questions`, `concepts[].name` and `entities[].name`.
 * @param {object} [input.report] - Optional report; a missing report or a report
 *   without a `contradictions` array is treated as "no contradictions".
 * @returns {string} Markdown text ending with a newline.
 */
function renderDeterministicSourceReview(input) {
  const { scope, sourcePages, analyses, report } = input;
  // Wiki links point at the page path without its ".md" suffix.
  const wikiLink = (page) => `[[${page.path.replace(/\.md$/, "")}|${page.title}]]`;

  const canonicalPages = sourcePages.filter((page) => page.kind === "source" || page.kind === "concept" || page.kind === "entity").slice(0, 10);
  const modulePages = sourcePages.filter((page) => page.kind === "module").slice(0, 8);
  const questions = uniqueStrings4(analyses.flatMap((analysis) => analysis.questions)).slice(0, 8);
  const concepts = uniqueStrings4(analyses.flatMap((analysis) => analysis.concepts.map((concept) => concept.name))).slice(0, 8);
  const entities = uniqueStrings4(analyses.flatMap((analysis) => analysis.entities.map((entity) => entity.name))).slice(0, 8);

  // A report may exist without a contradictions list; do not crash on it.
  const scopedIds = new Set(scope.sourceIds ?? []);
  const contradictions = (report?.contradictions ?? []).filter(
    (contradiction) => scopedIds.has(contradiction.sourceIdA) || scopedIds.has(contradiction.sourceIdB)
  );

  return [
    `# Source Review: ${scope.title}`,
    "",
    "## What This Source Contains",
    "",
    ...(analyses.length ? analyses.map((analysis) => `- ${analysis.title}: ${analysis.summary}`) : ["- This source has not been analyzed yet. Compile the vault before trusting downstream pages."]),
    "",
    "## Likely Canonical Pages To Update",
    "",
    ...(canonicalPages.length ? canonicalPages.map((page) => `- ${wikiLink(page)}`) : ["- No canonical source, concept, or entity pages are linked to this source yet."]),
    "",
    "## Important Topics And Entities",
    "",
    concepts.length ? `Concepts: ${concepts.join(", ")}` : "Concepts: none detected.",
    entities.length ? `Entities: ${entities.join(", ")}` : "Entities: none detected.",
    // Module links are separated from the topic lines by one blank line.
    ...(modulePages.length ? ["", ...modulePages.map((page) => `- Module: ${wikiLink(page)}`)] : []),
    "",
    "## Contradictions To Inspect",
    "",
    ...(contradictions.length ? contradictions.map((contradiction) => `- ${contradiction.claimA} / ${contradiction.claimB}`) : ["- No contradictions are currently flagged for this source scope."]),
    "",
    "## Open Questions",
    "",
    ...(questions.length ? questions.map((question) => `- ${question}`) : ["- No extracted open questions yet."]),
    "",
    "## Suggested Next Steps",
    "",
    ...(canonicalPages.length ? canonicalPages.slice(0, 5).map((page) => `- Review ${wikiLink(page)} for canonical updates.`) : ["- Review the source page and decide which canonical pages should exist."]),
    ""
  ].join("\n");
}
|
|
18081
|
+
/**
 * Produces the markdown body of a source review for the given scope.
 *
 * The graph file is read first; when it cannot be read the vault is compiled
 * once and the read is retried. A deterministic draft is always rendered. If
 * the "queryProvider" task resolves to a non-heuristic provider, that provider
 * is asked to write the review and the draft is used only as a fallback.
 *
 * @param {string} rootDir - Vault root.
 * @param {{id: string, title: string, sourceIds: string[]}} scope - Scope to review.
 * @returns {Promise<string|null>} Review markdown, or null when no graph is
 *   available even after compiling.
 */
async function generateSourceReviewMarkdown(rootDir, scope) {
  const { paths } = await loadVaultConfig(rootDir);
  let graph = await readJsonFile(paths.graphPath);
  if (!graph) {
    await compileVault(rootDir, {});
    graph = await readJsonFile(paths.graphPath);
    if (!graph) {
      return null;
    }
  }

  const sourcePages = scopedSourcePages(graph, scope.sourceIds);
  const analyses = await loadSourceAnalyses(rootDir, scope.sourceIds);
  const report = await readGraphReport(rootDir);
  const fallback = renderDeterministicSourceReview({ scope, sourcePages, graph, analyses, report });

  const provider = await getProviderForTask(rootDir, "queryProvider");
  if (provider.type === "heuristic") {
    return fallback;
  }

  // Any failure while building the prompt or calling the provider falls back
  // to the deterministic draft.
  try {
    const schemas = await loadVaultSchemas(rootDir);

    const pageLines = [];
    for (const page of sourcePages.slice(0, 12)) {
      pageLines.push(`- ${page.title} (${page.kind}) -> ${page.path}`);
    }
    const pageContext = pageLines.join("\n");

    const describeAnalysis = (analysis) => [
      `# ${analysis.title}`,
      `Summary: ${analysis.summary}`,
      `Questions: ${analysis.questions.join(" | ") || "none"}`,
      `Concepts: ${analysis.concepts.map((concept) => concept.name).join(", ") || "none"}`,
      `Entities: ${analysis.entities.map((entity) => entity.name).join(", ") || "none"}`
    ].join("\n");
    const analysisContext = analyses.slice(0, 8).map(describeAnalysis).join("\n\n---\n\n");

    const system = buildSchemaPrompt(
      schemas.effective.global,
      "Write a concise markdown source review with sections: What This Source Contains, Likely Canonical Pages To Update, Important Topics And Entities, Contradictions To Inspect, Open Questions, Suggested Next Steps. Focus on helping a human decide what to keep, update, or question in the wiki."
    );
    const prompt = [
      `Source scope: ${scope.title}`,
      `Scope id: ${scope.id}`,
      `Tracked source ids: ${scope.sourceIds.join(", ") || "none"}`,
      "",
      "Pages:",
      pageContext || "- none",
      "",
      "Analyses:",
      analysisContext || "No analysis context available.",
      "",
      "Deterministic fallback draft:",
      fallback
    ].join("\n");

    const response = await provider.generateText({ system, prompt });
    const text = response.text?.trim();
    return text ? text : fallback;
  } catch {
    return fallback;
  }
}
|
|
18140
|
+
/**
 * Generates a source review for a scope and stages it as a draft output page.
 *
 * @param {string} rootDir - Vault root.
 * @param {{id: string, title: string, sourceIds: string[]}} scope - Scope to review.
 * @returns {Promise<{sourceId: string, pageId: *, reviewPath: string, staged: true, approvalId: *, approvalDir: *}>}
 *   Where the staged review lives and which approval holds it.
 * @throws {Error} When no review markdown could be generated for the scope.
 */
async function stageSourceReviewForScope(rootDir, scope) {
  const { paths } = await loadVaultConfig(rootDir);
  const markdown = await generateSourceReviewMarkdown(rootDir, scope);
  if (!markdown) {
    throw new Error(`Could not generate a source review for ${scope.id}.`);
  }

  // The graph is read after generation so related pages and nodes can be
  // attached to the staged page.
  const graph = await readJsonFile(paths.graphPath);
  let relatedPages = [];
  let relatedNodeIds = [];
  if (graph) {
    relatedPages = scopedSourcePages(graph, scope.sourceIds);
  }
  const relatedPageIds = relatedPages.slice(0, 16).map((page) => page.id);
  if (graph) {
    relatedNodeIds = scopedNodeIds(graph, scope.sourceIds).slice(0, 24);
  }
  const projectIds = uniqueStrings4(relatedPages.flatMap((page) => page.projectIds));

  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const metadata = {
    status: "draft",
    createdAt: timestamp,
    updatedAt: timestamp,
    compiledFrom: scope.sourceIds,
    managedBy: "system",
    confidence: 0.79
  };
  const { page, content } = buildOutputPage({
    title: `Source Review: ${scope.title}`,
    question: `Review ${scope.title}`,
    answer: markdown,
    citations: scope.sourceIds,
    schemaHash: graph?.generatedAt ?? "",
    outputFormat: "report",
    relatedPageIds,
    relatedNodeIds,
    relatedSourceIds: scope.sourceIds,
    projectIds,
    extraTags: ["source-review"],
    origin: "query",
    slug: `source-reviews/${scope.id}`,
    metadata
  });

  const approval = await stageGeneratedOutputPages(rootDir, [{ page, content }]);
  return {
    sourceId: scope.id,
    pageId: page.id,
    reviewPath: path25.join(approval.approvalDir, "wiki", page.path),
    staged: true,
    approvalId: approval.approvalId,
    approvalDir: approval.approvalDir
  };
}
|
|
18185
|
+
/**
 * Projects a managed source record onto the review-scope shape.
 *
 * @param {{id: string, title: string, sourceIds: string[]}} source - Managed source record.
 * @returns {{id: string, title: string, sourceIds: string[]}} Scope carrying only
 *   the id, title, and source ids of the record.
 */
function scopeFromManagedSource(source) {
  const { id, title, sourceIds } = source;
  return { id, title, sourceIds };
}
|
|
18192
|
+
/**
 * Stages a source review for an explicit scope.
 *
 * @param {string} rootDir - Vault root.
 * @param {{id: string, title: string, sourceIds: string[]}} scope - Scope to review.
 * @returns {Promise<object>} Whatever `stageSourceReviewForScope` resolves to.
 */
async function reviewSourceScope(rootDir, scope) {
  const stagedReview = await stageSourceReviewForScope(rootDir, scope);
  return stagedReview;
}
|
|
18195
|
+
/**
 * Stages a source review for a managed source id or, failing that, for a
 * single manifest whose `sourceId` matches.
 *
 * @param {string} rootDir - Vault root.
 * @param {string} id - Managed source id or manifest source id.
 * @returns {Promise<object>} Result of staging the review.
 * @throws {Error} When neither a managed source nor a manifest matches `id`.
 */
async function reviewManagedSource(rootDir, id) {
  const managedSource = (await loadManagedSources(rootDir)).find((source) => source.id === id);

  if (!managedSource) {
    // Not a managed source: fall back to a single ingested manifest.
    const manifests = await listManifests(rootDir);
    const manifest = manifests.find((candidate) => candidate.sourceId === id);
    if (!manifest) {
      throw new Error(`Managed source or source id not found: ${id}`);
    }
    return await stageSourceReviewForScope(rootDir, {
      id: manifest.sourceId,
      title: manifest.title,
      sourceIds: [manifest.sourceId]
    });
  }

  // Compile first when the graph file does not exist yet.
  const { paths } = await loadVaultConfig(rootDir);
  const graphExists = await fileExists(paths.graphPath);
  if (!graphExists) {
    await compileVault(rootDir, {});
  }
  return await stageSourceReviewForScope(rootDir, scopeFromManagedSource(managedSource));
}
|
|
16240
18214
|
/**
 * Decides whether a compile should run after a sync.
 *
 * @param {Array} changedSources - Sources that changed during the sync.
 * @param {boolean} graphExists - Whether a graph already exists.
 * @param {boolean} compileRequested - Whether the caller asked for a compile.
 * @returns {boolean} True when a compile was requested and either no graph
 *   exists or at least one source changed. A falsy `compileRequested` is
 *   returned as-is.
 */
function shouldCompile(changedSources, graphExists, compileRequested) {
  if (!compileRequested) {
    return compileRequested;
  }
  // Without a graph a compile is always needed; the change list is not consulted.
  if (!graphExists) {
    return true;
  }
  return changedSources.length > 0;
}
|
|
@@ -16247,17 +18221,18 @@ async function listManagedSourceRecords(rootDir) {
|
|
|
16247
18221
|
async function addManagedSource(rootDir, input, options = {}) {
|
|
16248
18222
|
const compileRequested = options.compile ?? true;
|
|
16249
18223
|
const briefRequested = options.brief ?? true;
|
|
18224
|
+
const reviewRequested = options.review ?? false;
|
|
16250
18225
|
const sources = await loadManagedSources(rootDir);
|
|
16251
18226
|
const resolved = await resolveManagedSourceInput(rootDir, input);
|
|
16252
18227
|
const existing = sources.find((candidate) => matchesManagedSourceSpec(candidate, resolved));
|
|
16253
18228
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
16254
18229
|
const source = existing ?? {
|
|
16255
|
-
id: resolved.kind === "directory" ? stableManagedSourceId(
|
|
18230
|
+
id: resolved.kind === "directory" || resolved.kind === "file" ? stableManagedSourceId(resolved.kind, path25.resolve(resolved.path), resolved.title) : stableManagedSourceId(resolved.kind, resolved.url, resolved.title),
|
|
16256
18231
|
kind: resolved.kind,
|
|
16257
18232
|
title: resolved.title,
|
|
16258
|
-
path: resolved.kind === "directory" ? resolved.path : void 0,
|
|
18233
|
+
path: resolved.kind === "directory" || resolved.kind === "file" ? resolved.path : void 0,
|
|
16259
18234
|
repoRoot: resolved.kind === "directory" ? resolved.repoRoot : void 0,
|
|
16260
|
-
url: resolved.kind === "directory" ? void 0 : resolved.url,
|
|
18235
|
+
url: resolved.kind === "directory" || resolved.kind === "file" ? void 0 : resolved.url,
|
|
16261
18236
|
createdAt: now,
|
|
16262
18237
|
updatedAt: now,
|
|
16263
18238
|
status: "ready",
|
|
@@ -16286,15 +18261,18 @@ async function addManagedSource(rootDir, input, options = {}) {
|
|
|
16286
18261
|
};
|
|
16287
18262
|
const nextSources = existing ? sources.map((candidate) => candidate.id === nextSource.id ? nextSource : candidate) : [...sources, nextSource];
|
|
16288
18263
|
await saveManagedSources(rootDir, nextSources);
|
|
18264
|
+
const review = reviewRequested && nextSource.status === "ready" ? await stageSourceReviewForScope(rootDir, scopeFromManagedSource(nextSource)) : void 0;
|
|
16289
18265
|
return {
|
|
16290
18266
|
source: nextSource,
|
|
16291
18267
|
compile,
|
|
16292
|
-
briefGenerated
|
|
18268
|
+
briefGenerated,
|
|
18269
|
+
review
|
|
16293
18270
|
};
|
|
16294
18271
|
}
|
|
16295
18272
|
async function reloadManagedSources(rootDir, options = {}) {
|
|
16296
18273
|
const compileRequested = options.compile ?? true;
|
|
16297
18274
|
const briefRequested = options.brief ?? true;
|
|
18275
|
+
const reviewRequested = options.review ?? false;
|
|
16298
18276
|
const sources = await loadManagedSources(rootDir);
|
|
16299
18277
|
const selected = options.all || !options.id ? sources : sources.filter((source) => source.id === options.id);
|
|
16300
18278
|
if (!selected.length) {
|
|
@@ -16330,10 +18308,14 @@ async function reloadManagedSources(rootDir, options = {}) {
|
|
|
16330
18308
|
};
|
|
16331
18309
|
});
|
|
16332
18310
|
await saveManagedSources(rootDir, nextSources);
|
|
18311
|
+
const reviews = reviewRequested ? await Promise.all(
|
|
18312
|
+
nextSources.filter((source) => selected.some((candidate) => candidate.id === source.id)).filter((source) => source.status === "ready").map(async (source) => await stageSourceReviewForScope(rootDir, scopeFromManagedSource(source)))
|
|
18313
|
+
) : [];
|
|
16333
18314
|
return {
|
|
16334
18315
|
sources: nextSources.filter((source) => selected.some((candidate) => candidate.id === source.id)),
|
|
16335
18316
|
compile,
|
|
16336
|
-
briefPaths: [...briefPaths.values()]
|
|
18317
|
+
briefPaths: [...briefPaths.values()],
|
|
18318
|
+
reviews
|
|
16337
18319
|
};
|
|
16338
18320
|
}
|
|
16339
18321
|
async function deleteManagedSource(rootDir, id) {
|
|
@@ -17237,6 +19219,7 @@ export {
|
|
|
17237
19219
|
importInbox,
|
|
17238
19220
|
ingestDirectory,
|
|
17239
19221
|
ingestInput,
|
|
19222
|
+
ingestInputDetailed,
|
|
17240
19223
|
initVault,
|
|
17241
19224
|
initWorkspace,
|
|
17242
19225
|
installAgent,
|
|
@@ -17267,10 +19250,13 @@ export {
|
|
|
17267
19250
|
rejectApproval,
|
|
17268
19251
|
reloadManagedSources,
|
|
17269
19252
|
resolvePaths,
|
|
19253
|
+
reviewManagedSource,
|
|
19254
|
+
reviewSourceScope,
|
|
17270
19255
|
runSchedule,
|
|
17271
19256
|
runWatchCycle,
|
|
17272
19257
|
searchVault,
|
|
17273
19258
|
serveSchedules,
|
|
19259
|
+
stageGeneratedOutputPages,
|
|
17274
19260
|
startGraphServer,
|
|
17275
19261
|
startMcpServer,
|
|
17276
19262
|
syncTrackedRepos,
|