npm - @swarmvaultai/engine - Versions diffs - 0.2.2 → 0.3.0 - Mend

@swarmvaultai/engine 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -186,17 +186,21 @@ This matters because many "OpenAI-compatible" backends only implement part of th
 - `reloadManagedSources(rootDir, { id, all, compile, brief, maxPages, maxDepth })` re-syncs one managed source or the full registry
 - `deleteManagedSource(rootDir, id)` removes a managed-source registry entry and transient sync state without deleting canonical vault artifacts
 - `ingestInput(rootDir, input, { includeAssets, maxAssetSize })` ingests a local file path or URL
+- `ingestInputDetailed(rootDir, input, { includeAssets, maxAssetSize })` returns a summary envelope with `created`, `updated`, `unchanged`, and `removed` manifests when one input expands into multiple sources
 - `addInput(rootDir, input, { author, contributor })` captures supported URLs into normalized markdown before ingesting them, or falls back to generic URL ingest
 - `ingestDirectory(rootDir, inputDir, { repoRoot, include, exclude, maxFiles, gitignore, extractClasses })` recursively ingests a local directory as a repo-aware code/content source tree
 - `importInbox(rootDir, inputDir?)` recursively imports supported inbox files plus markdown and HTML browser-clipper style bundles
 - managed sources support local directories, public GitHub repo root URLs, and bounded same-domain docs hubs
 - registry data lives in `state/sources.json`, working state lives under `state/sources/<id>/`, and source briefs are written to `wiki/outputs/source-briefs/<id>.md`
+- EPUB inputs split into chapter-level manifests with shared group metadata so books stay navigable instead of becoming one giant source
+- CSV and TSV inputs produce bounded tabular summaries with delimiter-aware previews and compact column hints
+- XLSX inputs extract workbook-level and sheet-level previews, while PPTX inputs extract slide text plus speaker notes when present
 - JavaScript, JSX, TypeScript, TSX, Python, Go, Rust, Java, Kotlin, Scala, Lua, Zig, C#, C, C++, PHP, Ruby, and PowerShell inputs are treated as code sources and compiled into both source pages and `wiki/code/` module pages
 - `.rst` and `.rest` inputs are treated as first-class text sources with lightweight heading and directive normalization before analysis
 - code manifests can carry `repoRelativePath`, and compile writes `state/code-index.json` so local imports can resolve across an ingested repo tree
 - repo-aware manifests, graph nodes, and graph pages can also carry `sourceClass` so first-party, third-party, resource, and generated material can be filtered and reported separately
 - HTML and markdown URL ingests localize remote image references into `raw/assets/<sourceId>/` by default and rewrite the stored markdown to local relative paths
-- PDF and DOCX ingests now write extracted-text and metadata sidecars under `state/extracts/`, and image ingest keeps the same sidecar model for vision extraction
+- PDF, DOCX, EPUB, CSV/TSV, XLSX, and PPTX ingests write extracted-text and metadata sidecars under `state/extracts/`, and image ingest keeps the same sidecar model for vision extraction
 - Tree-sitter-backed languages now verify runtime and grammar compatibility per language; failures stay local to the affected source and surface as diagnostics instead of aborting the whole compile
 ### Compile + Query

package/dist/index.d.ts CHANGED Viewed

@@ -54,7 +54,7 @@ type PageStatus = "draft" | "candidate" | "active" | "archived";
 type PageManager = "system" | "human";
 type ApprovalEntryStatus = "pending" | "accepted" | "rejected";
 type ApprovalChangeType = "create" | "update" | "delete" | "promote";
-type SourceKind = "markdown" | "text" | "pdf" | "image" | "html" | "docx" | "binary" | "code";
+type SourceKind = "markdown" | "text" | "pdf" | "image" | "html" | "docx" | "epub" | "csv" | "xlsx" | "pptx" | "binary" | "code";
 type SourceCaptureType = "arxiv" | "doi" | "tweet" | "article" | "url";
 type SourceClass = "first_party" | "third_party" | "resource" | "generated";
 type ManagedSourceKind = "directory" | "github_repo" | "crawl_url";
@@ -231,7 +231,7 @@ interface SourceAttachment {
     mimeType: string;
     originalPath?: string;
 }
-type ExtractionKind = "plain_text" | "html_readability" | "pdf_text" | "docx_text" | "image_vision";
+type ExtractionKind = "plain_text" | "html_readability" | "pdf_text" | "docx_text" | "epub_text" | "csv_text" | "xlsx_text" | "pptx_text" | "image_vision";
 interface ExtractionTerm {
     name: string;
     description: string;
@@ -284,6 +284,15 @@ interface DirectoryIngestResult {
     updated: SourceManifest[];
     skipped: DirectoryIngestSkip[];
 }
+interface InputIngestResult {
+    input: string;
+    scannedCount: number;
+    created: SourceManifest[];
+    updated: SourceManifest[];
+    unchanged: SourceManifest[];
+    removed: SourceManifest[];
+    skipped: DirectoryIngestSkip[];
+}
 interface SourceManifest {
     sourceId: string;
     title: string;
@@ -302,6 +311,13 @@ interface SourceManifest {
     mimeType: string;
     contentHash: string;
     semanticHash: string;
+    sourceGroupId?: string;
+    sourceGroupTitle?: string;
+    sourcePartKey?: string;
+    partIndex?: number;
+    partCount?: number;
+    partTitle?: string;
+    details?: Record<string, string>;
     createdAt: string;
     updatedAt: string;
     attachments?: SourceAttachment[];
@@ -1206,6 +1222,7 @@ declare function uninstallGitHooks(rootDir: string): Promise<GitHookStatus>;
 declare function listTrackedRepoRoots(rootDir: string): Promise<string[]>;
 declare function syncTrackedRepos(rootDir: string, options?: IngestOptions, repoRoots?: string[]): Promise<RepoSyncResult>;
 declare function syncTrackedReposForWatch(rootDir: string, options?: IngestOptions, repoRoots?: string[]): Promise<WatchRepoSyncResult>;
+declare function ingestInputDetailed(rootDir: string, input: string, options?: IngestOptions): Promise<InputIngestResult>;
 declare function ingestInput(rootDir: string, input: string, options?: IngestOptions): Promise<SourceManifest>;
 declare function addInput(rootDir: string, input: string, options?: AddOptions): Promise<AddResult>;
 declare function ingestDirectory(rootDir: string, inputDir: string, options?: IngestOptions): Promise<DirectoryIngestResult>;
@@ -1327,4 +1344,4 @@ declare function getWatchStatus(rootDir: string): Promise<WatchStatusResult>;
 declare function createWebSearchAdapter(id: string, config: WebSearchProviderConfig, rootDir: string): Promise<WebSearchAdapter>;
 declare function getWebSearchAdapterForTask(rootDir: string, task: "deepLintProvider"): Promise<WebSearchAdapter>;
-export { type AddOptions, type AddResult, type AgentType, type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type BenchmarkArtifact, type BenchmarkOptions, type BenchmarkQuestionResult, type BenchmarkSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeIndexArtifact, type CodeIndexEntry, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type DirectoryIngestResult, type DirectoryIngestSkip, type EmbeddingCacheArtifact, type EmbeddingCacheEntry, type EvidenceClass, type ExploreOptions, type ExploreResult, type ExploreStepResult, type ExtractionClaim, type ExtractionKind, type ExtractionTerm, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GitHookStatus, type GraphArtifact, type GraphEdge, type GraphExplainNeighbor, type GraphExplainResult, type GraphExportFormat, type GraphExportResult, type GraphHyperedge, type GraphNode, type GraphPage, type GraphPathResult, type GraphPushCounts, type GraphPushNeo4jOptions, type GraphPushResult, type GraphQueryMatch, type GraphQueryResult, type GraphReportArtifact, type ImageGenerationRequest, type ImageGenerationResponse, type ImageVisionExtraction, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type InstallAgentOptions, type InstallAgentResult, type LintFinding, type LintOptions, type ManagedSourceAddOptions, type ManagedSourceAddResult, type ManagedSourceDeleteResult, type ManagedSourceKind, type ManagedSourceRecord, type ManagedSourceReloadOptions, type ManagedSourceReloadResult, type ManagedSourceStatus, type ManagedSourceSyncCounts, type ManagedSourcesArtifact, type Neo4jGraphSinkConfig, type OrchestrationConfig, type 
OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type PendingSemanticRefreshEntry, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type RepoSyncResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceCaptureType, type SourceClaim, type SourceClass, type SourceExtractionArtifact, type SourceKind, type SourceManifest, type SourceRationale, type VaultConfig, type WatchController, type WatchOptions, type WatchRepoSyncResult, type WatchRunRecord, type WatchStatusResult, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, addInput, addManagedSource, agentTypeSchema, archiveCandidate, assertProviderCapability, benchmarkVault, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, deleteManagedSource, explainGraphVault, exploreVault, exportGraphFormat, exportGraphHtml, getGitHookStatus, getProviderForTask, getWatchStatus, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestDirectory, ingestInput, initVault, initWorkspace, installAgent, installConfiguredAgents, installGitHooks, lintVault, listApprovals, listCandidates, listGodNodes, listGraphHyperedges, listManagedSourceRecords, listManifests, listPages, 
listSchedules, listTrackedRepoRoots, loadVaultConfig, loadVaultSchema, loadVaultSchemas, pathGraphVault, promoteCandidate, providerCapabilitySchema, providerTypeSchema, pushGraphNeo4j, queryGraphVault, queryVault, readApproval, readExtractedText, readGraphReport, readPage, rejectApproval, reloadManagedSources, resolvePaths, runSchedule, runWatchCycle, searchVault, serveSchedules, startGraphServer, startMcpServer, syncTrackedRepos, syncTrackedReposForWatch, uninstallGitHooks, watchVault, webSearchProviderTypeSchema };
+export { type AddOptions, type AddResult, type AgentType, type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type BenchmarkArtifact, type BenchmarkOptions, type BenchmarkQuestionResult, type BenchmarkSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeIndexArtifact, type CodeIndexEntry, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type DirectoryIngestResult, type DirectoryIngestSkip, type EmbeddingCacheArtifact, type EmbeddingCacheEntry, type EvidenceClass, type ExploreOptions, type ExploreResult, type ExploreStepResult, type ExtractionClaim, type ExtractionKind, type ExtractionTerm, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GitHookStatus, type GraphArtifact, type GraphEdge, type GraphExplainNeighbor, type GraphExplainResult, type GraphExportFormat, type GraphExportResult, type GraphHyperedge, type GraphNode, type GraphPage, type GraphPathResult, type GraphPushCounts, type GraphPushNeo4jOptions, type GraphPushResult, type GraphQueryMatch, type GraphQueryResult, type GraphReportArtifact, type ImageGenerationRequest, type ImageGenerationResponse, type ImageVisionExtraction, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type InputIngestResult, type InstallAgentOptions, type InstallAgentResult, type LintFinding, type LintOptions, type ManagedSourceAddOptions, type ManagedSourceAddResult, type ManagedSourceDeleteResult, type ManagedSourceKind, type ManagedSourceRecord, type ManagedSourceReloadOptions, type ManagedSourceReloadResult, type ManagedSourceStatus, type ManagedSourceSyncCounts, type ManagedSourcesArtifact, type Neo4jGraphSinkConfig, type 
OrchestrationConfig, type OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type PendingSemanticRefreshEntry, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type RepoSyncResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceCaptureType, type SourceClaim, type SourceClass, type SourceExtractionArtifact, type SourceKind, type SourceManifest, type SourceRationale, type VaultConfig, type WatchController, type WatchOptions, type WatchRepoSyncResult, type WatchRunRecord, type WatchStatusResult, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, addInput, addManagedSource, agentTypeSchema, archiveCandidate, assertProviderCapability, benchmarkVault, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, deleteManagedSource, explainGraphVault, exploreVault, exportGraphFormat, exportGraphHtml, getGitHookStatus, getProviderForTask, getWatchStatus, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestDirectory, ingestInput, ingestInputDetailed, initVault, initWorkspace, installAgent, installConfiguredAgents, installGitHooks, lintVault, listApprovals, listCandidates, listGodNodes, listGraphHyperedges, 
listManagedSourceRecords, listManifests, listPages, listSchedules, listTrackedRepoRoots, loadVaultConfig, loadVaultSchema, loadVaultSchemas, pathGraphVault, promoteCandidate, providerCapabilitySchema, providerTypeSchema, pushGraphNeo4j, queryGraphVault, queryVault, readApproval, readExtractedText, readGraphReport, readPage, rejectApproval, reloadManagedSources, resolvePaths, runSchedule, runWatchCycle, searchVault, serveSchedules, startGraphServer, startMcpServer, syncTrackedRepos, syncTrackedReposForWatch, uninstallGitHooks, watchVault, webSearchProviderTypeSchema };

package/dist/index.js CHANGED Viewed

@@ -1729,7 +1729,7 @@ import matter3 from "gray-matter";
 import ignore from "ignore";
 import { JSDOM as JSDOM2 } from "jsdom";
 import mime from "mime-types";
-import TurndownService from "turndown";
+import TurndownService2 from "turndown";
 // src/code-analysis.ts
 import fs6 from "fs/promises";
@@ -4504,8 +4504,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
 import fs7 from "fs/promises";
 import os from "os";
 import path7 from "path";
+import { parse as parseCsvSync } from "csv-parse/sync";
 import { strFromU8, unzipSync } from "fflate";
 import { JSDOM } from "jsdom";
+import TurndownService from "turndown";
 import { z } from "zod";
 var imageVisionExtractionSchema = z.object({
   title: z.string().min(1).nullable().optional(),
@@ -4685,7 +4687,7 @@ function normalizePdfMetadata(raw) {
 function normalizeDocumentText(raw) {
   return raw.replace(/\r\n/g, "\n").split(/\n{2,}/).map((section) => normalizeWhitespace(section)).filter(Boolean).join("\n\n").trim();
 }
-function parseDocxCoreMetadata(bytes) {
+function parseOfficeCoreMetadata(bytes) {
   try {
     const archive = unzipSync(new Uint8Array(bytes));
     const coreXml = archive["docProps/core.xml"];
@@ -4725,6 +4727,122 @@ function parseDocxCoreMetadata(bytes) {
     return void 0;
   }
 }
+function decodeTextBytes(bytes) {
+  const text = bytes.toString("utf8");
+  return text.charCodeAt(0) === 65279 ? text.slice(1) : text;
+}
+function normalizeTableCell(value) {
+  return normalizeWhitespace(String(value ?? ""));
+}
+function isNumericCell(value) {
+  return value.length > 0 && Number.isFinite(Number(value));
+}
+function detectHeaderRow(rows) {
+  if (!rows.length) {
+    return { headers: [], bodyRows: [] };
+  }
+  const firstRow = rows[0] ?? [];
+  const nonEmpty = firstRow.filter(Boolean);
+  const unique = new Set(nonEmpty);
+  const nonNumeric = nonEmpty.filter((value) => !isNumericCell(value));
+  const looksLikeHeader = nonEmpty.length > 0 && unique.size === nonEmpty.length && nonNumeric.length >= Math.ceil(nonEmpty.length / 2) && rows.length > 1;
+  if (looksLikeHeader) {
+    return {
+      headers: firstRow.map((value, index) => value || `column_${index + 1}`),
+      bodyRows: rows.slice(1)
+    };
+  }
+  const columnCount = Math.max(...rows.map((row) => row.length), 0);
+  return {
+    headers: Array.from({ length: columnCount }, (_, index) => `column_${index + 1}`),
+    bodyRows: rows
+  };
+}
+function columnHints(headers, rows) {
+  return headers.map((header, index) => {
+    const values = rows.map((row) => row[index] ?? "").map(normalizeTableCell).filter(Boolean);
+    if (!values.length) {
+      return null;
+    }
+    const uniqueValues = [...new Set(values)];
+    if (values.every(isNumericCell)) {
+      return `- ${header}: numeric`;
+    }
+    if (uniqueValues.length <= 6 && values.length >= uniqueValues.length) {
+      return `- ${header}: low-cardinality (${uniqueValues.slice(0, 6).join(", ")})`;
+    }
+    return null;
+  }).filter((item) => Boolean(item));
+}
+function markdownTable(headers, rows, rowLimit = 20) {
+  if (!headers.length) {
+    return ["No tabular preview available."];
+  }
+  const width = headers.length;
+  const lines = [`| ${headers.join(" | ")} |`, `| ${headers.map(() => "---").join(" | ")} |`];
+  for (const row of rows.slice(0, rowLimit)) {
+    const normalized = Array.from({ length: width }, (_, index) => normalizeTableCell(row[index] ?? ""));
+    lines.push(`| ${normalized.join(" | ")} |`);
+  }
+  return lines;
+}
+function zipEntryText(archive, entryPath) {
+  const entry = archive[entryPath];
+  return entry ? strFromU8(entry) : void 0;
+}
+function parseXmlDocument(xml) {
+  return new JSDOM(xml, { contentType: "text/xml" }).window.document;
+}
+function zipDirname(value) {
+  const index = value.lastIndexOf("/");
+  return index === -1 ? "" : value.slice(0, index);
+}
+function resolveZipTarget(basePath, target) {
+  return path7.posix.normalize(path7.posix.join(zipDirname(basePath), target));
+}
+function relationshipTargets(xml, basePath) {
+  const document = parseXmlDocument(xml);
+  const map = /* @__PURE__ */ new Map();
+  for (const node of Array.from(document.getElementsByTagName("*"))) {
+    if (node.localName !== "Relationship") {
+      continue;
+    }
+    const id = node.getAttribute("Id")?.trim();
+    const target = node.getAttribute("Target")?.trim();
+    const type = node.getAttribute("Type")?.trim() ?? "";
+    if (!id || !target) {
+      continue;
+    }
+    map.set(id, { target: resolveZipTarget(basePath, target), type });
+  }
+  return map;
+}
+function xmlTextNodes(xml, localName) {
+  const document = parseXmlDocument(xml);
+  const values = [];
+  for (const node of Array.from(document.getElementsByTagName("*"))) {
+    if (node.localName !== localName) {
+      continue;
+    }
+    const text = normalizeWhitespace(node.textContent ?? "");
+    if (text) {
+      values.push(text);
+    }
+  }
+  return values;
+}
+function firstHtmlHeading(html) {
+  const dom = new JSDOM(html);
+  const heading = dom.window.document.querySelector("h1, h2, h3");
+  const title = normalizeWhitespace(heading?.textContent ?? "");
+  return title || void 0;
+}
+function htmlToMarkdown(html) {
+  const dom = new JSDOM(html);
+  const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
+  const body = dom.window.document.body?.innerHTML ?? html;
+  return turndown.turndown(body).trim();
+}
 async function extractPdfText(input) {
   try {
     const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
@@ -4782,7 +4900,7 @@ async function extractDocxText(input) {
     const warnings = result.messages.map((message) => normalizeWhitespace(message.message)).filter(Boolean).map((message) => truncate(message, 240));
     const artifact = {
       ...extractionMetadata("docx", input.mimeType, "docx_text"),
-      metadata: parseDocxCoreMetadata(input.bytes),
+      metadata: parseOfficeCoreMetadata(input.bytes),
       warnings: warnings.length ? warnings : void 0
     };
     if (!extractedText) {
@@ -4801,6 +4919,258 @@ async function extractDocxText(input) {
     };
   }
 }
+async function extractCsvText(input) {
+  try {
+    const rawText = decodeTextBytes(input.bytes);
+    const delimiter = input.fileName?.toLowerCase().endsWith(".tsv") || input.mimeType.includes("tab-separated") ? "	" : ",";
+    const parsed = parseCsvSync(rawText, {
+      delimiter,
+      relax_column_count: true,
+      skip_empty_lines: true,
+      trim: true
+    });
+    const rows = parsed.map((row) => row.map((value) => normalizeTableCell(value)));
+    const { headers, bodyRows } = detectHeaderRow(rows);
+    const hintLines = columnHints(headers, bodyRows);
+    const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
+    const extractedText = [
+      title ? `# ${title}` : null,
+      `Format: ${delimiter === "	" ? "TSV" : "CSV"}`,
+      `Rows: ${bodyRows.length}`,
+      `Columns: ${headers.length}`,
+      headers.length ? `Headers: ${headers.join(", ")}` : null,
+      "",
+      hintLines.length ? "## Column Hints" : null,
+      hintLines.length ? hintLines.join("\n") : null,
+      hintLines.length ? "" : null,
+      "## Preview",
+      ...markdownTable(headers, bodyRows)
+    ].filter((item) => Boolean(item)).join("\n").trim();
+    const artifact = {
+      ...extractionMetadata("csv", input.mimeType, "csv_text"),
+      metadata: {
+        format: delimiter === "	" ? "tsv" : "csv",
+        row_count: String(bodyRows.length),
+        column_count: String(headers.length),
+        headers: headers.join(", ")
+      }
+    };
+    return {
+      title,
+      extractedText,
+      artifact
+    };
+  } catch (error) {
+    return {
+      artifact: {
+        ...extractionMetadata("csv", input.mimeType, "csv_text"),
+        warnings: [`CSV extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
+      }
+    };
+  }
+}
+async function extractXlsxText(input) {
+  try {
+    const XLSX = await import("xlsx");
+    const workbook = XLSX.read(input.bytes, { type: "buffer", cellFormula: false, cellHTML: false, cellStyles: false });
+    const allSheetNames = workbook.SheetNames;
+    const sheetNames = allSheetNames.slice(0, 10);
+    const sheetSections = [];
+    const metadata = {
+      sheet_count: String(allSheetNames.length),
+      sheet_names: allSheetNames.join(", ")
+    };
+    for (const sheetName of sheetNames) {
+      const sheet = workbook.Sheets[sheetName];
+      if (!sheet) {
+        continue;
+      }
+      const rows = XLSX.utils.sheet_to_json(sheet, {
+        header: 1,
+        raw: false,
+        defval: ""
+      }).map((row) => row.map((value) => normalizeTableCell(value)));
+      const { headers, bodyRows } = detectHeaderRow(rows);
+      sheetSections.push(`## Sheet: ${sheetName}`);
+      sheetSections.push(`Rows: ${bodyRows.length}`);
+      sheetSections.push(`Columns: ${headers.length}`);
+      sheetSections.push(...markdownTable(headers, bodyRows));
+      sheetSections.push("");
+    }
+    const title = normalizeWhitespace(String(workbook.Props?.Title ?? "")) || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
+    const extractedText = [
+      title ? `# ${title}` : null,
+      `Sheets: ${allSheetNames.length}`,
+      allSheetNames.length ? `Sheet Names: ${allSheetNames.join(", ")}` : null,
+      "",
+      ...sheetSections
+    ].filter((item) => Boolean(item)).join("\n").trim();
+    const warnings = allSheetNames.length > sheetNames.length ? ["Workbook preview truncated to the first 10 sheets."] : void 0;
+    return {
+      title,
+      extractedText,
+      artifact: {
+        ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
+        metadata,
+        warnings
+      }
+    };
+  } catch (error) {
+    return {
+      artifact: {
+        ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
+        warnings: [`XLSX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
+      }
+    };
+  }
+}
+async function extractPptxText(input) {
+  try {
+    const archive = unzipSync(new Uint8Array(input.bytes));
+    const presentationXml = zipEntryText(archive, "ppt/presentation.xml");
+    if (!presentationXml) {
+      throw new Error("Missing ppt/presentation.xml");
+    }
+    const relsXml = zipEntryText(archive, "ppt/_rels/presentation.xml.rels");
+    if (!relsXml) {
+      throw new Error("Missing ppt/_rels/presentation.xml.rels");
+    }
+    const rels = relationshipTargets(relsXml, "ppt/presentation.xml");
+    const document = parseXmlDocument(presentationXml);
+    const slideTargets = Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").map((node) => node.getAttribute("r:id")?.trim()).filter((value) => Boolean(value)).map((relationshipId) => rels.get(relationshipId)?.target).filter((value) => Boolean(value)).slice(0, 60);
+    const slideSections = [];
+    for (let index = 0; index < slideTargets.length; index += 1) {
+      const slidePath = slideTargets[index];
+      const slideXml = zipEntryText(archive, slidePath);
+      if (!slideXml) {
+        continue;
+      }
+      const slideTexts = xmlTextNodes(slideXml, "t");
+      const slideTitle = slideTexts[0] ?? `Slide ${index + 1}`;
+      slideSections.push(`## Slide ${index + 1}: ${slideTitle}`);
+      if (slideTexts.length) {
+        slideSections.push(slideTexts.join("\n"));
+      }
+      const slideRelsPath = `${zipDirname(slidePath)}/_rels/${path7.posix.basename(slidePath)}.rels`;
+      const slideRelsXml = zipEntryText(archive, slideRelsPath);
+      if (slideRelsXml) {
+        const slideRels = relationshipTargets(slideRelsXml, slidePath);
+        const notesTarget = [...slideRels.values()].find((entry) => entry.type.endsWith("/notesSlide"))?.target;
+        if (notesTarget) {
+          const notesXml = zipEntryText(archive, notesTarget);
+          const noteTexts = notesXml ? xmlTextNodes(notesXml, "t") : [];
+          if (noteTexts.length) {
+            slideSections.push("Notes:");
+            slideSections.push(noteTexts.join("\n"));
+          }
+        }
+      }
+      slideSections.push("");
+    }
+    const metadata = parseOfficeCoreMetadata(input.bytes);
+    const title = metadata?.title || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
+    const extractedText = [title ? `# ${title}` : null, `Slides: ${slideTargets.length}`, "", ...slideSections].filter((item) => Boolean(item)).join("\n").trim();
+    return {
+      title,
+      extractedText,
+      artifact: {
+        ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
+        metadata: {
+          ...metadata ?? {},
+          slide_count: String(slideTargets.length)
+        },
+        warnings: Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").length > slideTargets.length ? ["Slide extraction truncated to the first 60 slides."] : void 0
+      }
+    };
+  } catch (error) {
+    return {
+      artifact: {
+        ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
+        warnings: [`PPTX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
+      }
+    };
+  }
+}
+async function extractEpubChapters(input) {
+  try {
+    const archive = unzipSync(new Uint8Array(input.bytes));
+    const containerXml = zipEntryText(archive, "META-INF/container.xml");
+    if (!containerXml) {
+      throw new Error("Missing META-INF/container.xml");
+    }
+    const container = parseXmlDocument(containerXml);
+    const rootfile = Array.from(container.getElementsByTagName("*")).find((node) => node.localName === "rootfile");
+    const packagePath = rootfile?.getAttribute("full-path")?.trim();
+    if (!packagePath) {
+      throw new Error("EPUB container did not declare a package document.");
+    }
+    const packageXml = zipEntryText(archive, packagePath);
+    if (!packageXml) {
+      throw new Error(`Missing EPUB package document: ${packagePath}`);
+    }
+    const packageDocument = parseXmlDocument(packageXml);
+    const manifestEntries = new Map(
+      Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "item").map(
+        (node) => [
+          node.getAttribute("id")?.trim() ?? "",
+          {
+            href: node.getAttribute("href")?.trim() ?? "",
+            mediaType: node.getAttribute("media-type")?.trim() ?? "",
+            properties: node.getAttribute("properties")?.trim() ?? ""
+          }
+        ]
+      ).filter(([id, item]) => Boolean(id && item.href))
+    );
+    const spineIds = Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "itemref").map((node) => node.getAttribute("idref")?.trim()).filter((value) => Boolean(value));
+    const bookTitle = xmlTextNodes(packageXml, "title")[0] || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
+    const author = xmlTextNodes(packageXml, "creator")[0];
+    const chapters = [];
+    for (const spineId of spineIds) {
+      const item = manifestEntries.get(spineId);
+      if (!item || !item.mediaType.includes("html") && !item.mediaType.includes("xhtml")) {
+        continue;
+      }
+      if (item.properties.split(/\s+/).includes("nav")) {
+        continue;
+      }
+      const entryPath = resolveZipTarget(packagePath, item.href);
+      const html = zipEntryText(archive, entryPath);
+      if (!html) {
+        continue;
+      }
+      const markdown = htmlToMarkdown(html);
+      if (!markdown) {
+        continue;
+      }
+      const chapterTitle = firstHtmlHeading(html) || markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() || item.href;
+      const normalizedTitle = normalizeWhitespace(chapterTitle);
+      if (!normalizedTitle || /^table of contents$/i.test(normalizedTitle)) {
+        continue;
+      }
+      chapters.push({
+        partKey: item.href,
+        title: normalizedTitle,
+        markdown,
+        metadata: {
+          book_title: bookTitle ?? "",
+          chapter_title: normalizedTitle,
+          author: author ?? ""
+        }
+      });
+    }
+    return {
+      title: bookTitle,
+      author,
+      chapters,
+      warnings: chapters.length ? void 0 : ["EPUB extraction completed but found no chapter-like spine entries."]
+    };
+  } catch (error) {
+    return {
+      chapters: [],
+      warnings: [`EPUB extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
+    };
+  }
+}
 // src/logs.ts
 import fs8 from "fs/promises";
@@ -5236,15 +5606,27 @@ function inferKind(mimeType, filePath) {
   if (mimeType.includes("html")) {
     return "html";
   }
-  if (mimeType.startsWith("text/")) {
-    return "text";
-  }
   if (mimeType === "application/pdf" || filePath.toLowerCase().endsWith(".pdf")) {
     return "pdf";
   }
   if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || filePath.toLowerCase().endsWith(".docx")) {
     return "docx";
   }
+  if (mimeType === "application/epub+zip" || filePath.toLowerCase().endsWith(".epub")) {
+    return "epub";
+  }
+  if (mimeType === "text/csv" || mimeType === "text/tab-separated-values" || filePath.toLowerCase().endsWith(".csv") || filePath.toLowerCase().endsWith(".tsv")) {
+    return "csv";
+  }
+  if (mimeType.startsWith("text/")) {
+    return "text";
+  }
+  if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || filePath.toLowerCase().endsWith(".xlsx")) {
+    return "xlsx";
+  }
+  if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || filePath.toLowerCase().endsWith(".pptx")) {
+    return "pptx";
+  }
   if (mimeType.startsWith("image/")) {
     return "image";
   }
@@ -5270,6 +5652,10 @@ function guessMimeType(target) {
   }
   return mime.lookup(target) || "application/octet-stream";
 }
+function sourceGroupIdFor(prepared) { // Builds a group id of the form "<slugify(title)>-<first 8 chars of sha256(origin key)>".
+  const originKey = prepared.originType === "url" ? prepared.url ?? prepared.title : prepared.originalPath ?? prepared.title; // URL for url origins, original path otherwise; the title is the fallback when that field is missing
+  return `${slugify(prepared.title)}-${sha256(originKey).slice(0, 8)}`;
+}
 function rstAdornmentLine(line) {
   const trimmed = line.trim();
   if (trimmed.length < 3) {
@@ -5844,6 +6230,9 @@ function manifestMatchesOrigin(manifest, prepared) {
   }
   return Boolean(prepared.originalPath && manifest.originalPath && toPosix(manifest.originalPath) === toPosix(prepared.originalPath));
 }
+function manifestMatchesOriginPart(manifest, prepared) { // Same origin AND same sourcePartKey; a missing key on either side is compared as "".
+  return manifestMatchesOrigin(manifest, prepared) && (manifest.sourcePartKey ?? "") === (prepared.sourcePartKey ?? "");
+}
 function buildCompositeHash(payloadBytes, attachments = []) {
   if (!attachments.length) {
     return sha256(payloadBytes);
@@ -5941,7 +6330,7 @@ function extractMarkdownImageReferences(content, baseUrl) {
 async function convertHtmlToMarkdown(html, url) {
   const dom = new JSDOM2(html, { url });
   const article = new Readability(dom.window.document).parse();
-  const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
+  const turndown = new TurndownService2({ headingStyle: "atx", codeBlockStyle: "fenced" });
   const body = article?.content ?? dom.window.document.body.innerHTML;
   const markdown = turndown.turndown(body);
   return {
@@ -5965,21 +6354,26 @@ async function readManifestByHash(manifestsDir, contentHash) {
   }
   return null;
 }
-async function readManifestByOrigin(manifestsDir, prepared) {
+async function readManifestsByOrigin(manifestsDir, prepared) { // Returns every manifest JSON in manifestsDir whose origin matches `prepared` (possibly an empty array).
   const entries = await fs11.readdir(manifestsDir, { withFileTypes: true }).catch(() => []); // a failed readdir is treated as an empty directory
+  const manifests = [];
   for (const entry of entries) {
     if (!entry.isFile() || !entry.name.endsWith(".json")) {
       continue;
     }
     const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
     if (manifest && manifestMatchesOrigin(manifest, prepared)) {
-      return {
+      manifests.push({
         ...manifest,
         semanticHash: manifest.semanticHash ?? manifest.contentHash // manifests lacking semanticHash fall back to contentHash
-      };
+      });
     }
   }
-  return null;
+  return manifests;
+}
+async function readManifestByOrigin(manifestsDir, prepared) { // Returns the origin match whose sourcePartKey also matches `prepared`, or null.
+  const manifests = await readManifestsByOrigin(manifestsDir, prepared);
+  return manifests.find((manifest) => manifestMatchesOriginPart(manifest, prepared)) ?? null;
 }
 async function loadGitignoreMatcher(repoRoot, enabled) {
   if (!enabled) {
@@ -6228,8 +6622,8 @@ async function persistPreparedInput(rootDir, prepared, paths) {
   const semanticHash = prepared.semanticHash ?? contentHash;
   const extractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
   const existingByOrigin = await readManifestByOrigin(paths.manifestsDir, prepared);
-  const existingByHash = existingByOrigin ? null : await readManifestByHash(paths.manifestsDir, contentHash);
-  if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath) {
+  const existingByHash = existingByOrigin || prepared.sourcePartKey ? null : await readManifestByHash(paths.manifestsDir, contentHash);
+  if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath && existingByOrigin.sourceGroupId === prepared.sourceGroupId && existingByOrigin.sourceGroupTitle === prepared.sourceGroupTitle && existingByOrigin.sourcePartKey === prepared.sourcePartKey && existingByOrigin.partIndex === prepared.partIndex && existingByOrigin.partCount === prepared.partCount && existingByOrigin.partTitle === prepared.partTitle && JSON.stringify(existingByOrigin.details ?? {}) === JSON.stringify(prepared.details ?? {})) {
     return { manifest: existingByOrigin, isNew: false, wasUpdated: false };
   }
   if (existingByHash) {
@@ -6288,6 +6682,13 @@ async function persistPreparedInput(rootDir, prepared, paths) {
     mimeType: prepared.mimeType,
     contentHash,
     semanticHash,
+    sourceGroupId: prepared.sourceGroupId,
+    sourceGroupTitle: prepared.sourceGroupTitle,
+    sourcePartKey: prepared.sourcePartKey,
+    partIndex: prepared.partIndex,
+    partCount: prepared.partCount,
+    partTitle: prepared.partTitle,
+    details: prepared.details,
     createdAt: previous?.createdAt ?? now,
     updatedAt: now,
     attachments: manifestAttachments.length ? manifestAttachments : void 0
@@ -6309,6 +6710,42 @@ async function persistPreparedInput(rootDir, prepared, paths) {
   }
   return { manifest, isNew: !previous, wasUpdated: Boolean(previous) };
 }
+async function persistPreparedInputs(rootDir, input, preparedInputs, paths) { // Persists each prepared input in order and returns a summary of created/updated/unchanged/removed manifests.
+  const template = preparedInputs[0]; // the first entry supplies the origin used to look up existing manifests
+  const existingByOrigin = template ? await readManifestsByOrigin(paths.manifestsDir, template) : []; // read before anything is persisted; empty when there are no prepared inputs
+  const created = [];
+  const updated = [];
+  const unchanged = [];
+  const removed = [];
+  const seenSourceIds = /* @__PURE__ */ new Set(); // sourceIds of manifests returned by persistPreparedInput during this call
+  for (const prepared of preparedInputs) {
+    const result = await persistPreparedInput(rootDir, prepared, paths);
+    if (result.isNew) {
+      created.push(result.manifest);
+    } else if (result.wasUpdated) {
+      updated.push(result.manifest);
+    } else {
+      unchanged.push(result.manifest);
+    }
+    seenSourceIds.add(result.manifest.sourceId);
+  }
+  for (const manifest of existingByOrigin) { // anything stored for this origin that the current run did not return is removed
+    if (seenSourceIds.has(manifest.sourceId)) {
+      continue;
+    }
+    await removeManifestArtifacts(rootDir, manifest, paths);
+    removed.push(manifest);
+  }
+  return {
+    input,
+    scannedCount: preparedInputs.length,
+    created,
+    updated,
+    unchanged,
+    removed,
+    skipped: [] // always empty in this function
+  };
+}
 async function removeManifestArtifacts(rootDir, manifest, paths) {
   await fs11.rm(path12.join(paths.manifestsDir, `${manifest.sourceId}.json`), { force: true });
   await fs11.rm(path12.resolve(rootDir, manifest.storedPath), { force: true });
@@ -6335,10 +6772,10 @@ function repoSyncWorkspaceIgnorePaths(rootDir, paths, repoRoot) {
   return candidates.map((candidate) => path12.resolve(candidate)).filter((candidate, index, items) => items.indexOf(candidate) === index).filter((candidate) => withinRoot(repoRoot, candidate));
 }
 function preparedMatchesManifest(manifest, prepared, contentHash) { // True only when every compared manifest field equals the corresponding value of the prepared input.
-  return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath;
+  return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.semanticHash === (prepared.semanticHash ?? contentHash) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath && manifest.sourceGroupId === prepared.sourceGroupId && manifest.sourceGroupTitle === prepared.sourceGroupTitle && manifest.sourcePartKey === prepared.sourcePartKey && manifest.partIndex === prepared.partIndex && manifest.partCount === prepared.partCount && manifest.partTitle === prepared.partTitle && JSON.stringify(manifest.details ?? {}) === JSON.stringify(prepared.details ?? {}); // details are compared as JSON.stringify output, so key order matters
 }
 function shouldDeferWatchSemanticRefresh(sourceKind) { // True when sourceKind is one of the kinds enumerated in the return expression.
-  return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "image";
+  return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "epub" || sourceKind === "csv" || sourceKind === "xlsx" || sourceKind === "pptx" || sourceKind === "image"; // epub, csv, xlsx and pptx are the kinds this line adds
 }
 function pendingSemanticRefreshId(changeType, repoRoot, relativePath) {
   return `pending:${changeType}:${sha256(`${toPosix(repoRoot)}:${relativePath}`).slice(0, 12)}`;
@@ -6404,13 +6841,16 @@ async function syncTrackedRepos(rootDir, options, repoRoots) {
     const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
     for (const absolutePath of files) {
       const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
-      const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
-      const result = await persistPreparedInput(rootDir, prepared, paths);
-      if (result.isNew) {
-        imported.push(result.manifest);
-      } else if (result.wasUpdated) {
-        updated.push(result.manifest);
-      }
+      const preparedInputs = await prepareFileInputs(
+        rootDir,
+        absolutePath,
+        repoRoot,
+        sourceClassForRelativePath(relativePath, normalizedOptions)
+      );
+      const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
+      imported.push(...result.created);
+      updated.push(...result.updated);
+      removed.push(...result.removed);
       progress.tick();
     }
     progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
@@ -6469,9 +6909,6 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
   let scannedCount = 0;
   for (const repoRoot of uniqueRoots) {
     const repoManifests = manifestsByRepoRoot.get(repoRoot) ?? [];
-    const manifestsByOriginalPath = new Map(
-      repoManifests.filter((manifest) => manifest.originalPath).map((manifest) => [path12.resolve(manifest.originalPath), manifest])
-    );
     if (!await fileExists(repoRoot)) {
       for (const manifest of repoManifests) {
         if (shouldDeferWatchSemanticRefresh(manifest.sourceKind)) {
@@ -6507,38 +6944,50 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
     const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
     for (const absolutePath of files) {
       const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
-      const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
-      if (shouldDeferWatchSemanticRefresh(prepared.sourceKind)) {
-        const existing = manifestsByOriginalPath.get(path12.resolve(absolutePath));
-        const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
-        const changed = !existing || !preparedMatchesManifest(existing, prepared, contentHash);
+      const preparedInputs = await prepareFileInputs(
+        rootDir,
+        absolutePath,
+        repoRoot,
+        sourceClassForRelativePath(relativePath, normalizedOptions)
+      );
+      const firstPrepared = preparedInputs[0];
+      if (firstPrepared && shouldDeferWatchSemanticRefresh(firstPrepared.sourceKind)) {
+        const existing = repoManifests.filter(
+          (manifest) => manifest.originalPath && path12.resolve(manifest.originalPath) === path12.resolve(absolutePath)
+        );
+        const existingByPartKey = new Map(existing.map((manifest) => [manifest.sourcePartKey ?? "__single__", manifest]));
+        const changed = existing.length !== preparedInputs.length || preparedInputs.some((prepared) => {
+          const match = existingByPartKey.get(prepared.sourcePartKey ?? "__single__");
+          const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
+          return !match || !preparedMatchesManifest(match, prepared, contentHash);
+        }) || existing.some(
+          (manifest) => !preparedInputs.some((prepared) => (prepared.sourcePartKey ?? "") === (manifest.sourcePartKey ?? ""))
+        );
         if (changed) {
           pendingSemanticRefresh.push({
             id: pendingSemanticRefreshId(
-              existing ? "modified" : "added",
+              existing.length ? "modified" : "added",
               repoRoot,
-              prepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
+              firstPrepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
             ),
             repoRoot,
             path: toPosix(path12.relative(rootDir, absolutePath)),
-            changeType: existing ? "modified" : "added",
+            changeType: existing.length ? "modified" : "added",
             detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
-            sourceId: existing?.sourceId,
-            sourceKind: prepared.sourceKind
+            sourceId: existing[0]?.sourceId,
+            sourceKind: firstPrepared.sourceKind
           });
-          if (existing?.sourceId) {
-            staleSourceIds.add(existing.sourceId);
+          for (const manifest of existing) {
+            staleSourceIds.add(manifest.sourceId);
           }
         }
         progress.tick();
         continue;
       }
-      const result = await persistPreparedInput(rootDir, prepared, paths);
-      if (result.isNew) {
-        imported.push(result.manifest);
-      } else if (result.wasUpdated) {
-        updated.push(result.manifest);
-      }
+      const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
+      imported.push(...result.created);
+      updated.push(...result.updated);
+      removed.push(...result.removed);
       progress.tick();
     }
     progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
@@ -6592,7 +7041,7 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
     staleSourceIds: [...staleSourceIds]
   };
 }
-async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
+async function prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass) {
   const payloadBytes = await fs11.readFile(absoluteInput);
   const mimeType = guessMimeType(absoluteInput);
   const sourceKind = inferKind(mimeType, absoluteInput);
@@ -6623,6 +7072,94 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
     title = extracted.artifact.metadata?.title?.trim() || title;
     extractedText = extracted.extractedText;
     extractionArtifact = extracted.artifact;
+  } else if (sourceKind === "csv") {
+    title = path12.basename(absoluteInput, path12.extname(absoluteInput));
+    const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
+    title = extracted.title?.trim() || title;
+    extractedText = extracted.extractedText;
+    extractionArtifact = extracted.artifact;
+  } else if (sourceKind === "xlsx") {
+    title = path12.basename(absoluteInput, path12.extname(absoluteInput));
+    const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
+    title = extracted.title?.trim() || title;
+    extractedText = extracted.extractedText;
+    extractionArtifact = extracted.artifact;
+  } else if (sourceKind === "pptx") {
+    title = path12.basename(absoluteInput, path12.extname(absoluteInput));
+    const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
+    title = extracted.title?.trim() || title;
+    extractedText = extracted.extractedText;
+    extractionArtifact = extracted.artifact;
+  } else if (sourceKind === "epub") {
+    title = path12.basename(absoluteInput, path12.extname(absoluteInput));
+    const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
+    title = extracted.title?.trim() || title;
+    const groupId = sourceGroupIdFor({
+      title,
+      originType: "file",
+      originalPath: toPosix(absoluteInput)
+    });
+    if (extracted.chapters.length) {
+      return extracted.chapters.map(
+        (chapter, index) => finalizePreparedInput({
+          title: `${title} - ${chapter.title}`,
+          originType: "file",
+          sourceKind: "epub",
+          sourceClass,
+          originalPath: toPosix(absoluteInput),
+          repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
+          mimeType: "text/markdown",
+          storedExtension: ".md",
+          payloadBytes: Buffer.from(chapter.markdown, "utf8"),
+          extractedText: chapter.markdown,
+          extractionArtifact: {
+            extractor: "epub_text",
+            sourceKind: "epub",
+            mimeType,
+            producedAt: (/* @__PURE__ */ new Date()).toISOString(),
+            metadata: {
+              ...chapter.metadata,
+              chapter_index: String(index + 1),
+              chapter_count: String(extracted.chapters.length)
+            },
+            warnings: extracted.warnings
+          },
+          extractionHash: buildExtractionHash(chapter.markdown, {
+            extractor: "epub_text",
+            sourceKind: "epub",
+            mimeType,
+            producedAt: (/* @__PURE__ */ new Date()).toISOString(),
+            metadata: {
+              ...chapter.metadata,
+              chapter_index: String(index + 1),
+              chapter_count: String(extracted.chapters.length)
+            },
+            warnings: extracted.warnings
+          }),
+          sourceGroupId: groupId,
+          sourceGroupTitle: title,
+          sourcePartKey: chapter.partKey,
+          partIndex: index + 1,
+          partCount: extracted.chapters.length,
+          partTitle: chapter.title,
+          details: {
+            book_title: title,
+            chapter_title: chapter.title,
+            chapter_index: String(index + 1),
+            chapter_count: String(extracted.chapters.length),
+            ...extracted.author ? { author: extracted.author } : {}
+          }
+        })
+      );
+    }
+    extractedText = void 0;
+    extractionArtifact = {
+      extractor: "epub_text",
+      sourceKind: "epub",
+      mimeType,
+      producedAt: (/* @__PURE__ */ new Date()).toISOString(),
+      warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
+    };
   } else if (sourceKind === "image") {
     title = path12.basename(absoluteInput, path12.extname(absoluteInput));
     const extracted = await extractImageWithVision(rootDir, {
@@ -6636,23 +7173,33 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
   } else {
     title = path12.basename(absoluteInput, path12.extname(absoluteInput));
   }
-  return finalizePreparedInput({
-    title,
-    originType: "file",
-    sourceKind,
-    sourceClass,
-    language,
-    originalPath: toPosix(absoluteInput),
-    repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
-    mimeType,
-    storedExtension,
-    payloadBytes,
-    extractedText,
-    extractionArtifact,
-    extractionHash: buildExtractionHash(extractedText, extractionArtifact)
-  });
+  return [
+    finalizePreparedInput({
+      title,
+      originType: "file",
+      sourceKind,
+      sourceClass,
+      language,
+      originalPath: toPosix(absoluteInput),
+      repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
+      mimeType,
+      storedExtension,
+      payloadBytes,
+      extractedText,
+      extractionArtifact,
+      extractionHash: buildExtractionHash(extractedText, extractionArtifact),
+      details: extractionArtifact?.metadata
+    })
+  ];
 }
-async function prepareUrlInput(rootDir, input, options) {
+async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) { // Single-result wrapper around prepareFileInputs.
+  const prepared = await prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass);
+  if (!prepared.length) {
+    throw new Error(`No ingestable sources were extracted from ${absoluteInput}.`);
+  }
+  return prepared[0]; // only the first prepared input is returned; any further entries are not exposed to the caller
+}
+async function prepareUrlInputs(rootDir, input, options) {
   await validateUrlSafety(input);
   const response = await fetch(input);
   if (!response.ok) {
@@ -6747,6 +7294,88 @@ async function prepareUrlInput(rootDir, input, options) {
       title = extracted.artifact.metadata?.title?.trim() || title;
       extractedText = extracted.extractedText;
       extractionArtifact = extracted.artifact;
+    } else if (sourceKind === "csv") {
+      const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
+      title = extracted.title?.trim() || title;
+      extractedText = extracted.extractedText;
+      extractionArtifact = extracted.artifact;
+    } else if (sourceKind === "xlsx") {
+      const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
+      title = extracted.title?.trim() || title;
+      extractedText = extracted.extractedText;
+      extractionArtifact = extracted.artifact;
+    } else if (sourceKind === "pptx") {
+      const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
+      title = extracted.title?.trim() || title;
+      extractedText = extracted.extractedText;
+      extractionArtifact = extracted.artifact;
+    } else if (sourceKind === "epub") {
+      const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
+      title = extracted.title?.trim() || title;
+      const groupId = sourceGroupIdFor({
+        title,
+        originType: "url",
+        url: finalUrl
+      });
+      if (extracted.chapters.length) {
+        return extracted.chapters.map(
+          (chapter, index) => finalizePreparedInput({
+            title: `${title} - ${chapter.title}`,
+            originType: "url",
+            sourceKind: "epub",
+            url: finalUrl,
+            mimeType: "text/markdown",
+            storedExtension: ".md",
+            payloadBytes: Buffer.from(chapter.markdown, "utf8"),
+            extractedText: chapter.markdown,
+            extractionArtifact: {
+              extractor: "epub_text",
+              sourceKind: "epub",
+              mimeType,
+              producedAt: (/* @__PURE__ */ new Date()).toISOString(),
+              metadata: {
+                ...chapter.metadata,
+                chapter_index: String(index + 1),
+                chapter_count: String(extracted.chapters.length)
+              },
+              warnings: extracted.warnings
+            },
+            extractionHash: buildExtractionHash(chapter.markdown, {
+              extractor: "epub_text",
+              sourceKind: "epub",
+              mimeType,
+              producedAt: (/* @__PURE__ */ new Date()).toISOString(),
+              metadata: {
+                ...chapter.metadata,
+                chapter_index: String(index + 1),
+                chapter_count: String(extracted.chapters.length)
+              },
+              warnings: extracted.warnings
+            }),
+            sourceGroupId: groupId,
+            sourceGroupTitle: title,
+            sourcePartKey: chapter.partKey,
+            partIndex: index + 1,
+            partCount: extracted.chapters.length,
+            partTitle: chapter.title,
+            details: {
+              book_title: title,
+              chapter_title: chapter.title,
+              chapter_index: String(index + 1),
+              chapter_count: String(extracted.chapters.length),
+              ...extracted.author ? { author: extracted.author } : {}
+            },
+            logDetails
+          })
+        );
+      }
+      extractionArtifact = {
+        extractor: "epub_text",
+        sourceKind: "epub",
+        mimeType,
+        producedAt: (/* @__PURE__ */ new Date()).toISOString(),
+        warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
+      };
     } else if (sourceKind === "image") {
       const extracted = await extractImageWithVision(rootDir, {
         title,
@@ -6758,22 +7387,32 @@ async function prepareUrlInput(rootDir, input, options) {
       extractionArtifact = extracted.artifact;
     }
   }
-  return finalizePreparedInput({
-    title,
-    originType: "url",
-    sourceKind,
-    language,
-    url: finalUrl,
-    mimeType,
-    storedExtension,
-    payloadBytes,
-    extractedText,
-    extractionArtifact,
-    extractionHash: buildExtractionHash(extractedText, extractionArtifact),
-    attachments,
-    contentHash,
-    logDetails
-  });
+  return [
+    finalizePreparedInput({
+      title,
+      originType: "url",
+      sourceKind,
+      language,
+      url: finalUrl,
+      mimeType,
+      storedExtension,
+      payloadBytes,
+      extractedText,
+      extractionArtifact,
+      extractionHash: buildExtractionHash(extractedText, extractionArtifact),
+      attachments,
+      contentHash,
+      details: extractionArtifact?.metadata,
+      logDetails
+    })
+  ];
+}
+async function prepareUrlInput(rootDir, input, options) { // Single-result wrapper around prepareUrlInputs.
+  const prepared = await prepareUrlInputs(rootDir, input, options);
+  if (!prepared.length) {
+    throw new Error(`No ingestable sources were extracted from ${input}.`);
+  }
+  return prepared[0]; // only the first prepared input is returned; any further entries are not exposed to the caller
 }
 async function collectInboxAttachmentRefs(inputDir, files) {
   const refsBySource = /* @__PURE__ */ new Map();
@@ -6905,18 +7544,23 @@ async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
   };
 }
 function isSupportedInboxKind(sourceKind) { // True when sourceKind appears in the literal list below.
-  return ["markdown", "text", "html", "pdf", "docx", "image"].includes(sourceKind);
+  return ["markdown", "text", "html", "pdf", "docx", "epub", "csv", "xlsx", "pptx", "image"].includes(sourceKind); // epub, csv, xlsx and pptx are the kinds this line adds
 }
 async function ingestInputDetailed(rootDir, input, options) { // Prepares a URL or a local path (resolved against rootDir) and persists the result.
   const { paths } = await initWorkspace(rootDir);
   const normalizedOptions = normalizeIngestOptions(options);
   const absoluteInput = path12.resolve(rootDir, input);
   const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await findNearestGitRoot2(absoluteInput).then((value) => value ?? path12.dirname(absoluteInput)); // URLs and an explicit option use normalizedOptions.repoRoot; otherwise the nearest git root, falling back to the input's directory
-  const prepared = isHttpUrl(input) ? await prepareUrlInput(rootDir, input, normalizedOptions) : await prepareFileInput(rootDir, absoluteInput, repoRoot);
-  return await persistPreparedInput(rootDir, prepared, paths);
+  const prepared = isHttpUrl(input) ? await prepareUrlInputs(rootDir, input, normalizedOptions) : await prepareFileInputs(rootDir, absoluteInput, repoRoot); // an array of prepared sources rather than a single one
+  return await persistPreparedInputs(rootDir, input, prepared, paths); // resolves to { input, scannedCount, created, updated, unchanged, removed, skipped }
 }
 async function ingestInput(rootDir, input, options) { // Ingests `input` and resolves to a single manifest.
-  return (await ingestInputDetailed(rootDir, input, options)).manifest;
+  const result = await ingestInputDetailed(rootDir, input, options);
+  const manifest = [...result.created, ...result.updated, ...result.unchanged][0]; // first manifest found, preferring created over updated over unchanged
+  if (!manifest) {
+    throw new Error(`No source manifests were created or updated for ${input}.`);
+  }
+  return manifest;
 }
 async function addInput(rootDir, input, options = {}) {
   const { paths } = await initWorkspace(rootDir);
@@ -7014,13 +7658,20 @@ async function ingestDirectory(rootDir, inputDir, options) {
   const progress = createProgressReporter("ingest", files.length);
   for (const absolutePath of files) {
     const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
-    const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
-    const result = await persistPreparedInput(rootDir, prepared, paths);
-    if (result.isNew) {
-      imported.push(result.manifest);
-    } else if (result.wasUpdated) {
-      updated.push(result.manifest);
-    } else {
+    const preparedInputs = await prepareFileInputs(
+      rootDir,
+      absolutePath,
+      repoRoot,
+      sourceClassForRelativePath(relativePath, normalizedOptions)
+    );
+    const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
+    if (result.created.length) {
+      imported.push(...result.created);
+    }
+    if (result.updated.length) {
+      updated.push(...result.updated);
+    }
+    if (!result.created.length && !result.updated.length && !result.removed.length) {
       skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
     }
     progress.tick();
@@ -7071,13 +7722,13 @@ async function importInbox(rootDir, inputDir) {
       continue;
     }
     const prepared = sourceKind === "markdown" && refsBySource.has(absolutePath) ? await prepareInboxMarkdownInput(absolutePath, refsBySource.get(absolutePath) ?? []) : sourceKind === "html" && refsBySource.has(absolutePath) ? await prepareInboxHtmlInput(absolutePath, refsBySource.get(absolutePath) ?? []) : await prepareFileInput(rootDir, absolutePath);
-    const result = await persistPreparedInput(rootDir, prepared, paths);
-    if (!result.isNew) {
+    const result = await persistPreparedInputs(rootDir, absolutePath, [prepared], paths);
+    if (!result.created.length) {
       skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
       continue;
     }
-    attachmentCount += result.manifest.attachments?.length ?? 0;
-    imported.push(result.manifest);
+    attachmentCount += result.created.reduce((total, manifest) => total + (manifest.attachments?.length ?? 0), 0);
+    imported.push(...result.created);
   }
   await appendLogEntry(rootDir, "inbox_import", toPosix(path12.relative(rootDir, effectiveInputDir)) || ".", [
     `scanned=${files.length}`,
@@ -9336,9 +9987,19 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
     `# ${analysis.title}`,
     "",
     `Source ID: \`${manifest.sourceId}\``,
+    `Source Kind: \`${manifest.sourceKind}\``,
     manifest.url ? `Source URL: ${manifest.url}` : `Source Path: \`${manifest.originalPath ?? manifest.storedPath}\``,
     ...manifest.sourceType ? [`Source Type: \`${manifest.sourceType}\``, ""] : [""],
     ...manifest.sourceClass ? [`Source Class: \`${manifest.sourceClass}\``, ""] : [],
+    ...manifest.sourceGroupTitle ? [`Source Group: ${manifest.sourceGroupTitle}`] : [],
+    ...manifest.partTitle ? [`Part: ${manifest.partIndex ?? "?"}/${manifest.partCount ?? "?"} - ${manifest.partTitle}`] : [],
+    ...manifest.details && Object.keys(manifest.details).length ? [
+      "",
+      "## Source Details",
+      "",
+      ...Object.entries(manifest.details).map(([key, value]) => `- ${key.replace(/_/g, " ")}: ${value}`),
+      ""
+    ] : [],
     "",
     "## Summary",
     "",
@@ -14987,7 +15648,7 @@ async function bootstrapDemo(rootDir, input) {
 }
 // src/mcp.ts
-var SERVER_VERSION = "0.2.2";
+var SERVER_VERSION = "0.3.0";
 async function createMcpServer(rootDir) {
   const server = new McpServer({
     name: "swarmvault",
@@ -15165,8 +15826,8 @@ async function createMcpServer(rootDir) {
       }
     },
     async ({ input }) => {
-      const manifest = await ingestInput(rootDir, input);
-      return asToolText(manifest);
+      const result = await ingestInputDetailed(rootDir, input);
+      return asToolText(result);
     }
   );
   server.registerTool(
@@ -15970,12 +16631,11 @@ async function syncCrawlSource(rootDir, entry, options) {
   let updatedCount = 0;
   for (const pageUrl of crawl.pages) {
     const persisted = await ingestInputDetailed(rootDir, pageUrl);
-    currentSourceIds.push(persisted.manifest.sourceId);
-    if (persisted.isNew) {
-      importedCount += 1;
-    } else if (persisted.wasUpdated) {
-      updatedCount += 1;
-    }
+    currentSourceIds.push(...persisted.created.map((manifest) => manifest.sourceId));
+    currentSourceIds.push(...persisted.updated.map((manifest) => manifest.sourceId));
+    currentSourceIds.push(...persisted.unchanged.map((manifest) => manifest.sourceId));
+    importedCount += persisted.created.length;
+    updatedCount += persisted.updated.length;
   }
   let removedCount = 0;
   for (const sourceId of previousSourceIds) {
@@ -17237,6 +17897,7 @@ export {
   importInbox,
   ingestDirectory,
   ingestInput,
+  ingestInputDetailed,
   initVault,
   initWorkspace,
   installAgent,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@swarmvaultai/engine",
-  "version": "0.2.2",
+  "version": "0.3.0",
   "description": "Core engine for SwarmVault: ingest, compile, query, lint, and provider abstractions.",
   "type": "module",
   "main": "dist/index.js",
@@ -44,6 +44,7 @@
     "@mozilla/readability": "^0.6.0",
     "@vscode/tree-sitter-wasm": "^0.3.1",
     "chokidar": "^4.0.3",
+    "csv-parse": "^6.2.1",
     "fflate": "^0.8.2",
     "gray-matter": "^4.0.3",
     "ignore": "^7.0.5",
@@ -55,6 +56,7 @@
     "tree-sitter-wasms": "^0.1.13",
     "turndown": "^7.2.1",
     "typescript": "^5.9.3",
+    "xlsx": "^0.18.5",
     "yaml": "^2.8.1",
     "zod": "^4.1.8"
   },