@swarmvaultai/engine 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -186,17 +186,21 @@ This matters because many "OpenAI-compatible" backends only implement part of th
186
186
  - `reloadManagedSources(rootDir, { id, all, compile, brief, maxPages, maxDepth })` re-syncs one managed source or the full registry
187
187
  - `deleteManagedSource(rootDir, id)` removes a managed-source registry entry and transient sync state without deleting canonical vault artifacts
188
188
  - `ingestInput(rootDir, input, { includeAssets, maxAssetSize })` ingests a local file path or URL
189
+ - `ingestInputDetailed(rootDir, input, { includeAssets, maxAssetSize })` returns a summary envelope with `created`, `updated`, `unchanged`, and `removed` manifests when one input expands into multiple sources
189
190
  - `addInput(rootDir, input, { author, contributor })` captures supported URLs into normalized markdown before ingesting them, or falls back to generic URL ingest
190
191
  - `ingestDirectory(rootDir, inputDir, { repoRoot, include, exclude, maxFiles, gitignore, extractClasses })` recursively ingests a local directory as a repo-aware code/content source tree
191
192
  - `importInbox(rootDir, inputDir?)` recursively imports supported inbox files plus markdown and HTML browser-clipper style bundles
192
193
  - managed sources support local directories, public GitHub repo root URLs, and bounded same-domain docs hubs
193
194
  - registry data lives in `state/sources.json`, working state lives under `state/sources/<id>/`, and source briefs are written to `wiki/outputs/source-briefs/<id>.md`
195
+ - EPUB inputs split into chapter-level manifests with shared group metadata so books stay navigable instead of becoming one giant source
196
+ - CSV and TSV inputs produce bounded tabular summaries with delimiter-aware previews and compact column hints
197
+ - XLSX inputs extract workbook-level and sheet-level previews, while PPTX inputs extract slide text plus speaker notes when present
194
198
  - JavaScript, JSX, TypeScript, TSX, Python, Go, Rust, Java, Kotlin, Scala, Lua, Zig, C#, C, C++, PHP, Ruby, and PowerShell inputs are treated as code sources and compiled into both source pages and `wiki/code/` module pages
195
199
  - `.rst` and `.rest` inputs are treated as first-class text sources with lightweight heading and directive normalization before analysis
196
200
  - code manifests can carry `repoRelativePath`, and compile writes `state/code-index.json` so local imports can resolve across an ingested repo tree
197
201
  - repo-aware manifests, graph nodes, and graph pages can also carry `sourceClass` so first-party, third-party, resource, and generated material can be filtered and reported separately
198
202
  - HTML and markdown URL ingests localize remote image references into `raw/assets/<sourceId>/` by default and rewrite the stored markdown to local relative paths
199
- - PDF and DOCX ingests now write extracted-text and metadata sidecars under `state/extracts/`, and image ingest keeps the same sidecar model for vision extraction
203
+ - PDF, DOCX, EPUB, CSV/TSV, XLSX, and PPTX ingests write extracted-text and metadata sidecars under `state/extracts/`, and image ingest keeps the same sidecar model for vision extraction
200
204
  - Tree-sitter-backed languages now verify runtime and grammar compatibility per language; failures stay local to the affected source and surface as diagnostics instead of aborting the whole compile
201
205
 
202
206
  ### Compile + Query
package/dist/index.d.ts CHANGED
@@ -54,7 +54,7 @@ type PageStatus = "draft" | "candidate" | "active" | "archived";
54
54
  type PageManager = "system" | "human";
55
55
  type ApprovalEntryStatus = "pending" | "accepted" | "rejected";
56
56
  type ApprovalChangeType = "create" | "update" | "delete" | "promote";
57
- type SourceKind = "markdown" | "text" | "pdf" | "image" | "html" | "docx" | "binary" | "code";
57
+ type SourceKind = "markdown" | "text" | "pdf" | "image" | "html" | "docx" | "epub" | "csv" | "xlsx" | "pptx" | "binary" | "code";
58
58
  type SourceCaptureType = "arxiv" | "doi" | "tweet" | "article" | "url";
59
59
  type SourceClass = "first_party" | "third_party" | "resource" | "generated";
60
60
  type ManagedSourceKind = "directory" | "github_repo" | "crawl_url";
@@ -231,7 +231,7 @@ interface SourceAttachment {
231
231
  mimeType: string;
232
232
  originalPath?: string;
233
233
  }
234
- type ExtractionKind = "plain_text" | "html_readability" | "pdf_text" | "docx_text" | "image_vision";
234
+ type ExtractionKind = "plain_text" | "html_readability" | "pdf_text" | "docx_text" | "epub_text" | "csv_text" | "xlsx_text" | "pptx_text" | "image_vision";
235
235
  interface ExtractionTerm {
236
236
  name: string;
237
237
  description: string;
@@ -284,6 +284,15 @@ interface DirectoryIngestResult {
284
284
  updated: SourceManifest[];
285
285
  skipped: DirectoryIngestSkip[];
286
286
  }
287
/**
 * Result envelope declared as the resolved value of `ingestInputDetailed`.
 * It groups the source manifests produced by a single input into
 * created / updated / unchanged / removed buckets, which matters when one
 * input expands into multiple sources.
 */
+ interface InputIngestResult {
288
+ input: string; // NOTE(review): presumably the input path or URL that was ingested; confirm against the implementation
289
+ scannedCount: number; // NOTE(review): presumably the number of candidate sources examined; confirm
290
+ created: SourceManifest[]; // manifests reported as newly created
291
+ updated: SourceManifest[]; // manifests reported as updated
292
+ unchanged: SourceManifest[]; // manifests reported as unchanged
293
+ removed: SourceManifest[]; // manifests reported as removed
294
+ skipped: DirectoryIngestSkip[]; // skip records; reuses the entry type of DirectoryIngestResult.skipped
295
+ }
287
296
  interface SourceManifest {
288
297
  sourceId: string;
289
298
  title: string;
@@ -302,6 +311,13 @@ interface SourceManifest {
302
311
  mimeType: string;
303
312
  contentHash: string;
304
313
  semanticHash: string;
314
+ sourceGroupId?: string;
315
+ sourceGroupTitle?: string;
316
+ sourcePartKey?: string;
317
+ partIndex?: number;
318
+ partCount?: number;
319
+ partTitle?: string;
320
+ details?: Record<string, string>;
305
321
  createdAt: string;
306
322
  updatedAt: string;
307
323
  attachments?: SourceAttachment[];
@@ -1206,6 +1222,7 @@ declare function uninstallGitHooks(rootDir: string): Promise<GitHookStatus>;
1206
1222
  declare function listTrackedRepoRoots(rootDir: string): Promise<string[]>;
1207
1223
  declare function syncTrackedRepos(rootDir: string, options?: IngestOptions, repoRoots?: string[]): Promise<RepoSyncResult>;
1208
1224
  declare function syncTrackedReposForWatch(rootDir: string, options?: IngestOptions, repoRoots?: string[]): Promise<WatchRepoSyncResult>;
1225
+ declare function ingestInputDetailed(rootDir: string, input: string, options?: IngestOptions): Promise<InputIngestResult>;
1209
1226
  declare function ingestInput(rootDir: string, input: string, options?: IngestOptions): Promise<SourceManifest>;
1210
1227
  declare function addInput(rootDir: string, input: string, options?: AddOptions): Promise<AddResult>;
1211
1228
  declare function ingestDirectory(rootDir: string, inputDir: string, options?: IngestOptions): Promise<DirectoryIngestResult>;
@@ -1327,4 +1344,4 @@ declare function getWatchStatus(rootDir: string): Promise<WatchStatusResult>;
1327
1344
  declare function createWebSearchAdapter(id: string, config: WebSearchProviderConfig, rootDir: string): Promise<WebSearchAdapter>;
1328
1345
  declare function getWebSearchAdapterForTask(rootDir: string, task: "deepLintProvider"): Promise<WebSearchAdapter>;
1329
1346
 
1330
- export { type AddOptions, type AddResult, type AgentType, type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type BenchmarkArtifact, type BenchmarkOptions, type BenchmarkQuestionResult, type BenchmarkSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeIndexArtifact, type CodeIndexEntry, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type DirectoryIngestResult, type DirectoryIngestSkip, type EmbeddingCacheArtifact, type EmbeddingCacheEntry, type EvidenceClass, type ExploreOptions, type ExploreResult, type ExploreStepResult, type ExtractionClaim, type ExtractionKind, type ExtractionTerm, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GitHookStatus, type GraphArtifact, type GraphEdge, type GraphExplainNeighbor, type GraphExplainResult, type GraphExportFormat, type GraphExportResult, type GraphHyperedge, type GraphNode, type GraphPage, type GraphPathResult, type GraphPushCounts, type GraphPushNeo4jOptions, type GraphPushResult, type GraphQueryMatch, type GraphQueryResult, type GraphReportArtifact, type ImageGenerationRequest, type ImageGenerationResponse, type ImageVisionExtraction, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type InstallAgentOptions, type InstallAgentResult, type LintFinding, type LintOptions, type ManagedSourceAddOptions, type ManagedSourceAddResult, type ManagedSourceDeleteResult, type ManagedSourceKind, type ManagedSourceRecord, type ManagedSourceReloadOptions, type ManagedSourceReloadResult, type ManagedSourceStatus, type ManagedSourceSyncCounts, type ManagedSourcesArtifact, type Neo4jGraphSinkConfig, type OrchestrationConfig, type 
OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type PendingSemanticRefreshEntry, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type RepoSyncResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceCaptureType, type SourceClaim, type SourceClass, type SourceExtractionArtifact, type SourceKind, type SourceManifest, type SourceRationale, type VaultConfig, type WatchController, type WatchOptions, type WatchRepoSyncResult, type WatchRunRecord, type WatchStatusResult, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, addInput, addManagedSource, agentTypeSchema, archiveCandidate, assertProviderCapability, benchmarkVault, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, deleteManagedSource, explainGraphVault, exploreVault, exportGraphFormat, exportGraphHtml, getGitHookStatus, getProviderForTask, getWatchStatus, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestDirectory, ingestInput, initVault, initWorkspace, installAgent, installConfiguredAgents, installGitHooks, lintVault, listApprovals, listCandidates, listGodNodes, listGraphHyperedges, listManagedSourceRecords, listManifests, listPages, 
listSchedules, listTrackedRepoRoots, loadVaultConfig, loadVaultSchema, loadVaultSchemas, pathGraphVault, promoteCandidate, providerCapabilitySchema, providerTypeSchema, pushGraphNeo4j, queryGraphVault, queryVault, readApproval, readExtractedText, readGraphReport, readPage, rejectApproval, reloadManagedSources, resolvePaths, runSchedule, runWatchCycle, searchVault, serveSchedules, startGraphServer, startMcpServer, syncTrackedRepos, syncTrackedReposForWatch, uninstallGitHooks, watchVault, webSearchProviderTypeSchema };
1347
+ export { type AddOptions, type AddResult, type AgentType, type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type BenchmarkArtifact, type BenchmarkOptions, type BenchmarkQuestionResult, type BenchmarkSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeIndexArtifact, type CodeIndexEntry, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type DirectoryIngestResult, type DirectoryIngestSkip, type EmbeddingCacheArtifact, type EmbeddingCacheEntry, type EvidenceClass, type ExploreOptions, type ExploreResult, type ExploreStepResult, type ExtractionClaim, type ExtractionKind, type ExtractionTerm, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GitHookStatus, type GraphArtifact, type GraphEdge, type GraphExplainNeighbor, type GraphExplainResult, type GraphExportFormat, type GraphExportResult, type GraphHyperedge, type GraphNode, type GraphPage, type GraphPathResult, type GraphPushCounts, type GraphPushNeo4jOptions, type GraphPushResult, type GraphQueryMatch, type GraphQueryResult, type GraphReportArtifact, type ImageGenerationRequest, type ImageGenerationResponse, type ImageVisionExtraction, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type InputIngestResult, type InstallAgentOptions, type InstallAgentResult, type LintFinding, type LintOptions, type ManagedSourceAddOptions, type ManagedSourceAddResult, type ManagedSourceDeleteResult, type ManagedSourceKind, type ManagedSourceRecord, type ManagedSourceReloadOptions, type ManagedSourceReloadResult, type ManagedSourceStatus, type ManagedSourceSyncCounts, type ManagedSourcesArtifact, type Neo4jGraphSinkConfig, type 
OrchestrationConfig, type OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type PendingSemanticRefreshEntry, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type RepoSyncResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceCaptureType, type SourceClaim, type SourceClass, type SourceExtractionArtifact, type SourceKind, type SourceManifest, type SourceRationale, type VaultConfig, type WatchController, type WatchOptions, type WatchRepoSyncResult, type WatchRunRecord, type WatchStatusResult, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, addInput, addManagedSource, agentTypeSchema, archiveCandidate, assertProviderCapability, benchmarkVault, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, deleteManagedSource, explainGraphVault, exploreVault, exportGraphFormat, exportGraphHtml, getGitHookStatus, getProviderForTask, getWatchStatus, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestDirectory, ingestInput, ingestInputDetailed, initVault, initWorkspace, installAgent, installConfiguredAgents, installGitHooks, lintVault, listApprovals, listCandidates, listGodNodes, listGraphHyperedges, 
listManagedSourceRecords, listManifests, listPages, listSchedules, listTrackedRepoRoots, loadVaultConfig, loadVaultSchema, loadVaultSchemas, pathGraphVault, promoteCandidate, providerCapabilitySchema, providerTypeSchema, pushGraphNeo4j, queryGraphVault, queryVault, readApproval, readExtractedText, readGraphReport, readPage, rejectApproval, reloadManagedSources, resolvePaths, runSchedule, runWatchCycle, searchVault, serveSchedules, startGraphServer, startMcpServer, syncTrackedRepos, syncTrackedReposForWatch, uninstallGitHooks, watchVault, webSearchProviderTypeSchema };
package/dist/index.js CHANGED
@@ -1729,7 +1729,7 @@ import matter3 from "gray-matter";
1729
1729
  import ignore from "ignore";
1730
1730
  import { JSDOM as JSDOM2 } from "jsdom";
1731
1731
  import mime from "mime-types";
1732
- import TurndownService from "turndown";
1732
+ import TurndownService2 from "turndown";
1733
1733
 
1734
1734
  // src/code-analysis.ts
1735
1735
  import fs6 from "fs/promises";
@@ -4504,8 +4504,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
4504
4504
  import fs7 from "fs/promises";
4505
4505
  import os from "os";
4506
4506
  import path7 from "path";
4507
+ import { parse as parseCsvSync } from "csv-parse/sync";
4507
4508
  import { strFromU8, unzipSync } from "fflate";
4508
4509
  import { JSDOM } from "jsdom";
4510
+ import TurndownService from "turndown";
4509
4511
  import { z } from "zod";
4510
4512
  var imageVisionExtractionSchema = z.object({
4511
4513
  title: z.string().min(1).nullable().optional(),
@@ -4685,7 +4687,7 @@ function normalizePdfMetadata(raw) {
4685
4687
  function normalizeDocumentText(raw) {
4686
4688
  return raw.replace(/\r\n/g, "\n").split(/\n{2,}/).map((section) => normalizeWhitespace(section)).filter(Boolean).join("\n\n").trim();
4687
4689
  }
4688
- function parseDocxCoreMetadata(bytes) {
4690
+ function parseOfficeCoreMetadata(bytes) {
4689
4691
  try {
4690
4692
  const archive = unzipSync(new Uint8Array(bytes));
4691
4693
  const coreXml = archive["docProps/core.xml"];
@@ -4725,6 +4727,122 @@ function parseDocxCoreMetadata(bytes) {
4725
4727
  return void 0;
4726
4728
  }
4727
4729
  }
4730
/**
 * Decode a byte container as UTF-8 text and drop a leading byte-order mark.
 *
 * @param bytes Value whose `toString("utf8")` yields the decoded text.
 * @returns The decoded text without a leading U+FEFF character.
 */
function decodeTextBytes(bytes) {
  const decoded = bytes.toString("utf8");
  // 65279 is U+FEFF, the byte-order mark.
  if (decoded.charCodeAt(0) === 65279) {
    return decoded.slice(1);
  }
  return decoded;
}
4734
/**
 * Coerce a raw table cell to a whitespace-normalized string.
 *
 * @param value Raw cell value; `null` and `undefined` are treated as "".
 * @returns The result of `normalizeWhitespace` applied to the stringified cell.
 */
function normalizeTableCell(value) {
  const cellText = String(value ?? "");
  return normalizeWhitespace(cellText);
}
4737
/**
 * Report whether a cell string represents a finite number.
 *
 * Empty and whitespace-only strings are rejected explicitly, because
 * `Number("")` and `Number("   ")` both evaluate to 0 and would otherwise be
 * classified as numeric.
 *
 * @param value Cell text to classify.
 * @returns `true` when the text is non-blank and converts to a finite number.
 */
function isNumericCell(value) {
  if (value.trim().length === 0) {
    return false;
  }
  return Number.isFinite(Number(value));
}
4740
/**
 * Split parsed table rows into a header row and body rows.
 *
 * The first row is treated as a header when it has at least one non-empty
 * cell, all non-empty cells are distinct, at least half of them are
 * non-numeric, and there is at least one further row. Otherwise synthetic
 * `column_N` headers are generated and every row is kept as body data.
 *
 * @param rows Array of rows, each an array of cell strings.
 * @returns `{ headers, bodyRows }`.
 */
function detectHeaderRow(rows) {
  if (!rows.length) {
    return { headers: [], bodyRows: [] };
  }
  const firstRow = rows[0] ?? [];
  const nonEmpty = firstRow.filter(Boolean);
  const unique = new Set(nonEmpty);
  const nonNumeric = nonEmpty.filter((value) => !isNumericCell(value));
  const looksLikeHeader = nonEmpty.length > 0 && unique.size === nonEmpty.length && nonNumeric.length >= Math.ceil(nonEmpty.length / 2) && rows.length > 1;
  if (looksLikeHeader) {
    return {
      // Blank header cells get a positional placeholder name.
      headers: firstRow.map((value, index) => value || `column_${index + 1}`),
      bodyRows: rows.slice(1)
    };
  }
  // Find the widest row with a plain loop. Spreading one argument per row into
  // Math.max exceeds the engine's argument limit on very large tables and
  // throws a RangeError.
  let columnCount = 0;
  for (const row of rows) {
    if (row.length > columnCount) {
      columnCount = row.length;
    }
  }
  return {
    headers: Array.from({ length: columnCount }, (_, index) => `column_${index + 1}`),
    bodyRows: rows
  };
}
4761
/**
 * Produce compact per-column hint lines for a table.
 *
 * A column whose non-empty values are all numeric yields `- <header>: numeric`.
 * A column with at most six distinct non-empty values yields a
 * `low-cardinality` line listing them. Columns with no values, or matching
 * neither rule, produce no hint.
 *
 * @param headers Column names, index-aligned with the cells of each row.
 * @param rows Body rows (arrays of cell values).
 * @returns Markdown bullet lines, in column order.
 */
function columnHints(headers, rows) {
  const hints = [];
  headers.forEach((header, index) => {
    const values = [];
    for (const row of rows) {
      const cell = normalizeTableCell(row[index] ?? "");
      if (cell) {
        values.push(cell);
      }
    }
    if (values.length === 0) {
      return;
    }
    const uniqueValues = [...new Set(values)];
    if (values.every(isNumericCell)) {
      hints.push(`- ${header}: numeric`);
      return;
    }
    if (uniqueValues.length <= 6 && values.length >= uniqueValues.length) {
      hints.push(`- ${header}: low-cardinality (${uniqueValues.slice(0, 6).join(", ")})`);
    }
  });
  return hints;
}
4777
/**
 * Render headers and rows as the lines of a markdown table preview.
 *
 * Literal `|` characters in headers and cells are escaped as `\|` so that
 * data containing pipes cannot shift or add table columns.
 *
 * @param headers Column names; when empty a single placeholder line is returned.
 * @param rows Body rows; each row is padded or cut to the header width.
 * @param rowLimit Maximum number of body rows to render (default 20).
 * @returns Array of markdown lines (header, separator, then data rows).
 */
function markdownTable(headers, rows, rowLimit = 20) {
  if (!headers.length) {
    return ["No tabular preview available."];
  }
  const escapeCell = (cell) => String(cell).replace(/\|/g, "\\|");
  const width = headers.length;
  const lines = [`| ${headers.map(escapeCell).join(" | ")} |`, `| ${headers.map(() => "---").join(" | ")} |`];
  for (const row of rows.slice(0, rowLimit)) {
    const normalized = Array.from({ length: width }, (_, index) => escapeCell(normalizeTableCell(row[index] ?? "")));
    lines.push(`| ${normalized.join(" | ")} |`);
  }
  return lines;
}
4789
/**
 * Read one entry of an unzipped archive as a string.
 *
 * @param archive Mapping from entry path to entry bytes.
 * @param entryPath Path of the entry inside the archive.
 * @returns The entry decoded with `strFromU8`, or `undefined` when the entry is absent or falsy.
 */
function zipEntryText(archive, entryPath) {
  const payload = archive[entryPath];
  if (!payload) {
    return void 0;
  }
  return strFromU8(payload);
}
4793
/**
 * Parse an XML string into a DOM document using JSDOM in `text/xml` mode.
 *
 * @param xml XML source text.
 * @returns The parsed `document` object.
 */
function parseXmlDocument(xml) {
  const dom = new JSDOM(xml, { contentType: "text/xml" });
  return dom.window.document;
}
4796
/**
 * Return the directory portion of a "/"-separated zip entry path.
 *
 * @param value Entry path such as `ppt/slides/slide1.xml`.
 * @returns Everything before the last "/", or "" when there is no "/".
 */
function zipDirname(value) {
  const slashAt = value.lastIndexOf("/");
  if (slashAt === -1) {
    return "";
  }
  return value.slice(0, slashAt);
}
4800
/**
 * Resolve a reference found inside one zip entry to a normalized entry path.
 *
 * Relative targets are resolved against the directory of `basePath`. Targets
 * that start with "/" are package-absolute and resolve from the archive root;
 * joining them under the base directory would produce paths such as
 * `ppt/ppt/slides/slide1.xml` that never match an entry.
 *
 * @param basePath Path of the entry that contains the reference.
 * @param target Reference as written in the source document.
 * @returns Normalized POSIX-style entry path without a leading "/".
 */
function resolveZipTarget(basePath, target) {
  if (target.startsWith("/")) {
    return path7.posix.normalize(target.replace(/^\/+/, ""));
  }
  return path7.posix.normalize(path7.posix.join(zipDirname(basePath), target));
}
4803
/**
 * Index the `Relationship` elements of a relationships part by their `Id`.
 *
 * @param xml Source text of the `.rels` part.
 * @param basePath Entry path that the relationship targets are resolved against.
 * @returns Map from relationship id to `{ target, type }`, where `target` has
 *   been passed through `resolveZipTarget`. Elements missing an `Id` or a
 *   `Target` are ignored.
 */
function relationshipTargets(xml, basePath) {
  const byId = /* @__PURE__ */ new Map();
  const elements = Array.from(parseXmlDocument(xml).getElementsByTagName("*"));
  const relationships = elements.filter((element) => element.localName === "Relationship");
  for (const relationship of relationships) {
    const id = relationship.getAttribute("Id")?.trim();
    const target = relationship.getAttribute("Target")?.trim();
    const type = relationship.getAttribute("Type")?.trim() ?? "";
    if (id && target) {
      byId.set(id, { target: resolveZipTarget(basePath, target), type });
    }
  }
  return byId;
}
4820
/**
 * Collect the non-empty text content of every element with a given local name.
 *
 * @param xml XML source text.
 * @param localName Namespace-less element name to match (for example "t").
 * @returns Whitespace-normalized text values in document order.
 */
function xmlTextNodes(xml, localName) {
  const elements = Array.from(parseXmlDocument(xml).getElementsByTagName("*"));
  return elements
    .filter((element) => element.localName === localName)
    .map((element) => normalizeWhitespace(element.textContent ?? ""))
    .filter((text) => Boolean(text));
}
4834
/**
 * Find the text of the first `h1`, `h2` or `h3` element in an HTML string.
 *
 * @param html HTML source text.
 * @returns The whitespace-normalized heading text, or `undefined` when there
 *   is no such heading or its text is empty.
 */
function firstHtmlHeading(html) {
  const { document } = new JSDOM(html).window;
  const headingElement = document.querySelector("h1, h2, h3");
  const headingText = normalizeWhitespace(headingElement?.textContent ?? "");
  if (!headingText) {
    return void 0;
  }
  return headingText;
}
4840
/**
 * Convert an HTML string to markdown with Turndown.
 *
 * The `<body>` inner HTML is converted when a body element exists; otherwise
 * the raw input is converted. Headings use ATX style and code blocks are fenced.
 *
 * @param html HTML source text.
 * @returns Trimmed markdown.
 */
function htmlToMarkdown(html) {
  const bodyElement = new JSDOM(html).window.document.body;
  const fragment = bodyElement?.innerHTML ?? html;
  const converter = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
  const markdown = converter.turndown(fragment);
  return markdown.trim();
}
4728
4846
  async function extractPdfText(input) {
4729
4847
  try {
4730
4848
  const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
@@ -4782,7 +4900,7 @@ async function extractDocxText(input) {
4782
4900
  const warnings = result.messages.map((message) => normalizeWhitespace(message.message)).filter(Boolean).map((message) => truncate(message, 240));
4783
4901
  const artifact = {
4784
4902
  ...extractionMetadata("docx", input.mimeType, "docx_text"),
4785
- metadata: parseDocxCoreMetadata(input.bytes),
4903
+ metadata: parseOfficeCoreMetadata(input.bytes),
4786
4904
  warnings: warnings.length ? warnings : void 0
4787
4905
  };
4788
4906
  if (!extractedText) {
@@ -4801,6 +4919,258 @@ async function extractDocxText(input) {
4801
4919
  };
4802
4920
  }
4803
4921
  }
4922
/**
 * Build a bounded markdown summary and extraction artifact for a CSV or TSV input.
 *
 * The input is treated as TSV when its file name ends in `.tsv` or its MIME
 * type contains `tab-separated`; TSV is parsed with a tab delimiter and
 * everything else with a comma. Any failure is reported as a warning on the
 * returned artifact instead of being thrown.
 *
 * @param input Object providing `bytes`, `mimeType` and an optional `fileName`.
 * @returns `{ title, extractedText, artifact }` on success, or `{ artifact }`
 *   carrying a warning when extraction fails.
 */
async function extractCsvText(input) {
  try {
    const rawText = decodeTextBytes(input.bytes);
    const isTsv = input.fileName?.toLowerCase().endsWith(".tsv") || input.mimeType.includes("tab-separated");
    // TSV fields are separated by a tab character, not a space.
    const delimiter = isTsv ? "\t" : ",";
    const parsed = parseCsvSync(rawText, {
      delimiter,
      relax_column_count: true,
      skip_empty_lines: true,
      trim: true
    });
    const rows = parsed.map((row) => row.map((value) => normalizeTableCell(value)));
    const { headers, bodyRows } = detectHeaderRow(rows);
    const hintLines = columnHints(headers, bodyRows);
    const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
    // Falsy entries (null and "") are dropped before joining, so optional
    // sections simply disappear when they have no content.
    const extractedText = [
      title ? `# ${title}` : null,
      `Format: ${isTsv ? "TSV" : "CSV"}`,
      `Rows: ${bodyRows.length}`,
      `Columns: ${headers.length}`,
      headers.length ? `Headers: ${headers.join(", ")}` : null,
      "",
      hintLines.length ? "## Column Hints" : null,
      hintLines.length ? hintLines.join("\n") : null,
      hintLines.length ? "" : null,
      "## Preview",
      ...markdownTable(headers, bodyRows)
    ].filter((item) => Boolean(item)).join("\n").trim();
    const artifact = {
      ...extractionMetadata("csv", input.mimeType, "csv_text"),
      metadata: {
        format: isTsv ? "tsv" : "csv",
        row_count: String(bodyRows.length),
        column_count: String(headers.length),
        headers: headers.join(", ")
      }
    };
    return {
      title,
      extractedText,
      artifact
    };
  } catch (error) {
    return {
      artifact: {
        ...extractionMetadata("csv", input.mimeType, "csv_text"),
        warnings: [`CSV extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
      }
    };
  }
}
4972
/**
 * Build a workbook-level and sheet-level markdown preview for an XLSX input.
 *
 * Only the first 10 sheets are previewed; when the workbook has more, a
 * warning is attached to the artifact. Any failure is reported as a warning
 * on the returned artifact instead of being thrown.
 *
 * @param input Object providing `bytes`, `mimeType` and an optional `fileName`.
 * @returns `{ title, extractedText, artifact }` on success, or `{ artifact }`
 *   carrying a warning when extraction fails.
 */
async function extractXlsxText(input) {
  try {
    const XLSX = await import("xlsx");
    const workbook = XLSX.read(input.bytes, { type: "buffer", cellFormula: false, cellHTML: false, cellStyles: false });
    const allSheetNames = workbook.SheetNames;
    const previewedNames = allSheetNames.slice(0, 10);
    const sections = [];
    for (const name of previewedNames) {
      const sheet = workbook.Sheets[name];
      if (sheet) {
        const rawRows = XLSX.utils.sheet_to_json(sheet, {
          header: 1,
          raw: false,
          defval: ""
        });
        const rows = rawRows.map((row) => row.map((value) => normalizeTableCell(value)));
        const { headers, bodyRows } = detectHeaderRow(rows);
        sections.push(
          `## Sheet: ${name}`,
          `Rows: ${bodyRows.length}`,
          `Columns: ${headers.length}`,
          ...markdownTable(headers, bodyRows)
        );
      }
    }
    // Prefer the workbook's own title property; fall back to the file name stem.
    const propsTitle = normalizeWhitespace(String(workbook.Props?.Title ?? ""));
    const title = propsTitle || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
    const lines = [];
    if (title) {
      lines.push(`# ${title}`);
    }
    lines.push(`Sheets: ${allSheetNames.length}`);
    if (allSheetNames.length) {
      lines.push(`Sheet Names: ${allSheetNames.join(", ")}`);
    }
    for (const section of sections) {
      // Empty entries never reach the output text.
      if (section) {
        lines.push(section);
      }
    }
    const extractedText = lines.join("\n").trim();
    const metadata = {
      sheet_count: String(allSheetNames.length),
      sheet_names: allSheetNames.join(", ")
    };
    let warnings = void 0;
    if (allSheetNames.length > previewedNames.length) {
      warnings = ["Workbook preview truncated to the first 10 sheets."];
    }
    return {
      title,
      extractedText,
      artifact: {
        ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
        metadata,
        warnings
      }
    };
  } catch (error) {
    const reason = error instanceof Error ? truncate(error.message, 240) : "unknown error";
    return {
      artifact: {
        ...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
        warnings: [`XLSX extraction failed: ${reason}`]
      }
    };
  }
}
5027
/**
 * Extract slide text and speaker notes from a PPTX input as markdown.
 *
 * Slides are taken in the order of the `sldId` elements of
 * `ppt/presentation.xml`, resolved through the presentation relationships.
 * At most 60 resolved slides are extracted. The truncation warning is based on
 * the number of resolved slides, so `sldId` entries whose relationship cannot
 * be resolved do not trigger a misleading "truncated" warning. Any failure is
 * reported as a warning on the returned artifact instead of being thrown.
 *
 * @param input Object providing `bytes`, `mimeType` and an optional `fileName`.
 * @returns `{ title, extractedText, artifact }` on success, or `{ artifact }`
 *   carrying a warning when extraction fails.
 */
async function extractPptxText(input) {
  const maxSlides = 60;
  try {
    const archive = unzipSync(new Uint8Array(input.bytes));
    const presentationXml = zipEntryText(archive, "ppt/presentation.xml");
    if (!presentationXml) {
      throw new Error("Missing ppt/presentation.xml");
    }
    const relsXml = zipEntryText(archive, "ppt/_rels/presentation.xml.rels");
    if (!relsXml) {
      throw new Error("Missing ppt/_rels/presentation.xml.rels");
    }
    const rels = relationshipTargets(relsXml, "ppt/presentation.xml");
    const document = parseXmlDocument(presentationXml);
    const resolvedTargets = Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").map((node) => node.getAttribute("r:id")?.trim()).filter((value) => Boolean(value)).map((relationshipId) => rels.get(relationshipId)?.target).filter((value) => Boolean(value));
    const slideTargets = resolvedTargets.slice(0, maxSlides);
    const slideSections = [];
    for (let index = 0; index < slideTargets.length; index += 1) {
      const slidePath = slideTargets[index];
      const slideXml = zipEntryText(archive, slidePath);
      if (!slideXml) {
        continue;
      }
      const slideTexts = xmlTextNodes(slideXml, "t");
      // The first text run doubles as the slide title when one exists.
      const slideTitle = slideTexts[0] ?? `Slide ${index + 1}`;
      slideSections.push(`## Slide ${index + 1}: ${slideTitle}`);
      if (slideTexts.length) {
        slideSections.push(slideTexts.join("\n"));
      }
      // Speaker notes are linked from the slide's own relationships part.
      const slideRelsPath = `${zipDirname(slidePath)}/_rels/${path7.posix.basename(slidePath)}.rels`;
      const slideRelsXml = zipEntryText(archive, slideRelsPath);
      if (slideRelsXml) {
        const slideRels = relationshipTargets(slideRelsXml, slidePath);
        const notesTarget = [...slideRels.values()].find((entry) => entry.type.endsWith("/notesSlide"))?.target;
        if (notesTarget) {
          const notesXml = zipEntryText(archive, notesTarget);
          const noteTexts = notesXml ? xmlTextNodes(notesXml, "t") : [];
          if (noteTexts.length) {
            slideSections.push("Notes:");
            slideSections.push(noteTexts.join("\n"));
          }
        }
      }
      slideSections.push("");
    }
    const metadata = parseOfficeCoreMetadata(input.bytes);
    const title = metadata?.title || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
    const extractedText = [title ? `# ${title}` : null, `Slides: ${slideTargets.length}`, "", ...slideSections].filter((item) => Boolean(item)).join("\n").trim();
    return {
      title,
      extractedText,
      artifact: {
        ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
        metadata: {
          ...metadata ?? {},
          slide_count: String(slideTargets.length)
        },
        warnings: resolvedTargets.length > slideTargets.length ? ["Slide extraction truncated to the first 60 slides."] : void 0
      }
    };
  } catch (error) {
    return {
      artifact: {
        ...extractionMetadata("pptx", input.mimeType, "pptx_text"),
        warnings: [`PPTX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
      }
    };
  }
}
5094
/**
 * Split an EPUB into chapter-level markdown parts following the package spine.
 *
 * The package document is located through `META-INF/container.xml`. Spine
 * items that are not (X)HTML, that carry the `nav` property, that convert to
 * empty markdown, or that are titled "Table of Contents" are skipped.
 * Manifest hrefs are URL references, so when the raw href does not match a zip
 * entry the fragment is stripped and percent-escapes are decoded before a
 * second lookup. Any failure is reported through `warnings` instead of being
 * thrown.
 *
 * @param input Object providing `bytes` and an optional `fileName`.
 * @returns `{ title, author, chapters, warnings }` on success, or
 *   `{ chapters: [], warnings }` when extraction fails.
 */
async function extractEpubChapters(input) {
  // Strip any "#fragment" and decode percent-escapes; returns the input
  // unchanged when it contains a malformed escape sequence.
  const decodeHref = (href) => {
    const withoutFragment = href.split("#")[0] ?? href;
    try {
      return decodeURIComponent(withoutFragment);
    } catch {
      return withoutFragment;
    }
  };
  try {
    const archive = unzipSync(new Uint8Array(input.bytes));
    const containerXml = zipEntryText(archive, "META-INF/container.xml");
    if (!containerXml) {
      throw new Error("Missing META-INF/container.xml");
    }
    const container = parseXmlDocument(containerXml);
    const rootfile = Array.from(container.getElementsByTagName("*")).find((node) => node.localName === "rootfile");
    const packagePath = rootfile?.getAttribute("full-path")?.trim();
    if (!packagePath) {
      throw new Error("EPUB container did not declare a package document.");
    }
    const packageXml = zipEntryText(archive, packagePath);
    if (!packageXml) {
      throw new Error(`Missing EPUB package document: ${packagePath}`);
    }
    const packageDocument = parseXmlDocument(packageXml);
    // Manifest items keyed by id; entries without an id or href are dropped.
    const manifestEntries = new Map(
      Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "item").map(
        (node) => [
          node.getAttribute("id")?.trim() ?? "",
          {
            href: node.getAttribute("href")?.trim() ?? "",
            mediaType: node.getAttribute("media-type")?.trim() ?? "",
            properties: node.getAttribute("properties")?.trim() ?? ""
          }
        ]
      ).filter(([id, item]) => Boolean(id && item.href))
    );
    const spineIds = Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "itemref").map((node) => node.getAttribute("idref")?.trim()).filter((value) => Boolean(value));
    const bookTitle = xmlTextNodes(packageXml, "title")[0] || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
    const author = xmlTextNodes(packageXml, "creator")[0];
    const chapters = [];
    for (const spineId of spineIds) {
      const item = manifestEntries.get(spineId);
      // "xhtml" media types also contain "html", so one check covers both.
      if (!item || !item.mediaType.includes("html")) {
        continue;
      }
      if (item.properties.split(/\s+/).includes("nav")) {
        continue;
      }
      let html = zipEntryText(archive, resolveZipTarget(packagePath, item.href));
      if (!html) {
        // Zip entry names are stored unescaped, while hrefs may be
        // percent-encoded or carry a fragment.
        const decodedHref = decodeHref(item.href);
        if (decodedHref && decodedHref !== item.href) {
          html = zipEntryText(archive, resolveZipTarget(packagePath, decodedHref));
        }
      }
      if (!html) {
        continue;
      }
      const markdown = htmlToMarkdown(html);
      if (!markdown) {
        continue;
      }
      const chapterTitle = firstHtmlHeading(html) || markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() || item.href;
      const normalizedTitle = normalizeWhitespace(chapterTitle);
      if (!normalizedTitle || /^table of contents$/i.test(normalizedTitle)) {
        continue;
      }
      chapters.push({
        // The raw href stays the part key so existing part identities are unchanged.
        partKey: item.href,
        title: normalizedTitle,
        markdown,
        metadata: {
          book_title: bookTitle ?? "",
          chapter_title: normalizedTitle,
          author: author ?? ""
        }
      });
    }
    return {
      title: bookTitle,
      author,
      chapters,
      warnings: chapters.length ? void 0 : ["EPUB extraction completed but found no chapter-like spine entries."]
    };
  } catch (error) {
    return {
      chapters: [],
      warnings: [`EPUB extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
    };
  }
}
4804
5174
 
4805
5175
  // src/logs.ts
4806
5176
  import fs8 from "fs/promises";
@@ -5236,15 +5606,27 @@ function inferKind(mimeType, filePath) {
5236
5606
  if (mimeType.includes("html")) {
5237
5607
  return "html";
5238
5608
  }
5239
- if (mimeType.startsWith("text/")) {
5240
- return "text";
5241
- }
5242
5609
  if (mimeType === "application/pdf" || filePath.toLowerCase().endsWith(".pdf")) {
5243
5610
  return "pdf";
5244
5611
  }
5245
5612
  if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || filePath.toLowerCase().endsWith(".docx")) {
5246
5613
  return "docx";
5247
5614
  }
5615
+ if (mimeType === "application/epub+zip" || filePath.toLowerCase().endsWith(".epub")) {
5616
+ return "epub";
5617
+ }
5618
+ if (mimeType === "text/csv" || mimeType === "text/tab-separated-values" || filePath.toLowerCase().endsWith(".csv") || filePath.toLowerCase().endsWith(".tsv")) {
5619
+ return "csv";
5620
+ }
5621
+ if (mimeType.startsWith("text/")) {
5622
+ return "text";
5623
+ }
5624
+ if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || filePath.toLowerCase().endsWith(".xlsx")) {
5625
+ return "xlsx";
5626
+ }
5627
+ if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || filePath.toLowerCase().endsWith(".pptx")) {
5628
+ return "pptx";
5629
+ }
5248
5630
  if (mimeType.startsWith("image/")) {
5249
5631
  return "image";
5250
5632
  }
@@ -5270,6 +5652,10 @@ function guessMimeType(target) {
5270
5652
  }
5271
5653
  return mime.lookup(target) || "application/octet-stream";
5272
5654
  }
5655
+ function sourceGroupIdFor(prepared) {
5656
+ const originKey = prepared.originType === "url" ? prepared.url ?? prepared.title : prepared.originalPath ?? prepared.title;
5657
+ return `${slugify(prepared.title)}-${sha256(originKey).slice(0, 8)}`;
5658
+ }
5273
5659
  function rstAdornmentLine(line) {
5274
5660
  const trimmed = line.trim();
5275
5661
  if (trimmed.length < 3) {
@@ -5844,6 +6230,9 @@ function manifestMatchesOrigin(manifest, prepared) {
5844
6230
  }
5845
6231
  return Boolean(prepared.originalPath && manifest.originalPath && toPosix(manifest.originalPath) === toPosix(prepared.originalPath));
5846
6232
  }
6233
+ function manifestMatchesOriginPart(manifest, prepared) {
6234
+ return manifestMatchesOrigin(manifest, prepared) && (manifest.sourcePartKey ?? "") === (prepared.sourcePartKey ?? "");
6235
+ }
5847
6236
  function buildCompositeHash(payloadBytes, attachments = []) {
5848
6237
  if (!attachments.length) {
5849
6238
  return sha256(payloadBytes);
@@ -5941,7 +6330,7 @@ function extractMarkdownImageReferences(content, baseUrl) {
5941
6330
  async function convertHtmlToMarkdown(html, url) {
5942
6331
  const dom = new JSDOM2(html, { url });
5943
6332
  const article = new Readability(dom.window.document).parse();
5944
- const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
6333
+ const turndown = new TurndownService2({ headingStyle: "atx", codeBlockStyle: "fenced" });
5945
6334
  const body = article?.content ?? dom.window.document.body.innerHTML;
5946
6335
  const markdown = turndown.turndown(body);
5947
6336
  return {
@@ -5965,21 +6354,26 @@ async function readManifestByHash(manifestsDir, contentHash) {
5965
6354
  }
5966
6355
  return null;
5967
6356
  }
5968
- async function readManifestByOrigin(manifestsDir, prepared) {
6357
+ async function readManifestsByOrigin(manifestsDir, prepared) {
5969
6358
  const entries = await fs11.readdir(manifestsDir, { withFileTypes: true }).catch(() => []);
6359
+ const manifests = [];
5970
6360
  for (const entry of entries) {
5971
6361
  if (!entry.isFile() || !entry.name.endsWith(".json")) {
5972
6362
  continue;
5973
6363
  }
5974
6364
  const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
5975
6365
  if (manifest && manifestMatchesOrigin(manifest, prepared)) {
5976
- return {
6366
+ manifests.push({
5977
6367
  ...manifest,
5978
6368
  semanticHash: manifest.semanticHash ?? manifest.contentHash
5979
- };
6369
+ });
5980
6370
  }
5981
6371
  }
5982
- return null;
6372
+ return manifests;
6373
+ }
6374
+ async function readManifestByOrigin(manifestsDir, prepared) {
6375
+ const manifests = await readManifestsByOrigin(manifestsDir, prepared);
6376
+ return manifests.find((manifest) => manifestMatchesOriginPart(manifest, prepared)) ?? null;
5983
6377
  }
5984
6378
  async function loadGitignoreMatcher(repoRoot, enabled) {
5985
6379
  if (!enabled) {
@@ -6228,8 +6622,8 @@ async function persistPreparedInput(rootDir, prepared, paths) {
6228
6622
  const semanticHash = prepared.semanticHash ?? contentHash;
6229
6623
  const extractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
6230
6624
  const existingByOrigin = await readManifestByOrigin(paths.manifestsDir, prepared);
6231
- const existingByHash = existingByOrigin ? null : await readManifestByHash(paths.manifestsDir, contentHash);
6232
- if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath) {
6625
+ const existingByHash = existingByOrigin || prepared.sourcePartKey ? null : await readManifestByHash(paths.manifestsDir, contentHash);
6626
+ if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath && existingByOrigin.sourceGroupId === prepared.sourceGroupId && existingByOrigin.sourceGroupTitle === prepared.sourceGroupTitle && existingByOrigin.sourcePartKey === prepared.sourcePartKey && existingByOrigin.partIndex === prepared.partIndex && existingByOrigin.partCount === prepared.partCount && existingByOrigin.partTitle === prepared.partTitle && JSON.stringify(existingByOrigin.details ?? {}) === JSON.stringify(prepared.details ?? {})) {
6233
6627
  return { manifest: existingByOrigin, isNew: false, wasUpdated: false };
6234
6628
  }
6235
6629
  if (existingByHash) {
@@ -6288,6 +6682,13 @@ async function persistPreparedInput(rootDir, prepared, paths) {
6288
6682
  mimeType: prepared.mimeType,
6289
6683
  contentHash,
6290
6684
  semanticHash,
6685
+ sourceGroupId: prepared.sourceGroupId,
6686
+ sourceGroupTitle: prepared.sourceGroupTitle,
6687
+ sourcePartKey: prepared.sourcePartKey,
6688
+ partIndex: prepared.partIndex,
6689
+ partCount: prepared.partCount,
6690
+ partTitle: prepared.partTitle,
6691
+ details: prepared.details,
6291
6692
  createdAt: previous?.createdAt ?? now,
6292
6693
  updatedAt: now,
6293
6694
  attachments: manifestAttachments.length ? manifestAttachments : void 0
@@ -6309,6 +6710,42 @@ async function persistPreparedInput(rootDir, prepared, paths) {
6309
6710
  }
6310
6711
  return { manifest, isNew: !previous, wasUpdated: Boolean(previous) };
6311
6712
  }
6713
+ async function persistPreparedInputs(rootDir, input, preparedInputs, paths) {
6714
+ const template = preparedInputs[0];
6715
+ const existingByOrigin = template ? await readManifestsByOrigin(paths.manifestsDir, template) : [];
6716
+ const created = [];
6717
+ const updated = [];
6718
+ const unchanged = [];
6719
+ const removed = [];
6720
+ const seenSourceIds = /* @__PURE__ */ new Set();
6721
+ for (const prepared of preparedInputs) {
6722
+ const result = await persistPreparedInput(rootDir, prepared, paths);
6723
+ if (result.isNew) {
6724
+ created.push(result.manifest);
6725
+ } else if (result.wasUpdated) {
6726
+ updated.push(result.manifest);
6727
+ } else {
6728
+ unchanged.push(result.manifest);
6729
+ }
6730
+ seenSourceIds.add(result.manifest.sourceId);
6731
+ }
6732
+ for (const manifest of existingByOrigin) {
6733
+ if (seenSourceIds.has(manifest.sourceId)) {
6734
+ continue;
6735
+ }
6736
+ await removeManifestArtifacts(rootDir, manifest, paths);
6737
+ removed.push(manifest);
6738
+ }
6739
+ return {
6740
+ input,
6741
+ scannedCount: preparedInputs.length,
6742
+ created,
6743
+ updated,
6744
+ unchanged,
6745
+ removed,
6746
+ skipped: []
6747
+ };
6748
+ }
6312
6749
  async function removeManifestArtifacts(rootDir, manifest, paths) {
6313
6750
  await fs11.rm(path12.join(paths.manifestsDir, `${manifest.sourceId}.json`), { force: true });
6314
6751
  await fs11.rm(path12.resolve(rootDir, manifest.storedPath), { force: true });
@@ -6335,10 +6772,10 @@ function repoSyncWorkspaceIgnorePaths(rootDir, paths, repoRoot) {
6335
6772
  return candidates.map((candidate) => path12.resolve(candidate)).filter((candidate, index, items) => items.indexOf(candidate) === index).filter((candidate) => withinRoot(repoRoot, candidate));
6336
6773
  }
6337
6774
  function preparedMatchesManifest(manifest, prepared, contentHash) {
6338
- return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath;
6775
+ return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.semanticHash === (prepared.semanticHash ?? contentHash) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath && manifest.sourceGroupId === prepared.sourceGroupId && manifest.sourceGroupTitle === prepared.sourceGroupTitle && manifest.sourcePartKey === prepared.sourcePartKey && manifest.partIndex === prepared.partIndex && manifest.partCount === prepared.partCount && manifest.partTitle === prepared.partTitle && JSON.stringify(manifest.details ?? {}) === JSON.stringify(prepared.details ?? {});
6339
6776
  }
6340
6777
  function shouldDeferWatchSemanticRefresh(sourceKind) {
6341
- return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "image";
6778
+ return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "epub" || sourceKind === "csv" || sourceKind === "xlsx" || sourceKind === "pptx" || sourceKind === "image";
6342
6779
  }
6343
6780
  function pendingSemanticRefreshId(changeType, repoRoot, relativePath) {
6344
6781
  return `pending:${changeType}:${sha256(`${toPosix(repoRoot)}:${relativePath}`).slice(0, 12)}`;
@@ -6404,13 +6841,16 @@ async function syncTrackedRepos(rootDir, options, repoRoots) {
6404
6841
  const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
6405
6842
  for (const absolutePath of files) {
6406
6843
  const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
6407
- const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
6408
- const result = await persistPreparedInput(rootDir, prepared, paths);
6409
- if (result.isNew) {
6410
- imported.push(result.manifest);
6411
- } else if (result.wasUpdated) {
6412
- updated.push(result.manifest);
6413
- }
6844
+ const preparedInputs = await prepareFileInputs(
6845
+ rootDir,
6846
+ absolutePath,
6847
+ repoRoot,
6848
+ sourceClassForRelativePath(relativePath, normalizedOptions)
6849
+ );
6850
+ const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
6851
+ imported.push(...result.created);
6852
+ updated.push(...result.updated);
6853
+ removed.push(...result.removed);
6414
6854
  progress.tick();
6415
6855
  }
6416
6856
  progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
@@ -6469,9 +6909,6 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
6469
6909
  let scannedCount = 0;
6470
6910
  for (const repoRoot of uniqueRoots) {
6471
6911
  const repoManifests = manifestsByRepoRoot.get(repoRoot) ?? [];
6472
- const manifestsByOriginalPath = new Map(
6473
- repoManifests.filter((manifest) => manifest.originalPath).map((manifest) => [path12.resolve(manifest.originalPath), manifest])
6474
- );
6475
6912
  if (!await fileExists(repoRoot)) {
6476
6913
  for (const manifest of repoManifests) {
6477
6914
  if (shouldDeferWatchSemanticRefresh(manifest.sourceKind)) {
@@ -6507,38 +6944,50 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
6507
6944
  const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
6508
6945
  for (const absolutePath of files) {
6509
6946
  const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
6510
- const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
6511
- if (shouldDeferWatchSemanticRefresh(prepared.sourceKind)) {
6512
- const existing = manifestsByOriginalPath.get(path12.resolve(absolutePath));
6513
- const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
6514
- const changed = !existing || !preparedMatchesManifest(existing, prepared, contentHash);
6947
+ const preparedInputs = await prepareFileInputs(
6948
+ rootDir,
6949
+ absolutePath,
6950
+ repoRoot,
6951
+ sourceClassForRelativePath(relativePath, normalizedOptions)
6952
+ );
6953
+ const firstPrepared = preparedInputs[0];
6954
+ if (firstPrepared && shouldDeferWatchSemanticRefresh(firstPrepared.sourceKind)) {
6955
+ const existing = repoManifests.filter(
6956
+ (manifest) => manifest.originalPath && path12.resolve(manifest.originalPath) === path12.resolve(absolutePath)
6957
+ );
6958
+ const existingByPartKey = new Map(existing.map((manifest) => [manifest.sourcePartKey ?? "__single__", manifest]));
6959
+ const changed = existing.length !== preparedInputs.length || preparedInputs.some((prepared) => {
6960
+ const match = existingByPartKey.get(prepared.sourcePartKey ?? "__single__");
6961
+ const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
6962
+ return !match || !preparedMatchesManifest(match, prepared, contentHash);
6963
+ }) || existing.some(
6964
+ (manifest) => !preparedInputs.some((prepared) => (prepared.sourcePartKey ?? "") === (manifest.sourcePartKey ?? ""))
6965
+ );
6515
6966
  if (changed) {
6516
6967
  pendingSemanticRefresh.push({
6517
6968
  id: pendingSemanticRefreshId(
6518
- existing ? "modified" : "added",
6969
+ existing.length ? "modified" : "added",
6519
6970
  repoRoot,
6520
- prepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
6971
+ firstPrepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
6521
6972
  ),
6522
6973
  repoRoot,
6523
6974
  path: toPosix(path12.relative(rootDir, absolutePath)),
6524
- changeType: existing ? "modified" : "added",
6975
+ changeType: existing.length ? "modified" : "added",
6525
6976
  detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
6526
- sourceId: existing?.sourceId,
6527
- sourceKind: prepared.sourceKind
6977
+ sourceId: existing[0]?.sourceId,
6978
+ sourceKind: firstPrepared.sourceKind
6528
6979
  });
6529
- if (existing?.sourceId) {
6530
- staleSourceIds.add(existing.sourceId);
6980
+ for (const manifest of existing) {
6981
+ staleSourceIds.add(manifest.sourceId);
6531
6982
  }
6532
6983
  }
6533
6984
  progress.tick();
6534
6985
  continue;
6535
6986
  }
6536
- const result = await persistPreparedInput(rootDir, prepared, paths);
6537
- if (result.isNew) {
6538
- imported.push(result.manifest);
6539
- } else if (result.wasUpdated) {
6540
- updated.push(result.manifest);
6541
- }
6987
+ const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
6988
+ imported.push(...result.created);
6989
+ updated.push(...result.updated);
6990
+ removed.push(...result.removed);
6542
6991
  progress.tick();
6543
6992
  }
6544
6993
  progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
@@ -6592,7 +7041,7 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
6592
7041
  staleSourceIds: [...staleSourceIds]
6593
7042
  };
6594
7043
  }
6595
- async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
7044
+ async function prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass) {
6596
7045
  const payloadBytes = await fs11.readFile(absoluteInput);
6597
7046
  const mimeType = guessMimeType(absoluteInput);
6598
7047
  const sourceKind = inferKind(mimeType, absoluteInput);
@@ -6623,6 +7072,94 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
6623
7072
  title = extracted.artifact.metadata?.title?.trim() || title;
6624
7073
  extractedText = extracted.extractedText;
6625
7074
  extractionArtifact = extracted.artifact;
7075
+ } else if (sourceKind === "csv") {
7076
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7077
+ const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7078
+ title = extracted.title?.trim() || title;
7079
+ extractedText = extracted.extractedText;
7080
+ extractionArtifact = extracted.artifact;
7081
+ } else if (sourceKind === "xlsx") {
7082
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7083
+ const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7084
+ title = extracted.title?.trim() || title;
7085
+ extractedText = extracted.extractedText;
7086
+ extractionArtifact = extracted.artifact;
7087
+ } else if (sourceKind === "pptx") {
7088
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7089
+ const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7090
+ title = extracted.title?.trim() || title;
7091
+ extractedText = extracted.extractedText;
7092
+ extractionArtifact = extracted.artifact;
7093
+ } else if (sourceKind === "epub") {
7094
+ title = path12.basename(absoluteInput, path12.extname(absoluteInput));
7095
+ const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
7096
+ title = extracted.title?.trim() || title;
7097
+ const groupId = sourceGroupIdFor({
7098
+ title,
7099
+ originType: "file",
7100
+ originalPath: toPosix(absoluteInput)
7101
+ });
7102
+ if (extracted.chapters.length) {
7103
+ return extracted.chapters.map(
7104
+ (chapter, index) => finalizePreparedInput({
7105
+ title: `${title} - ${chapter.title}`,
7106
+ originType: "file",
7107
+ sourceKind: "epub",
7108
+ sourceClass,
7109
+ originalPath: toPosix(absoluteInput),
7110
+ repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
7111
+ mimeType: "text/markdown",
7112
+ storedExtension: ".md",
7113
+ payloadBytes: Buffer.from(chapter.markdown, "utf8"),
7114
+ extractedText: chapter.markdown,
7115
+ extractionArtifact: {
7116
+ extractor: "epub_text",
7117
+ sourceKind: "epub",
7118
+ mimeType,
7119
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7120
+ metadata: {
7121
+ ...chapter.metadata,
7122
+ chapter_index: String(index + 1),
7123
+ chapter_count: String(extracted.chapters.length)
7124
+ },
7125
+ warnings: extracted.warnings
7126
+ },
7127
+ extractionHash: buildExtractionHash(chapter.markdown, {
7128
+ extractor: "epub_text",
7129
+ sourceKind: "epub",
7130
+ mimeType,
7131
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7132
+ metadata: {
7133
+ ...chapter.metadata,
7134
+ chapter_index: String(index + 1),
7135
+ chapter_count: String(extracted.chapters.length)
7136
+ },
7137
+ warnings: extracted.warnings
7138
+ }),
7139
+ sourceGroupId: groupId,
7140
+ sourceGroupTitle: title,
7141
+ sourcePartKey: chapter.partKey,
7142
+ partIndex: index + 1,
7143
+ partCount: extracted.chapters.length,
7144
+ partTitle: chapter.title,
7145
+ details: {
7146
+ book_title: title,
7147
+ chapter_title: chapter.title,
7148
+ chapter_index: String(index + 1),
7149
+ chapter_count: String(extracted.chapters.length),
7150
+ ...extracted.author ? { author: extracted.author } : {}
7151
+ }
7152
+ })
7153
+ );
7154
+ }
7155
+ extractedText = void 0;
7156
+ extractionArtifact = {
7157
+ extractor: "epub_text",
7158
+ sourceKind: "epub",
7159
+ mimeType,
7160
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7161
+ warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
7162
+ };
6626
7163
  } else if (sourceKind === "image") {
6627
7164
  title = path12.basename(absoluteInput, path12.extname(absoluteInput));
6628
7165
  const extracted = await extractImageWithVision(rootDir, {
@@ -6636,23 +7173,33 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
6636
7173
  } else {
6637
7174
  title = path12.basename(absoluteInput, path12.extname(absoluteInput));
6638
7175
  }
6639
- return finalizePreparedInput({
6640
- title,
6641
- originType: "file",
6642
- sourceKind,
6643
- sourceClass,
6644
- language,
6645
- originalPath: toPosix(absoluteInput),
6646
- repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
6647
- mimeType,
6648
- storedExtension,
6649
- payloadBytes,
6650
- extractedText,
6651
- extractionArtifact,
6652
- extractionHash: buildExtractionHash(extractedText, extractionArtifact)
6653
- });
7176
+ return [
7177
+ finalizePreparedInput({
7178
+ title,
7179
+ originType: "file",
7180
+ sourceKind,
7181
+ sourceClass,
7182
+ language,
7183
+ originalPath: toPosix(absoluteInput),
7184
+ repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
7185
+ mimeType,
7186
+ storedExtension,
7187
+ payloadBytes,
7188
+ extractedText,
7189
+ extractionArtifact,
7190
+ extractionHash: buildExtractionHash(extractedText, extractionArtifact),
7191
+ details: extractionArtifact?.metadata
7192
+ })
7193
+ ];
6654
7194
  }
6655
- async function prepareUrlInput(rootDir, input, options) {
7195
+ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
7196
+ const prepared = await prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass);
7197
+ if (!prepared.length) {
7198
+ throw new Error(`No ingestable sources were extracted from ${absoluteInput}.`);
7199
+ }
7200
+ return prepared[0];
7201
+ }
7202
+ async function prepareUrlInputs(rootDir, input, options) {
6656
7203
  await validateUrlSafety(input);
6657
7204
  const response = await fetch(input);
6658
7205
  if (!response.ok) {
@@ -6747,6 +7294,88 @@ async function prepareUrlInput(rootDir, input, options) {
6747
7294
  title = extracted.artifact.metadata?.title?.trim() || title;
6748
7295
  extractedText = extracted.extractedText;
6749
7296
  extractionArtifact = extracted.artifact;
7297
+ } else if (sourceKind === "csv") {
7298
+ const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7299
+ title = extracted.title?.trim() || title;
7300
+ extractedText = extracted.extractedText;
7301
+ extractionArtifact = extracted.artifact;
7302
+ } else if (sourceKind === "xlsx") {
7303
+ const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7304
+ title = extracted.title?.trim() || title;
7305
+ extractedText = extracted.extractedText;
7306
+ extractionArtifact = extracted.artifact;
7307
+ } else if (sourceKind === "pptx") {
7308
+ const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7309
+ title = extracted.title?.trim() || title;
7310
+ extractedText = extracted.extractedText;
7311
+ extractionArtifact = extracted.artifact;
7312
+ } else if (sourceKind === "epub") {
7313
+ const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
7314
+ title = extracted.title?.trim() || title;
7315
+ const groupId = sourceGroupIdFor({
7316
+ title,
7317
+ originType: "url",
7318
+ url: finalUrl
7319
+ });
7320
+ if (extracted.chapters.length) {
7321
+ return extracted.chapters.map(
7322
+ (chapter, index) => finalizePreparedInput({
7323
+ title: `${title} - ${chapter.title}`,
7324
+ originType: "url",
7325
+ sourceKind: "epub",
7326
+ url: finalUrl,
7327
+ mimeType: "text/markdown",
7328
+ storedExtension: ".md",
7329
+ payloadBytes: Buffer.from(chapter.markdown, "utf8"),
7330
+ extractedText: chapter.markdown,
7331
+ extractionArtifact: {
7332
+ extractor: "epub_text",
7333
+ sourceKind: "epub",
7334
+ mimeType,
7335
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7336
+ metadata: {
7337
+ ...chapter.metadata,
7338
+ chapter_index: String(index + 1),
7339
+ chapter_count: String(extracted.chapters.length)
7340
+ },
7341
+ warnings: extracted.warnings
7342
+ },
7343
+ extractionHash: buildExtractionHash(chapter.markdown, {
7344
+ extractor: "epub_text",
7345
+ sourceKind: "epub",
7346
+ mimeType,
7347
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7348
+ metadata: {
7349
+ ...chapter.metadata,
7350
+ chapter_index: String(index + 1),
7351
+ chapter_count: String(extracted.chapters.length)
7352
+ },
7353
+ warnings: extracted.warnings
7354
+ }),
7355
+ sourceGroupId: groupId,
7356
+ sourceGroupTitle: title,
7357
+ sourcePartKey: chapter.partKey,
7358
+ partIndex: index + 1,
7359
+ partCount: extracted.chapters.length,
7360
+ partTitle: chapter.title,
7361
+ details: {
7362
+ book_title: title,
7363
+ chapter_title: chapter.title,
7364
+ chapter_index: String(index + 1),
7365
+ chapter_count: String(extracted.chapters.length),
7366
+ ...extracted.author ? { author: extracted.author } : {}
7367
+ },
7368
+ logDetails
7369
+ })
7370
+ );
7371
+ }
7372
+ extractionArtifact = {
7373
+ extractor: "epub_text",
7374
+ sourceKind: "epub",
7375
+ mimeType,
7376
+ producedAt: (/* @__PURE__ */ new Date()).toISOString(),
7377
+ warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
7378
+ };
6750
7379
  } else if (sourceKind === "image") {
6751
7380
  const extracted = await extractImageWithVision(rootDir, {
6752
7381
  title,
@@ -6758,22 +7387,32 @@ async function prepareUrlInput(rootDir, input, options) {
6758
7387
  extractionArtifact = extracted.artifact;
6759
7388
  }
6760
7389
  }
6761
- return finalizePreparedInput({
6762
- title,
6763
- originType: "url",
6764
- sourceKind,
6765
- language,
6766
- url: finalUrl,
6767
- mimeType,
6768
- storedExtension,
6769
- payloadBytes,
6770
- extractedText,
6771
- extractionArtifact,
6772
- extractionHash: buildExtractionHash(extractedText, extractionArtifact),
6773
- attachments,
6774
- contentHash,
6775
- logDetails
6776
- });
7390
+ return [
7391
+ finalizePreparedInput({
7392
+ title,
7393
+ originType: "url",
7394
+ sourceKind,
7395
+ language,
7396
+ url: finalUrl,
7397
+ mimeType,
7398
+ storedExtension,
7399
+ payloadBytes,
7400
+ extractedText,
7401
+ extractionArtifact,
7402
+ extractionHash: buildExtractionHash(extractedText, extractionArtifact),
7403
+ attachments,
7404
+ contentHash,
7405
+ details: extractionArtifact?.metadata,
7406
+ logDetails
7407
+ })
7408
+ ];
7409
+ }
7410
+ async function prepareUrlInput(rootDir, input, options) {
7411
+ const prepared = await prepareUrlInputs(rootDir, input, options);
7412
+ if (!prepared.length) {
7413
+ throw new Error(`No ingestable sources were extracted from ${input}.`);
7414
+ }
7415
+ return prepared[0];
6777
7416
  }
6778
7417
  async function collectInboxAttachmentRefs(inputDir, files) {
6779
7418
  const refsBySource = /* @__PURE__ */ new Map();
@@ -6905,18 +7544,23 @@ async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
6905
7544
  };
6906
7545
  }
6907
7546
  function isSupportedInboxKind(sourceKind) {
6908
- return ["markdown", "text", "html", "pdf", "docx", "image"].includes(sourceKind);
7547
+ return ["markdown", "text", "html", "pdf", "docx", "epub", "csv", "xlsx", "pptx", "image"].includes(sourceKind);
6909
7548
  }
6910
7549
  async function ingestInputDetailed(rootDir, input, options) {
6911
7550
  const { paths } = await initWorkspace(rootDir);
6912
7551
  const normalizedOptions = normalizeIngestOptions(options);
6913
7552
  const absoluteInput = path12.resolve(rootDir, input);
6914
7553
  const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await findNearestGitRoot2(absoluteInput).then((value) => value ?? path12.dirname(absoluteInput));
6915
- const prepared = isHttpUrl(input) ? await prepareUrlInput(rootDir, input, normalizedOptions) : await prepareFileInput(rootDir, absoluteInput, repoRoot);
6916
- return await persistPreparedInput(rootDir, prepared, paths);
7554
+ const prepared = isHttpUrl(input) ? await prepareUrlInputs(rootDir, input, normalizedOptions) : await prepareFileInputs(rootDir, absoluteInput, repoRoot);
7555
+ return await persistPreparedInputs(rootDir, input, prepared, paths);
6917
7556
  }
6918
7557
  async function ingestInput(rootDir, input, options) {
6919
- return (await ingestInputDetailed(rootDir, input, options)).manifest;
7558
+ const result = await ingestInputDetailed(rootDir, input, options);
7559
+ const manifest = [...result.created, ...result.updated, ...result.unchanged][0];
7560
+ if (!manifest) {
7561
+ throw new Error(`No source manifests were created or updated for ${input}.`);
7562
+ }
7563
+ return manifest;
6920
7564
  }
6921
7565
  async function addInput(rootDir, input, options = {}) {
6922
7566
  const { paths } = await initWorkspace(rootDir);
@@ -7014,13 +7658,20 @@ async function ingestDirectory(rootDir, inputDir, options) {
7014
7658
  const progress = createProgressReporter("ingest", files.length);
7015
7659
  for (const absolutePath of files) {
7016
7660
  const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
7017
- const prepared = await prepareFileInput(rootDir, absolutePath, repoRoot, sourceClassForRelativePath(relativePath, normalizedOptions));
7018
- const result = await persistPreparedInput(rootDir, prepared, paths);
7019
- if (result.isNew) {
7020
- imported.push(result.manifest);
7021
- } else if (result.wasUpdated) {
7022
- updated.push(result.manifest);
7023
- } else {
7661
+ const preparedInputs = await prepareFileInputs(
7662
+ rootDir,
7663
+ absolutePath,
7664
+ repoRoot,
7665
+ sourceClassForRelativePath(relativePath, normalizedOptions)
7666
+ );
7667
+ const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
7668
+ if (result.created.length) {
7669
+ imported.push(...result.created);
7670
+ }
7671
+ if (result.updated.length) {
7672
+ updated.push(...result.updated);
7673
+ }
7674
+ if (!result.created.length && !result.updated.length && !result.removed.length) {
7024
7675
  skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
7025
7676
  }
7026
7677
  progress.tick();
@@ -7071,13 +7722,13 @@ async function importInbox(rootDir, inputDir) {
7071
7722
  continue;
7072
7723
  }
7073
7724
  const prepared = sourceKind === "markdown" && refsBySource.has(absolutePath) ? await prepareInboxMarkdownInput(absolutePath, refsBySource.get(absolutePath) ?? []) : sourceKind === "html" && refsBySource.has(absolutePath) ? await prepareInboxHtmlInput(absolutePath, refsBySource.get(absolutePath) ?? []) : await prepareFileInput(rootDir, absolutePath);
7074
- const result = await persistPreparedInput(rootDir, prepared, paths);
7075
- if (!result.isNew) {
7725
+ const result = await persistPreparedInputs(rootDir, absolutePath, [prepared], paths);
7726
+ if (!result.created.length) {
7076
7727
  skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
7077
7728
  continue;
7078
7729
  }
7079
- attachmentCount += result.manifest.attachments?.length ?? 0;
7080
- imported.push(result.manifest);
7730
+ attachmentCount += result.created.reduce((total, manifest) => total + (manifest.attachments?.length ?? 0), 0);
7731
+ imported.push(...result.created);
7081
7732
  }
7082
7733
  await appendLogEntry(rootDir, "inbox_import", toPosix(path12.relative(rootDir, effectiveInputDir)) || ".", [
7083
7734
  `scanned=${files.length}`,
@@ -9336,9 +9987,19 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
9336
9987
  `# ${analysis.title}`,
9337
9988
  "",
9338
9989
  `Source ID: \`${manifest.sourceId}\``,
9990
+ `Source Kind: \`${manifest.sourceKind}\``,
9339
9991
  manifest.url ? `Source URL: ${manifest.url}` : `Source Path: \`${manifest.originalPath ?? manifest.storedPath}\``,
9340
9992
  ...manifest.sourceType ? [`Source Type: \`${manifest.sourceType}\``, ""] : [""],
9341
9993
  ...manifest.sourceClass ? [`Source Class: \`${manifest.sourceClass}\``, ""] : [],
9994
+ ...manifest.sourceGroupTitle ? [`Source Group: ${manifest.sourceGroupTitle}`] : [],
9995
+ ...manifest.partTitle ? [`Part: ${manifest.partIndex ?? "?"}/${manifest.partCount ?? "?"} - ${manifest.partTitle}`] : [],
9996
+ ...manifest.details && Object.keys(manifest.details).length ? [
9997
+ "",
9998
+ "## Source Details",
9999
+ "",
10000
+ ...Object.entries(manifest.details).map(([key, value]) => `- ${key.replace(/_/g, " ")}: ${value}`),
10001
+ ""
10002
+ ] : [],
9342
10003
  "",
9343
10004
  "## Summary",
9344
10005
  "",
@@ -14987,7 +15648,7 @@ async function bootstrapDemo(rootDir, input) {
14987
15648
  }
14988
15649
 
14989
15650
  // src/mcp.ts
14990
- var SERVER_VERSION = "0.2.2";
15651
+ var SERVER_VERSION = "0.3.0";
14991
15652
  async function createMcpServer(rootDir) {
14992
15653
  const server = new McpServer({
14993
15654
  name: "swarmvault",
@@ -15165,8 +15826,8 @@ async function createMcpServer(rootDir) {
15165
15826
  }
15166
15827
  },
15167
15828
  async ({ input }) => {
15168
- const manifest = await ingestInput(rootDir, input);
15169
- return asToolText(manifest);
15829
+ const result = await ingestInputDetailed(rootDir, input);
15830
+ return asToolText(result);
15170
15831
  }
15171
15832
  );
15172
15833
  server.registerTool(
@@ -15970,12 +16631,11 @@ async function syncCrawlSource(rootDir, entry, options) {
15970
16631
  let updatedCount = 0;
15971
16632
  for (const pageUrl of crawl.pages) {
15972
16633
  const persisted = await ingestInputDetailed(rootDir, pageUrl);
15973
- currentSourceIds.push(persisted.manifest.sourceId);
15974
- if (persisted.isNew) {
15975
- importedCount += 1;
15976
- } else if (persisted.wasUpdated) {
15977
- updatedCount += 1;
15978
- }
16634
+ currentSourceIds.push(...persisted.created.map((manifest) => manifest.sourceId));
16635
+ currentSourceIds.push(...persisted.updated.map((manifest) => manifest.sourceId));
16636
+ currentSourceIds.push(...persisted.unchanged.map((manifest) => manifest.sourceId));
16637
+ importedCount += persisted.created.length;
16638
+ updatedCount += persisted.updated.length;
15979
16639
  }
15980
16640
  let removedCount = 0;
15981
16641
  for (const sourceId of previousSourceIds) {
@@ -17237,6 +17897,7 @@ export {
17237
17897
  importInbox,
17238
17898
  ingestDirectory,
17239
17899
  ingestInput,
17900
+ ingestInputDetailed,
17240
17901
  initVault,
17241
17902
  initWorkspace,
17242
17903
  installAgent,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmvaultai/engine",
3
- "version": "0.2.2",
3
+ "version": "0.3.0",
4
4
  "description": "Core engine for SwarmVault: ingest, compile, query, lint, and provider abstractions.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -44,6 +44,7 @@
44
44
  "@mozilla/readability": "^0.6.0",
45
45
  "@vscode/tree-sitter-wasm": "^0.3.1",
46
46
  "chokidar": "^4.0.3",
47
+ "csv-parse": "^6.2.1",
47
48
  "fflate": "^0.8.2",
48
49
  "gray-matter": "^4.0.3",
49
50
  "ignore": "^7.0.5",
@@ -55,6 +56,7 @@
55
56
  "tree-sitter-wasms": "^0.1.13",
56
57
  "turndown": "^7.2.1",
57
58
  "typescript": "^5.9.3",
59
+ "xlsx": "^0.18.5",
58
60
  "yaml": "^2.8.1",
59
61
  "zod": "^4.1.8"
60
62
  },