@swarmvaultai/engine 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/index.d.ts +20 -3
- package/dist/index.js +761 -100
- package/package.json +3 -1
package/README.md
CHANGED
|
@@ -186,17 +186,21 @@ This matters because many "OpenAI-compatible" backends only implement part of th
|
|
|
186
186
|
- `reloadManagedSources(rootDir, { id, all, compile, brief, maxPages, maxDepth })` re-syncs one managed source or the full registry
|
|
187
187
|
- `deleteManagedSource(rootDir, id)` removes a managed-source registry entry and transient sync state without deleting canonical vault artifacts
|
|
188
188
|
- `ingestInput(rootDir, input, { includeAssets, maxAssetSize })` ingests a local file path or URL
|
|
189
|
+
- `ingestInputDetailed(rootDir, input, { includeAssets, maxAssetSize })` returns a summary envelope with `created`, `updated`, `unchanged`, and `removed` manifests when one input expands into multiple sources
|
|
189
190
|
- `addInput(rootDir, input, { author, contributor })` captures supported URLs into normalized markdown before ingesting them, or falls back to generic URL ingest
|
|
190
191
|
- `ingestDirectory(rootDir, inputDir, { repoRoot, include, exclude, maxFiles, gitignore, extractClasses })` recursively ingests a local directory as a repo-aware code/content source tree
|
|
191
192
|
- `importInbox(rootDir, inputDir?)` recursively imports supported inbox files plus markdown and HTML browser-clipper style bundles
|
|
192
193
|
- managed sources support local directories, public GitHub repo root URLs, and bounded same-domain docs hubs
|
|
193
194
|
- registry data lives in `state/sources.json`, working state lives under `state/sources/<id>/`, and source briefs are written to `wiki/outputs/source-briefs/<id>.md`
|
|
195
|
+
- EPUB inputs split into chapter-level manifests with shared group metadata so books stay navigable instead of becoming one giant source
|
|
196
|
+
- CSV and TSV inputs produce bounded tabular summaries with delimiter-aware previews and compact column hints
|
|
197
|
+
- XLSX inputs extract workbook-level and sheet-level previews, while PPTX inputs extract slide text plus speaker notes when present
|
|
194
198
|
- JavaScript, JSX, TypeScript, TSX, Python, Go, Rust, Java, Kotlin, Scala, Lua, Zig, C#, C, C++, PHP, Ruby, and PowerShell inputs are treated as code sources and compiled into both source pages and `wiki/code/` module pages
|
|
195
199
|
- `.rst` and `.rest` inputs are treated as first-class text sources with lightweight heading and directive normalization before analysis
|
|
196
200
|
- code manifests can carry `repoRelativePath`, and compile writes `state/code-index.json` so local imports can resolve across an ingested repo tree
|
|
197
201
|
- repo-aware manifests, graph nodes, and graph pages can also carry `sourceClass` so first-party, third-party, resource, and generated material can be filtered and reported separately
|
|
198
202
|
- HTML and markdown URL ingests localize remote image references into `raw/assets/<sourceId>/` by default and rewrite the stored markdown to local relative paths
|
|
199
|
-
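- PDF and DOCX ingests write extracted-text and metadata sidecars under `state/extracts/`, and image ingest keeps the same sidecar model for vision extraction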
|
|
203
|
+
- PDF, DOCX, EPUB, CSV/TSV, XLSX, and PPTX ingests write extracted-text and metadata sidecars under `state/extracts/`, and image ingest keeps the same sidecar model for vision extraction
|
|
200
204
|
- Tree-sitter-backed languages now verify runtime and grammar compatibility per language; failures stay local to the affected source and surface as diagnostics instead of aborting the whole compile
|
|
201
205
|
|
|
202
206
|
### Compile + Query
|
package/dist/index.d.ts
CHANGED
|
@@ -54,7 +54,7 @@ type PageStatus = "draft" | "candidate" | "active" | "archived";
|
|
|
54
54
|
type PageManager = "system" | "human";
|
|
55
55
|
type ApprovalEntryStatus = "pending" | "accepted" | "rejected";
|
|
56
56
|
type ApprovalChangeType = "create" | "update" | "delete" | "promote";
|
|
57
|
-
type SourceKind = "markdown" | "text" | "pdf" | "image" | "html" | "docx" | "binary" | "code";
|
|
57
|
+
type SourceKind = "markdown" | "text" | "pdf" | "image" | "html" | "docx" | "epub" | "csv" | "xlsx" | "pptx" | "binary" | "code";
|
|
58
58
|
type SourceCaptureType = "arxiv" | "doi" | "tweet" | "article" | "url";
|
|
59
59
|
type SourceClass = "first_party" | "third_party" | "resource" | "generated";
|
|
60
60
|
type ManagedSourceKind = "directory" | "github_repo" | "crawl_url";
|
|
@@ -231,7 +231,7 @@ interface SourceAttachment {
|
|
|
231
231
|
mimeType: string;
|
|
232
232
|
originalPath?: string;
|
|
233
233
|
}
|
|
234
|
-
type ExtractionKind = "plain_text" | "html_readability" | "pdf_text" | "docx_text" | "image_vision";
|
|
234
|
+
type ExtractionKind = "plain_text" | "html_readability" | "pdf_text" | "docx_text" | "epub_text" | "csv_text" | "xlsx_text" | "pptx_text" | "image_vision";
|
|
235
235
|
interface ExtractionTerm {
|
|
236
236
|
name: string;
|
|
237
237
|
description: string;
|
|
@@ -284,6 +284,15 @@ interface DirectoryIngestResult {
|
|
|
284
284
|
updated: SourceManifest[];
|
|
285
285
|
skipped: DirectoryIngestSkip[];
|
|
286
286
|
}
|
|
287
|
+
interface InputIngestResult {
|
|
288
|
+
input: string;
|
|
289
|
+
scannedCount: number;
|
|
290
|
+
created: SourceManifest[];
|
|
291
|
+
updated: SourceManifest[];
|
|
292
|
+
unchanged: SourceManifest[];
|
|
293
|
+
removed: SourceManifest[];
|
|
294
|
+
skipped: DirectoryIngestSkip[];
|
|
295
|
+
}
|
|
287
296
|
interface SourceManifest {
|
|
288
297
|
sourceId: string;
|
|
289
298
|
title: string;
|
|
@@ -302,6 +311,13 @@ interface SourceManifest {
|
|
|
302
311
|
mimeType: string;
|
|
303
312
|
contentHash: string;
|
|
304
313
|
semanticHash: string;
|
|
314
|
+
sourceGroupId?: string;
|
|
315
|
+
sourceGroupTitle?: string;
|
|
316
|
+
sourcePartKey?: string;
|
|
317
|
+
partIndex?: number;
|
|
318
|
+
partCount?: number;
|
|
319
|
+
partTitle?: string;
|
|
320
|
+
details?: Record<string, string>;
|
|
305
321
|
createdAt: string;
|
|
306
322
|
updatedAt: string;
|
|
307
323
|
attachments?: SourceAttachment[];
|
|
@@ -1206,6 +1222,7 @@ declare function uninstallGitHooks(rootDir: string): Promise<GitHookStatus>;
|
|
|
1206
1222
|
declare function listTrackedRepoRoots(rootDir: string): Promise<string[]>;
|
|
1207
1223
|
declare function syncTrackedRepos(rootDir: string, options?: IngestOptions, repoRoots?: string[]): Promise<RepoSyncResult>;
|
|
1208
1224
|
declare function syncTrackedReposForWatch(rootDir: string, options?: IngestOptions, repoRoots?: string[]): Promise<WatchRepoSyncResult>;
|
|
1225
|
+
declare function ingestInputDetailed(rootDir: string, input: string, options?: IngestOptions): Promise<InputIngestResult>;
|
|
1209
1226
|
declare function ingestInput(rootDir: string, input: string, options?: IngestOptions): Promise<SourceManifest>;
|
|
1210
1227
|
declare function addInput(rootDir: string, input: string, options?: AddOptions): Promise<AddResult>;
|
|
1211
1228
|
declare function ingestDirectory(rootDir: string, inputDir: string, options?: IngestOptions): Promise<DirectoryIngestResult>;
|
|
@@ -1327,4 +1344,4 @@ declare function getWatchStatus(rootDir: string): Promise<WatchStatusResult>;
|
|
|
1327
1344
|
declare function createWebSearchAdapter(id: string, config: WebSearchProviderConfig, rootDir: string): Promise<WebSearchAdapter>;
|
|
1328
1345
|
declare function getWebSearchAdapterForTask(rootDir: string, task: "deepLintProvider"): Promise<WebSearchAdapter>;
|
|
1329
1346
|
|
|
1330
|
-
export { type AddOptions, type AddResult, type AgentType, type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type BenchmarkArtifact, type BenchmarkOptions, type BenchmarkQuestionResult, type BenchmarkSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeIndexArtifact, type CodeIndexEntry, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type DirectoryIngestResult, type DirectoryIngestSkip, type EmbeddingCacheArtifact, type EmbeddingCacheEntry, type EvidenceClass, type ExploreOptions, type ExploreResult, type ExploreStepResult, type ExtractionClaim, type ExtractionKind, type ExtractionTerm, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GitHookStatus, type GraphArtifact, type GraphEdge, type GraphExplainNeighbor, type GraphExplainResult, type GraphExportFormat, type GraphExportResult, type GraphHyperedge, type GraphNode, type GraphPage, type GraphPathResult, type GraphPushCounts, type GraphPushNeo4jOptions, type GraphPushResult, type GraphQueryMatch, type GraphQueryResult, type GraphReportArtifact, type ImageGenerationRequest, type ImageGenerationResponse, type ImageVisionExtraction, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type InstallAgentOptions, type InstallAgentResult, type LintFinding, type LintOptions, type ManagedSourceAddOptions, type ManagedSourceAddResult, type ManagedSourceDeleteResult, type ManagedSourceKind, type ManagedSourceRecord, type ManagedSourceReloadOptions, type ManagedSourceReloadResult, type ManagedSourceStatus, type ManagedSourceSyncCounts, type ManagedSourcesArtifact, type Neo4jGraphSinkConfig, type OrchestrationConfig, type 
OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type PendingSemanticRefreshEntry, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type RepoSyncResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceCaptureType, type SourceClaim, type SourceClass, type SourceExtractionArtifact, type SourceKind, type SourceManifest, type SourceRationale, type VaultConfig, type WatchController, type WatchOptions, type WatchRepoSyncResult, type WatchRunRecord, type WatchStatusResult, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, addInput, addManagedSource, agentTypeSchema, archiveCandidate, assertProviderCapability, benchmarkVault, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, deleteManagedSource, explainGraphVault, exploreVault, exportGraphFormat, exportGraphHtml, getGitHookStatus, getProviderForTask, getWatchStatus, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestDirectory, ingestInput, initVault, initWorkspace, installAgent, installConfiguredAgents, installGitHooks, lintVault, listApprovals, listCandidates, listGodNodes, listGraphHyperedges, listManagedSourceRecords, listManifests, listPages, 
listSchedules, listTrackedRepoRoots, loadVaultConfig, loadVaultSchema, loadVaultSchemas, pathGraphVault, promoteCandidate, providerCapabilitySchema, providerTypeSchema, pushGraphNeo4j, queryGraphVault, queryVault, readApproval, readExtractedText, readGraphReport, readPage, rejectApproval, reloadManagedSources, resolvePaths, runSchedule, runWatchCycle, searchVault, serveSchedules, startGraphServer, startMcpServer, syncTrackedRepos, syncTrackedReposForWatch, uninstallGitHooks, watchVault, webSearchProviderTypeSchema };
|
|
1347
|
+
export { type AddOptions, type AddResult, type AgentType, type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type BenchmarkArtifact, type BenchmarkOptions, type BenchmarkQuestionResult, type BenchmarkSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeIndexArtifact, type CodeIndexEntry, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type DirectoryIngestResult, type DirectoryIngestSkip, type EmbeddingCacheArtifact, type EmbeddingCacheEntry, type EvidenceClass, type ExploreOptions, type ExploreResult, type ExploreStepResult, type ExtractionClaim, type ExtractionKind, type ExtractionTerm, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GitHookStatus, type GraphArtifact, type GraphEdge, type GraphExplainNeighbor, type GraphExplainResult, type GraphExportFormat, type GraphExportResult, type GraphHyperedge, type GraphNode, type GraphPage, type GraphPathResult, type GraphPushCounts, type GraphPushNeo4jOptions, type GraphPushResult, type GraphQueryMatch, type GraphQueryResult, type GraphReportArtifact, type ImageGenerationRequest, type ImageGenerationResponse, type ImageVisionExtraction, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type InputIngestResult, type InstallAgentOptions, type InstallAgentResult, type LintFinding, type LintOptions, type ManagedSourceAddOptions, type ManagedSourceAddResult, type ManagedSourceDeleteResult, type ManagedSourceKind, type ManagedSourceRecord, type ManagedSourceReloadOptions, type ManagedSourceReloadResult, type ManagedSourceStatus, type ManagedSourceSyncCounts, type ManagedSourcesArtifact, type Neo4jGraphSinkConfig, type 
OrchestrationConfig, type OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type PendingSemanticRefreshEntry, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type RepoSyncResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceCaptureType, type SourceClaim, type SourceClass, type SourceExtractionArtifact, type SourceKind, type SourceManifest, type SourceRationale, type VaultConfig, type WatchController, type WatchOptions, type WatchRepoSyncResult, type WatchRunRecord, type WatchStatusResult, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, addInput, addManagedSource, agentTypeSchema, archiveCandidate, assertProviderCapability, benchmarkVault, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, deleteManagedSource, explainGraphVault, exploreVault, exportGraphFormat, exportGraphHtml, getGitHookStatus, getProviderForTask, getWatchStatus, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestDirectory, ingestInput, ingestInputDetailed, initVault, initWorkspace, installAgent, installConfiguredAgents, installGitHooks, lintVault, listApprovals, listCandidates, listGodNodes, listGraphHyperedges, 
listManagedSourceRecords, listManifests, listPages, listSchedules, listTrackedRepoRoots, loadVaultConfig, loadVaultSchema, loadVaultSchemas, pathGraphVault, promoteCandidate, providerCapabilitySchema, providerTypeSchema, pushGraphNeo4j, queryGraphVault, queryVault, readApproval, readExtractedText, readGraphReport, readPage, rejectApproval, reloadManagedSources, resolvePaths, runSchedule, runWatchCycle, searchVault, serveSchedules, startGraphServer, startMcpServer, syncTrackedRepos, syncTrackedReposForWatch, uninstallGitHooks, watchVault, webSearchProviderTypeSchema };
|
package/dist/index.js
CHANGED
|
@@ -1729,7 +1729,7 @@ import matter3 from "gray-matter";
|
|
|
1729
1729
|
import ignore from "ignore";
|
|
1730
1730
|
import { JSDOM as JSDOM2 } from "jsdom";
|
|
1731
1731
|
import mime from "mime-types";
|
|
1732
|
-
import TurndownService from "turndown";
|
|
1732
|
+
import TurndownService2 from "turndown";
|
|
1733
1733
|
|
|
1734
1734
|
// src/code-analysis.ts
|
|
1735
1735
|
import fs6 from "fs/promises";
|
|
@@ -4504,8 +4504,10 @@ async function analyzeCodeSource(manifest, extractedText, schemaHash) {
|
|
|
4504
4504
|
import fs7 from "fs/promises";
|
|
4505
4505
|
import os from "os";
|
|
4506
4506
|
import path7 from "path";
|
|
4507
|
+
import { parse as parseCsvSync } from "csv-parse/sync";
|
|
4507
4508
|
import { strFromU8, unzipSync } from "fflate";
|
|
4508
4509
|
import { JSDOM } from "jsdom";
|
|
4510
|
+
import TurndownService from "turndown";
|
|
4509
4511
|
import { z } from "zod";
|
|
4510
4512
|
var imageVisionExtractionSchema = z.object({
|
|
4511
4513
|
title: z.string().min(1).nullable().optional(),
|
|
@@ -4685,7 +4687,7 @@ function normalizePdfMetadata(raw) {
|
|
|
4685
4687
|
function normalizeDocumentText(raw) {
|
|
4686
4688
|
return raw.replace(/\r\n/g, "\n").split(/\n{2,}/).map((section) => normalizeWhitespace(section)).filter(Boolean).join("\n\n").trim();
|
|
4687
4689
|
}
|
|
4688
|
-
function parseDocxCoreMetadata(bytes) {
|
|
4690
|
+
function parseOfficeCoreMetadata(bytes) {
|
|
4689
4691
|
try {
|
|
4690
4692
|
const archive = unzipSync(new Uint8Array(bytes));
|
|
4691
4693
|
const coreXml = archive["docProps/core.xml"];
|
|
@@ -4725,6 +4727,122 @@ function parseDocxCoreMetadata(bytes) {
|
|
|
4725
4727
|
return void 0;
|
|
4726
4728
|
}
|
|
4727
4729
|
}
|
|
4730
|
+
function decodeTextBytes(bytes) {
|
|
4731
|
+
const text = bytes.toString("utf8");
|
|
4732
|
+
return text.charCodeAt(0) === 65279 ? text.slice(1) : text;
|
|
4733
|
+
}
|
|
4734
|
+
function normalizeTableCell(value) {
|
|
4735
|
+
return normalizeWhitespace(String(value ?? ""));
|
|
4736
|
+
}
|
|
4737
|
+
function isNumericCell(value) {
|
|
4738
|
+
return value.length > 0 && Number.isFinite(Number(value));
|
|
4739
|
+
}
|
|
4740
|
+
function detectHeaderRow(rows) {
|
|
4741
|
+
if (!rows.length) {
|
|
4742
|
+
return { headers: [], bodyRows: [] };
|
|
4743
|
+
}
|
|
4744
|
+
const firstRow = rows[0] ?? [];
|
|
4745
|
+
const nonEmpty = firstRow.filter(Boolean);
|
|
4746
|
+
const unique = new Set(nonEmpty);
|
|
4747
|
+
const nonNumeric = nonEmpty.filter((value) => !isNumericCell(value));
|
|
4748
|
+
const looksLikeHeader = nonEmpty.length > 0 && unique.size === nonEmpty.length && nonNumeric.length >= Math.ceil(nonEmpty.length / 2) && rows.length > 1;
|
|
4749
|
+
if (looksLikeHeader) {
|
|
4750
|
+
return {
|
|
4751
|
+
headers: firstRow.map((value, index) => value || `column_${index + 1}`),
|
|
4752
|
+
bodyRows: rows.slice(1)
|
|
4753
|
+
};
|
|
4754
|
+
}
|
|
4755
|
+
const columnCount = Math.max(...rows.map((row) => row.length), 0);
|
|
4756
|
+
return {
|
|
4757
|
+
headers: Array.from({ length: columnCount }, (_, index) => `column_${index + 1}`),
|
|
4758
|
+
bodyRows: rows
|
|
4759
|
+
};
|
|
4760
|
+
}
|
|
4761
|
+
function columnHints(headers, rows) {
|
|
4762
|
+
return headers.map((header, index) => {
|
|
4763
|
+
const values = rows.map((row) => row[index] ?? "").map(normalizeTableCell).filter(Boolean);
|
|
4764
|
+
if (!values.length) {
|
|
4765
|
+
return null;
|
|
4766
|
+
}
|
|
4767
|
+
const uniqueValues = [...new Set(values)];
|
|
4768
|
+
if (values.every(isNumericCell)) {
|
|
4769
|
+
return `- ${header}: numeric`;
|
|
4770
|
+
}
|
|
4771
|
+
if (uniqueValues.length <= 6 && values.length >= uniqueValues.length) {
|
|
4772
|
+
return `- ${header}: low-cardinality (${uniqueValues.slice(0, 6).join(", ")})`;
|
|
4773
|
+
}
|
|
4774
|
+
return null;
|
|
4775
|
+
}).filter((item) => Boolean(item));
|
|
4776
|
+
}
|
|
4777
|
+
function markdownTable(headers, rows, rowLimit = 20) {
|
|
4778
|
+
if (!headers.length) {
|
|
4779
|
+
return ["No tabular preview available."];
|
|
4780
|
+
}
|
|
4781
|
+
const width = headers.length;
|
|
4782
|
+
const lines = [`| ${headers.join(" | ")} |`, `| ${headers.map(() => "---").join(" | ")} |`];
|
|
4783
|
+
for (const row of rows.slice(0, rowLimit)) {
|
|
4784
|
+
const normalized = Array.from({ length: width }, (_, index) => normalizeTableCell(row[index] ?? ""));
|
|
4785
|
+
lines.push(`| ${normalized.join(" | ")} |`);
|
|
4786
|
+
}
|
|
4787
|
+
return lines;
|
|
4788
|
+
}
|
|
4789
|
+
function zipEntryText(archive, entryPath) {
|
|
4790
|
+
const entry = archive[entryPath];
|
|
4791
|
+
return entry ? strFromU8(entry) : void 0;
|
|
4792
|
+
}
|
|
4793
|
+
function parseXmlDocument(xml) {
|
|
4794
|
+
return new JSDOM(xml, { contentType: "text/xml" }).window.document;
|
|
4795
|
+
}
|
|
4796
|
+
function zipDirname(value) {
|
|
4797
|
+
const index = value.lastIndexOf("/");
|
|
4798
|
+
return index === -1 ? "" : value.slice(0, index);
|
|
4799
|
+
}
|
|
4800
|
+
function resolveZipTarget(basePath, target) {
|
|
4801
|
+
return path7.posix.normalize(path7.posix.join(zipDirname(basePath), target));
|
|
4802
|
+
}
|
|
4803
|
+
function relationshipTargets(xml, basePath) {
|
|
4804
|
+
const document = parseXmlDocument(xml);
|
|
4805
|
+
const map = /* @__PURE__ */ new Map();
|
|
4806
|
+
for (const node of Array.from(document.getElementsByTagName("*"))) {
|
|
4807
|
+
if (node.localName !== "Relationship") {
|
|
4808
|
+
continue;
|
|
4809
|
+
}
|
|
4810
|
+
const id = node.getAttribute("Id")?.trim();
|
|
4811
|
+
const target = node.getAttribute("Target")?.trim();
|
|
4812
|
+
const type = node.getAttribute("Type")?.trim() ?? "";
|
|
4813
|
+
if (!id || !target) {
|
|
4814
|
+
continue;
|
|
4815
|
+
}
|
|
4816
|
+
map.set(id, { target: resolveZipTarget(basePath, target), type });
|
|
4817
|
+
}
|
|
4818
|
+
return map;
|
|
4819
|
+
}
|
|
4820
|
+
function xmlTextNodes(xml, localName) {
|
|
4821
|
+
const document = parseXmlDocument(xml);
|
|
4822
|
+
const values = [];
|
|
4823
|
+
for (const node of Array.from(document.getElementsByTagName("*"))) {
|
|
4824
|
+
if (node.localName !== localName) {
|
|
4825
|
+
continue;
|
|
4826
|
+
}
|
|
4827
|
+
const text = normalizeWhitespace(node.textContent ?? "");
|
|
4828
|
+
if (text) {
|
|
4829
|
+
values.push(text);
|
|
4830
|
+
}
|
|
4831
|
+
}
|
|
4832
|
+
return values;
|
|
4833
|
+
}
|
|
4834
|
+
function firstHtmlHeading(html) {
|
|
4835
|
+
const dom = new JSDOM(html);
|
|
4836
|
+
const heading = dom.window.document.querySelector("h1, h2, h3");
|
|
4837
|
+
const title = normalizeWhitespace(heading?.textContent ?? "");
|
|
4838
|
+
return title || void 0;
|
|
4839
|
+
}
|
|
4840
|
+
function htmlToMarkdown(html) {
|
|
4841
|
+
const dom = new JSDOM(html);
|
|
4842
|
+
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
4843
|
+
const body = dom.window.document.body?.innerHTML ?? html;
|
|
4844
|
+
return turndown.turndown(body).trim();
|
|
4845
|
+
}
|
|
4728
4846
|
async function extractPdfText(input) {
|
|
4729
4847
|
try {
|
|
4730
4848
|
const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
@@ -4782,7 +4900,7 @@ async function extractDocxText(input) {
|
|
|
4782
4900
|
const warnings = result.messages.map((message) => normalizeWhitespace(message.message)).filter(Boolean).map((message) => truncate(message, 240));
|
|
4783
4901
|
const artifact = {
|
|
4784
4902
|
...extractionMetadata("docx", input.mimeType, "docx_text"),
|
|
4785
|
-
metadata: parseDocxCoreMetadata(input.bytes),
|
|
4903
|
+
metadata: parseOfficeCoreMetadata(input.bytes),
|
|
4786
4904
|
warnings: warnings.length ? warnings : void 0
|
|
4787
4905
|
};
|
|
4788
4906
|
if (!extractedText) {
|
|
@@ -4801,6 +4919,258 @@ async function extractDocxText(input) {
|
|
|
4801
4919
|
};
|
|
4802
4920
|
}
|
|
4803
4921
|
}
|
|
4922
|
+
async function extractCsvText(input) {
|
|
4923
|
+
try {
|
|
4924
|
+
const rawText = decodeTextBytes(input.bytes);
|
|
4925
|
+
const delimiter = input.fileName?.toLowerCase().endsWith(".tsv") || input.mimeType.includes("tab-separated") ? " " : ",";
|
|
4926
|
+
const parsed = parseCsvSync(rawText, {
|
|
4927
|
+
delimiter,
|
|
4928
|
+
relax_column_count: true,
|
|
4929
|
+
skip_empty_lines: true,
|
|
4930
|
+
trim: true
|
|
4931
|
+
});
|
|
4932
|
+
const rows = parsed.map((row) => row.map((value) => normalizeTableCell(value)));
|
|
4933
|
+
const { headers, bodyRows } = detectHeaderRow(rows);
|
|
4934
|
+
const hintLines = columnHints(headers, bodyRows);
|
|
4935
|
+
const title = input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0;
|
|
4936
|
+
const extractedText = [
|
|
4937
|
+
title ? `# ${title}` : null,
|
|
4938
|
+
`Format: ${delimiter === " " ? "TSV" : "CSV"}`,
|
|
4939
|
+
`Rows: ${bodyRows.length}`,
|
|
4940
|
+
`Columns: ${headers.length}`,
|
|
4941
|
+
headers.length ? `Headers: ${headers.join(", ")}` : null,
|
|
4942
|
+
"",
|
|
4943
|
+
hintLines.length ? "## Column Hints" : null,
|
|
4944
|
+
hintLines.length ? hintLines.join("\n") : null,
|
|
4945
|
+
hintLines.length ? "" : null,
|
|
4946
|
+
"## Preview",
|
|
4947
|
+
...markdownTable(headers, bodyRows)
|
|
4948
|
+
].filter((item) => Boolean(item)).join("\n").trim();
|
|
4949
|
+
const artifact = {
|
|
4950
|
+
...extractionMetadata("csv", input.mimeType, "csv_text"),
|
|
4951
|
+
metadata: {
|
|
4952
|
+
format: delimiter === " " ? "tsv" : "csv",
|
|
4953
|
+
row_count: String(bodyRows.length),
|
|
4954
|
+
column_count: String(headers.length),
|
|
4955
|
+
headers: headers.join(", ")
|
|
4956
|
+
}
|
|
4957
|
+
};
|
|
4958
|
+
return {
|
|
4959
|
+
title,
|
|
4960
|
+
extractedText,
|
|
4961
|
+
artifact
|
|
4962
|
+
};
|
|
4963
|
+
} catch (error) {
|
|
4964
|
+
return {
|
|
4965
|
+
artifact: {
|
|
4966
|
+
...extractionMetadata("csv", input.mimeType, "csv_text"),
|
|
4967
|
+
warnings: [`CSV extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
4968
|
+
}
|
|
4969
|
+
};
|
|
4970
|
+
}
|
|
4971
|
+
}
|
|
4972
|
+
async function extractXlsxText(input) {
|
|
4973
|
+
try {
|
|
4974
|
+
const XLSX = await import("xlsx");
|
|
4975
|
+
const workbook = XLSX.read(input.bytes, { type: "buffer", cellFormula: false, cellHTML: false, cellStyles: false });
|
|
4976
|
+
const allSheetNames = workbook.SheetNames;
|
|
4977
|
+
const sheetNames = allSheetNames.slice(0, 10);
|
|
4978
|
+
const sheetSections = [];
|
|
4979
|
+
const metadata = {
|
|
4980
|
+
sheet_count: String(allSheetNames.length),
|
|
4981
|
+
sheet_names: allSheetNames.join(", ")
|
|
4982
|
+
};
|
|
4983
|
+
for (const sheetName of sheetNames) {
|
|
4984
|
+
const sheet = workbook.Sheets[sheetName];
|
|
4985
|
+
if (!sheet) {
|
|
4986
|
+
continue;
|
|
4987
|
+
}
|
|
4988
|
+
const rows = XLSX.utils.sheet_to_json(sheet, {
|
|
4989
|
+
header: 1,
|
|
4990
|
+
raw: false,
|
|
4991
|
+
defval: ""
|
|
4992
|
+
}).map((row) => row.map((value) => normalizeTableCell(value)));
|
|
4993
|
+
const { headers, bodyRows } = detectHeaderRow(rows);
|
|
4994
|
+
sheetSections.push(`## Sheet: ${sheetName}`);
|
|
4995
|
+
sheetSections.push(`Rows: ${bodyRows.length}`);
|
|
4996
|
+
sheetSections.push(`Columns: ${headers.length}`);
|
|
4997
|
+
sheetSections.push(...markdownTable(headers, bodyRows));
|
|
4998
|
+
sheetSections.push("");
|
|
4999
|
+
}
|
|
5000
|
+
const title = normalizeWhitespace(String(workbook.Props?.Title ?? "")) || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
|
|
5001
|
+
const extractedText = [
|
|
5002
|
+
title ? `# ${title}` : null,
|
|
5003
|
+
`Sheets: ${allSheetNames.length}`,
|
|
5004
|
+
allSheetNames.length ? `Sheet Names: ${allSheetNames.join(", ")}` : null,
|
|
5005
|
+
"",
|
|
5006
|
+
...sheetSections
|
|
5007
|
+
].filter((item) => Boolean(item)).join("\n").trim();
|
|
5008
|
+
const warnings = allSheetNames.length > sheetNames.length ? ["Workbook preview truncated to the first 10 sheets."] : void 0;
|
|
5009
|
+
return {
|
|
5010
|
+
title,
|
|
5011
|
+
extractedText,
|
|
5012
|
+
artifact: {
|
|
5013
|
+
...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
|
|
5014
|
+
metadata,
|
|
5015
|
+
warnings
|
|
5016
|
+
}
|
|
5017
|
+
};
|
|
5018
|
+
} catch (error) {
|
|
5019
|
+
return {
|
|
5020
|
+
artifact: {
|
|
5021
|
+
...extractionMetadata("xlsx", input.mimeType, "xlsx_text"),
|
|
5022
|
+
warnings: [`XLSX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
5023
|
+
}
|
|
5024
|
+
};
|
|
5025
|
+
}
|
|
5026
|
+
}
|
|
5027
|
+
async function extractPptxText(input) {
|
|
5028
|
+
try {
|
|
5029
|
+
const archive = unzipSync(new Uint8Array(input.bytes));
|
|
5030
|
+
const presentationXml = zipEntryText(archive, "ppt/presentation.xml");
|
|
5031
|
+
if (!presentationXml) {
|
|
5032
|
+
throw new Error("Missing ppt/presentation.xml");
|
|
5033
|
+
}
|
|
5034
|
+
const relsXml = zipEntryText(archive, "ppt/_rels/presentation.xml.rels");
|
|
5035
|
+
if (!relsXml) {
|
|
5036
|
+
throw new Error("Missing ppt/_rels/presentation.xml.rels");
|
|
5037
|
+
}
|
|
5038
|
+
const rels = relationshipTargets(relsXml, "ppt/presentation.xml");
|
|
5039
|
+
const document = parseXmlDocument(presentationXml);
|
|
5040
|
+
const slideTargets = Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").map((node) => node.getAttribute("r:id")?.trim()).filter((value) => Boolean(value)).map((relationshipId) => rels.get(relationshipId)?.target).filter((value) => Boolean(value)).slice(0, 60);
|
|
5041
|
+
const slideSections = [];
|
|
5042
|
+
for (let index = 0; index < slideTargets.length; index += 1) {
|
|
5043
|
+
const slidePath = slideTargets[index];
|
|
5044
|
+
const slideXml = zipEntryText(archive, slidePath);
|
|
5045
|
+
if (!slideXml) {
|
|
5046
|
+
continue;
|
|
5047
|
+
}
|
|
5048
|
+
const slideTexts = xmlTextNodes(slideXml, "t");
|
|
5049
|
+
const slideTitle = slideTexts[0] ?? `Slide ${index + 1}`;
|
|
5050
|
+
slideSections.push(`## Slide ${index + 1}: ${slideTitle}`);
|
|
5051
|
+
if (slideTexts.length) {
|
|
5052
|
+
slideSections.push(slideTexts.join("\n"));
|
|
5053
|
+
}
|
|
5054
|
+
const slideRelsPath = `${zipDirname(slidePath)}/_rels/${path7.posix.basename(slidePath)}.rels`;
|
|
5055
|
+
const slideRelsXml = zipEntryText(archive, slideRelsPath);
|
|
5056
|
+
if (slideRelsXml) {
|
|
5057
|
+
const slideRels = relationshipTargets(slideRelsXml, slidePath);
|
|
5058
|
+
const notesTarget = [...slideRels.values()].find((entry) => entry.type.endsWith("/notesSlide"))?.target;
|
|
5059
|
+
if (notesTarget) {
|
|
5060
|
+
const notesXml = zipEntryText(archive, notesTarget);
|
|
5061
|
+
const noteTexts = notesXml ? xmlTextNodes(notesXml, "t") : [];
|
|
5062
|
+
if (noteTexts.length) {
|
|
5063
|
+
slideSections.push("Notes:");
|
|
5064
|
+
slideSections.push(noteTexts.join("\n"));
|
|
5065
|
+
}
|
|
5066
|
+
}
|
|
5067
|
+
}
|
|
5068
|
+
slideSections.push("");
|
|
5069
|
+
}
|
|
5070
|
+
const metadata = parseOfficeCoreMetadata(input.bytes);
|
|
5071
|
+
const title = metadata?.title || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
|
|
5072
|
+
const extractedText = [title ? `# ${title}` : null, `Slides: ${slideTargets.length}`, "", ...slideSections].filter((item) => Boolean(item)).join("\n").trim();
|
|
5073
|
+
return {
|
|
5074
|
+
title,
|
|
5075
|
+
extractedText,
|
|
5076
|
+
artifact: {
|
|
5077
|
+
...extractionMetadata("pptx", input.mimeType, "pptx_text"),
|
|
5078
|
+
metadata: {
|
|
5079
|
+
...metadata ?? {},
|
|
5080
|
+
slide_count: String(slideTargets.length)
|
|
5081
|
+
},
|
|
5082
|
+
warnings: Array.from(document.getElementsByTagName("*")).filter((node) => node.localName === "sldId").length > slideTargets.length ? ["Slide extraction truncated to the first 60 slides."] : void 0
|
|
5083
|
+
}
|
|
5084
|
+
};
|
|
5085
|
+
} catch (error) {
|
|
5086
|
+
return {
|
|
5087
|
+
artifact: {
|
|
5088
|
+
...extractionMetadata("pptx", input.mimeType, "pptx_text"),
|
|
5089
|
+
warnings: [`PPTX extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
5090
|
+
}
|
|
5091
|
+
};
|
|
5092
|
+
}
|
|
5093
|
+
}
|
|
5094
|
+
async function extractEpubChapters(input) {
|
|
5095
|
+
try {
|
|
5096
|
+
const archive = unzipSync(new Uint8Array(input.bytes));
|
|
5097
|
+
const containerXml = zipEntryText(archive, "META-INF/container.xml");
|
|
5098
|
+
if (!containerXml) {
|
|
5099
|
+
throw new Error("Missing META-INF/container.xml");
|
|
5100
|
+
}
|
|
5101
|
+
const container = parseXmlDocument(containerXml);
|
|
5102
|
+
const rootfile = Array.from(container.getElementsByTagName("*")).find((node) => node.localName === "rootfile");
|
|
5103
|
+
const packagePath = rootfile?.getAttribute("full-path")?.trim();
|
|
5104
|
+
if (!packagePath) {
|
|
5105
|
+
throw new Error("EPUB container did not declare a package document.");
|
|
5106
|
+
}
|
|
5107
|
+
const packageXml = zipEntryText(archive, packagePath);
|
|
5108
|
+
if (!packageXml) {
|
|
5109
|
+
throw new Error(`Missing EPUB package document: ${packagePath}`);
|
|
5110
|
+
}
|
|
5111
|
+
const packageDocument = parseXmlDocument(packageXml);
|
|
5112
|
+
const manifestEntries = new Map(
|
|
5113
|
+
Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "item").map(
|
|
5114
|
+
(node) => [
|
|
5115
|
+
node.getAttribute("id")?.trim() ?? "",
|
|
5116
|
+
{
|
|
5117
|
+
href: node.getAttribute("href")?.trim() ?? "",
|
|
5118
|
+
mediaType: node.getAttribute("media-type")?.trim() ?? "",
|
|
5119
|
+
properties: node.getAttribute("properties")?.trim() ?? ""
|
|
5120
|
+
}
|
|
5121
|
+
]
|
|
5122
|
+
).filter(([id, item]) => Boolean(id && item.href))
|
|
5123
|
+
);
|
|
5124
|
+
const spineIds = Array.from(packageDocument.getElementsByTagName("*")).filter((node) => node.localName === "itemref").map((node) => node.getAttribute("idref")?.trim()).filter((value) => Boolean(value));
|
|
5125
|
+
const bookTitle = xmlTextNodes(packageXml, "title")[0] || (input.fileName ? path7.basename(input.fileName, path7.extname(input.fileName)) : void 0);
|
|
5126
|
+
const author = xmlTextNodes(packageXml, "creator")[0];
|
|
5127
|
+
const chapters = [];
|
|
5128
|
+
for (const spineId of spineIds) {
|
|
5129
|
+
const item = manifestEntries.get(spineId);
|
|
5130
|
+
if (!item || !item.mediaType.includes("html") && !item.mediaType.includes("xhtml")) {
|
|
5131
|
+
continue;
|
|
5132
|
+
}
|
|
5133
|
+
if (item.properties.split(/\s+/).includes("nav")) {
|
|
5134
|
+
continue;
|
|
5135
|
+
}
|
|
5136
|
+
const entryPath = resolveZipTarget(packagePath, item.href);
|
|
5137
|
+
const html = zipEntryText(archive, entryPath);
|
|
5138
|
+
if (!html) {
|
|
5139
|
+
continue;
|
|
5140
|
+
}
|
|
5141
|
+
const markdown = htmlToMarkdown(html);
|
|
5142
|
+
if (!markdown) {
|
|
5143
|
+
continue;
|
|
5144
|
+
}
|
|
5145
|
+
const chapterTitle = firstHtmlHeading(html) || markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() || item.href;
|
|
5146
|
+
const normalizedTitle = normalizeWhitespace(chapterTitle);
|
|
5147
|
+
if (!normalizedTitle || /^table of contents$/i.test(normalizedTitle)) {
|
|
5148
|
+
continue;
|
|
5149
|
+
}
|
|
5150
|
+
chapters.push({
|
|
5151
|
+
partKey: item.href,
|
|
5152
|
+
title: normalizedTitle,
|
|
5153
|
+
markdown,
|
|
5154
|
+
metadata: {
|
|
5155
|
+
book_title: bookTitle ?? "",
|
|
5156
|
+
chapter_title: normalizedTitle,
|
|
5157
|
+
author: author ?? ""
|
|
5158
|
+
}
|
|
5159
|
+
});
|
|
5160
|
+
}
|
|
5161
|
+
return {
|
|
5162
|
+
title: bookTitle,
|
|
5163
|
+
author,
|
|
5164
|
+
chapters,
|
|
5165
|
+
warnings: chapters.length ? void 0 : ["EPUB extraction completed but found no chapter-like spine entries."]
|
|
5166
|
+
};
|
|
5167
|
+
} catch (error) {
|
|
5168
|
+
return {
|
|
5169
|
+
chapters: [],
|
|
5170
|
+
warnings: [`EPUB extraction failed: ${error instanceof Error ? truncate(error.message, 240) : "unknown error"}`]
|
|
5171
|
+
};
|
|
5172
|
+
}
|
|
5173
|
+
}
|
|
4804
5174
|
|
|
4805
5175
|
// src/logs.ts
|
|
4806
5176
|
import fs8 from "fs/promises";
|
|
@@ -5236,15 +5606,27 @@ function inferKind(mimeType, filePath) {
|
|
|
5236
5606
|
if (mimeType.includes("html")) {
|
|
5237
5607
|
return "html";
|
|
5238
5608
|
}
|
|
5239
|
-
if (mimeType.startsWith("text/")) {
|
|
5240
|
-
return "text";
|
|
5241
|
-
}
|
|
5242
5609
|
if (mimeType === "application/pdf" || filePath.toLowerCase().endsWith(".pdf")) {
|
|
5243
5610
|
return "pdf";
|
|
5244
5611
|
}
|
|
5245
5612
|
if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || filePath.toLowerCase().endsWith(".docx")) {
|
|
5246
5613
|
return "docx";
|
|
5247
5614
|
}
|
|
5615
|
+
if (mimeType === "application/epub+zip" || filePath.toLowerCase().endsWith(".epub")) {
|
|
5616
|
+
return "epub";
|
|
5617
|
+
}
|
|
5618
|
+
if (mimeType === "text/csv" || mimeType === "text/tab-separated-values" || filePath.toLowerCase().endsWith(".csv") || filePath.toLowerCase().endsWith(".tsv")) {
|
|
5619
|
+
return "csv";
|
|
5620
|
+
}
|
|
5621
|
+
if (mimeType.startsWith("text/")) {
|
|
5622
|
+
return "text";
|
|
5623
|
+
}
|
|
5624
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || filePath.toLowerCase().endsWith(".xlsx")) {
|
|
5625
|
+
return "xlsx";
|
|
5626
|
+
}
|
|
5627
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || filePath.toLowerCase().endsWith(".pptx")) {
|
|
5628
|
+
return "pptx";
|
|
5629
|
+
}
|
|
5248
5630
|
if (mimeType.startsWith("image/")) {
|
|
5249
5631
|
return "image";
|
|
5250
5632
|
}
|
|
@@ -5270,6 +5652,10 @@ function guessMimeType(target) {
|
|
|
5270
5652
|
}
|
|
5271
5653
|
return mime.lookup(target) || "application/octet-stream";
|
|
5272
5654
|
}
|
|
5655
|
+
function sourceGroupIdFor(prepared) {
|
|
5656
|
+
const originKey = prepared.originType === "url" ? prepared.url ?? prepared.title : prepared.originalPath ?? prepared.title;
|
|
5657
|
+
return `${slugify(prepared.title)}-${sha256(originKey).slice(0, 8)}`;
|
|
5658
|
+
}
|
|
5273
5659
|
function rstAdornmentLine(line) {
|
|
5274
5660
|
const trimmed = line.trim();
|
|
5275
5661
|
if (trimmed.length < 3) {
|
|
@@ -5844,6 +6230,9 @@ function manifestMatchesOrigin(manifest, prepared) {
|
|
|
5844
6230
|
}
|
|
5845
6231
|
return Boolean(prepared.originalPath && manifest.originalPath && toPosix(manifest.originalPath) === toPosix(prepared.originalPath));
|
|
5846
6232
|
}
|
|
6233
|
+
function manifestMatchesOriginPart(manifest, prepared) {
|
|
6234
|
+
return manifestMatchesOrigin(manifest, prepared) && (manifest.sourcePartKey ?? "") === (prepared.sourcePartKey ?? "");
|
|
6235
|
+
}
|
|
5847
6236
|
function buildCompositeHash(payloadBytes, attachments = []) {
|
|
5848
6237
|
if (!attachments.length) {
|
|
5849
6238
|
return sha256(payloadBytes);
|
|
@@ -5941,7 +6330,7 @@ function extractMarkdownImageReferences(content, baseUrl) {
|
|
|
5941
6330
|
async function convertHtmlToMarkdown(html, url) {
|
|
5942
6331
|
const dom = new JSDOM2(html, { url });
|
|
5943
6332
|
const article = new Readability(dom.window.document).parse();
|
|
5944
|
-
const turndown = new
|
|
6333
|
+
const turndown = new TurndownService2({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
5945
6334
|
const body = article?.content ?? dom.window.document.body.innerHTML;
|
|
5946
6335
|
const markdown = turndown.turndown(body);
|
|
5947
6336
|
return {
|
|
@@ -5965,21 +6354,26 @@ async function readManifestByHash(manifestsDir, contentHash) {
|
|
|
5965
6354
|
}
|
|
5966
6355
|
return null;
|
|
5967
6356
|
}
|
|
5968
|
-
async function
|
|
6357
|
+
async function readManifestsByOrigin(manifestsDir, prepared) {
|
|
5969
6358
|
const entries = await fs11.readdir(manifestsDir, { withFileTypes: true }).catch(() => []);
|
|
6359
|
+
const manifests = [];
|
|
5970
6360
|
for (const entry of entries) {
|
|
5971
6361
|
if (!entry.isFile() || !entry.name.endsWith(".json")) {
|
|
5972
6362
|
continue;
|
|
5973
6363
|
}
|
|
5974
6364
|
const manifest = await readJsonFile(path12.join(manifestsDir, entry.name));
|
|
5975
6365
|
if (manifest && manifestMatchesOrigin(manifest, prepared)) {
|
|
5976
|
-
|
|
6366
|
+
manifests.push({
|
|
5977
6367
|
...manifest,
|
|
5978
6368
|
semanticHash: manifest.semanticHash ?? manifest.contentHash
|
|
5979
|
-
};
|
|
6369
|
+
});
|
|
5980
6370
|
}
|
|
5981
6371
|
}
|
|
5982
|
-
return
|
|
6372
|
+
return manifests;
|
|
6373
|
+
}
|
|
6374
|
+
async function readManifestByOrigin(manifestsDir, prepared) {
|
|
6375
|
+
const manifests = await readManifestsByOrigin(manifestsDir, prepared);
|
|
6376
|
+
return manifests.find((manifest) => manifestMatchesOriginPart(manifest, prepared)) ?? null;
|
|
5983
6377
|
}
|
|
5984
6378
|
async function loadGitignoreMatcher(repoRoot, enabled) {
|
|
5985
6379
|
if (!enabled) {
|
|
@@ -6228,8 +6622,8 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6228
6622
|
const semanticHash = prepared.semanticHash ?? contentHash;
|
|
6229
6623
|
const extractionHash = prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact);
|
|
6230
6624
|
const existingByOrigin = await readManifestByOrigin(paths.manifestsDir, prepared);
|
|
6231
|
-
const existingByHash = existingByOrigin ? null : await readManifestByHash(paths.manifestsDir, contentHash);
|
|
6232
|
-
if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath) {
|
|
6625
|
+
const existingByHash = existingByOrigin || prepared.sourcePartKey ? null : await readManifestByHash(paths.manifestsDir, contentHash);
|
|
6626
|
+
if (existingByOrigin && existingByOrigin.contentHash === contentHash && existingByOrigin.semanticHash === semanticHash && existingByOrigin.extractionHash === extractionHash && existingByOrigin.title === prepared.title && existingByOrigin.sourceKind === prepared.sourceKind && existingByOrigin.sourceType === prepared.sourceType && existingByOrigin.sourceClass === prepared.sourceClass && existingByOrigin.language === prepared.language && existingByOrigin.mimeType === prepared.mimeType && existingByOrigin.repoRelativePath === prepared.repoRelativePath && existingByOrigin.sourceGroupId === prepared.sourceGroupId && existingByOrigin.sourceGroupTitle === prepared.sourceGroupTitle && existingByOrigin.sourcePartKey === prepared.sourcePartKey && existingByOrigin.partIndex === prepared.partIndex && existingByOrigin.partCount === prepared.partCount && existingByOrigin.partTitle === prepared.partTitle && JSON.stringify(existingByOrigin.details ?? {}) === JSON.stringify(prepared.details ?? {})) {
|
|
6233
6627
|
return { manifest: existingByOrigin, isNew: false, wasUpdated: false };
|
|
6234
6628
|
}
|
|
6235
6629
|
if (existingByHash) {
|
|
@@ -6288,6 +6682,13 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6288
6682
|
mimeType: prepared.mimeType,
|
|
6289
6683
|
contentHash,
|
|
6290
6684
|
semanticHash,
|
|
6685
|
+
sourceGroupId: prepared.sourceGroupId,
|
|
6686
|
+
sourceGroupTitle: prepared.sourceGroupTitle,
|
|
6687
|
+
sourcePartKey: prepared.sourcePartKey,
|
|
6688
|
+
partIndex: prepared.partIndex,
|
|
6689
|
+
partCount: prepared.partCount,
|
|
6690
|
+
partTitle: prepared.partTitle,
|
|
6691
|
+
details: prepared.details,
|
|
6291
6692
|
createdAt: previous?.createdAt ?? now,
|
|
6292
6693
|
updatedAt: now,
|
|
6293
6694
|
attachments: manifestAttachments.length ? manifestAttachments : void 0
|
|
@@ -6309,6 +6710,42 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
6309
6710
|
}
|
|
6310
6711
|
return { manifest, isNew: !previous, wasUpdated: Boolean(previous) };
|
|
6311
6712
|
}
|
|
6713
|
+
async function persistPreparedInputs(rootDir, input, preparedInputs, paths) {
|
|
6714
|
+
const template = preparedInputs[0];
|
|
6715
|
+
const existingByOrigin = template ? await readManifestsByOrigin(paths.manifestsDir, template) : [];
|
|
6716
|
+
const created = [];
|
|
6717
|
+
const updated = [];
|
|
6718
|
+
const unchanged = [];
|
|
6719
|
+
const removed = [];
|
|
6720
|
+
const seenSourceIds = /* @__PURE__ */ new Set();
|
|
6721
|
+
for (const prepared of preparedInputs) {
|
|
6722
|
+
const result = await persistPreparedInput(rootDir, prepared, paths);
|
|
6723
|
+
if (result.isNew) {
|
|
6724
|
+
created.push(result.manifest);
|
|
6725
|
+
} else if (result.wasUpdated) {
|
|
6726
|
+
updated.push(result.manifest);
|
|
6727
|
+
} else {
|
|
6728
|
+
unchanged.push(result.manifest);
|
|
6729
|
+
}
|
|
6730
|
+
seenSourceIds.add(result.manifest.sourceId);
|
|
6731
|
+
}
|
|
6732
|
+
for (const manifest of existingByOrigin) {
|
|
6733
|
+
if (seenSourceIds.has(manifest.sourceId)) {
|
|
6734
|
+
continue;
|
|
6735
|
+
}
|
|
6736
|
+
await removeManifestArtifacts(rootDir, manifest, paths);
|
|
6737
|
+
removed.push(manifest);
|
|
6738
|
+
}
|
|
6739
|
+
return {
|
|
6740
|
+
input,
|
|
6741
|
+
scannedCount: preparedInputs.length,
|
|
6742
|
+
created,
|
|
6743
|
+
updated,
|
|
6744
|
+
unchanged,
|
|
6745
|
+
removed,
|
|
6746
|
+
skipped: []
|
|
6747
|
+
};
|
|
6748
|
+
}
|
|
6312
6749
|
async function removeManifestArtifacts(rootDir, manifest, paths) {
|
|
6313
6750
|
await fs11.rm(path12.join(paths.manifestsDir, `${manifest.sourceId}.json`), { force: true });
|
|
6314
6751
|
await fs11.rm(path12.resolve(rootDir, manifest.storedPath), { force: true });
|
|
@@ -6335,10 +6772,10 @@ function repoSyncWorkspaceIgnorePaths(rootDir, paths, repoRoot) {
|
|
|
6335
6772
|
return candidates.map((candidate) => path12.resolve(candidate)).filter((candidate, index, items) => items.indexOf(candidate) === index).filter((candidate) => withinRoot(repoRoot, candidate));
|
|
6336
6773
|
}
|
|
6337
6774
|
function preparedMatchesManifest(manifest, prepared, contentHash) {
|
|
6338
|
-
return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath;
|
|
6775
|
+
return manifest.contentHash === contentHash && manifest.extractionHash === (prepared.extractionHash ?? buildExtractionHash(prepared.extractedText, prepared.extractionArtifact)) && manifest.semanticHash === (prepared.semanticHash ?? contentHash) && manifest.title === prepared.title && manifest.sourceKind === prepared.sourceKind && manifest.sourceType === prepared.sourceType && manifest.sourceClass === prepared.sourceClass && manifest.language === prepared.language && manifest.mimeType === prepared.mimeType && manifest.repoRelativePath === prepared.repoRelativePath && manifest.sourceGroupId === prepared.sourceGroupId && manifest.sourceGroupTitle === prepared.sourceGroupTitle && manifest.sourcePartKey === prepared.sourcePartKey && manifest.partIndex === prepared.partIndex && manifest.partCount === prepared.partCount && manifest.partTitle === prepared.partTitle && JSON.stringify(manifest.details ?? {}) === JSON.stringify(prepared.details ?? {});
|
|
6339
6776
|
}
|
|
6340
6777
|
function shouldDeferWatchSemanticRefresh(sourceKind) {
|
|
6341
|
-
return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "image";
|
|
6778
|
+
return sourceKind === "markdown" || sourceKind === "text" || sourceKind === "html" || sourceKind === "pdf" || sourceKind === "docx" || sourceKind === "epub" || sourceKind === "csv" || sourceKind === "xlsx" || sourceKind === "pptx" || sourceKind === "image";
|
|
6342
6779
|
}
|
|
6343
6780
|
function pendingSemanticRefreshId(changeType, repoRoot, relativePath) {
|
|
6344
6781
|
return `pending:${changeType}:${sha256(`${toPosix(repoRoot)}:${relativePath}`).slice(0, 12)}`;
|
|
@@ -6404,13 +6841,16 @@ async function syncTrackedRepos(rootDir, options, repoRoots) {
|
|
|
6404
6841
|
const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
|
|
6405
6842
|
for (const absolutePath of files) {
|
|
6406
6843
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6407
|
-
const
|
|
6408
|
-
|
|
6409
|
-
|
|
6410
|
-
|
|
6411
|
-
|
|
6412
|
-
|
|
6413
|
-
|
|
6844
|
+
const preparedInputs = await prepareFileInputs(
|
|
6845
|
+
rootDir,
|
|
6846
|
+
absolutePath,
|
|
6847
|
+
repoRoot,
|
|
6848
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
6849
|
+
);
|
|
6850
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
6851
|
+
imported.push(...result.created);
|
|
6852
|
+
updated.push(...result.updated);
|
|
6853
|
+
removed.push(...result.removed);
|
|
6414
6854
|
progress.tick();
|
|
6415
6855
|
}
|
|
6416
6856
|
progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
|
|
@@ -6469,9 +6909,6 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6469
6909
|
let scannedCount = 0;
|
|
6470
6910
|
for (const repoRoot of uniqueRoots) {
|
|
6471
6911
|
const repoManifests = manifestsByRepoRoot.get(repoRoot) ?? [];
|
|
6472
|
-
const manifestsByOriginalPath = new Map(
|
|
6473
|
-
repoManifests.filter((manifest) => manifest.originalPath).map((manifest) => [path12.resolve(manifest.originalPath), manifest])
|
|
6474
|
-
);
|
|
6475
6912
|
if (!await fileExists(repoRoot)) {
|
|
6476
6913
|
for (const manifest of repoManifests) {
|
|
6477
6914
|
if (shouldDeferWatchSemanticRefresh(manifest.sourceKind)) {
|
|
@@ -6507,38 +6944,50 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6507
6944
|
const currentPaths = new Set(files.map((absolutePath) => path12.resolve(absolutePath)));
|
|
6508
6945
|
for (const absolutePath of files) {
|
|
6509
6946
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
6510
|
-
const
|
|
6511
|
-
|
|
6512
|
-
|
|
6513
|
-
|
|
6514
|
-
|
|
6947
|
+
const preparedInputs = await prepareFileInputs(
|
|
6948
|
+
rootDir,
|
|
6949
|
+
absolutePath,
|
|
6950
|
+
repoRoot,
|
|
6951
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
6952
|
+
);
|
|
6953
|
+
const firstPrepared = preparedInputs[0];
|
|
6954
|
+
if (firstPrepared && shouldDeferWatchSemanticRefresh(firstPrepared.sourceKind)) {
|
|
6955
|
+
const existing = repoManifests.filter(
|
|
6956
|
+
(manifest) => manifest.originalPath && path12.resolve(manifest.originalPath) === path12.resolve(absolutePath)
|
|
6957
|
+
);
|
|
6958
|
+
const existingByPartKey = new Map(existing.map((manifest) => [manifest.sourcePartKey ?? "__single__", manifest]));
|
|
6959
|
+
const changed = existing.length !== preparedInputs.length || preparedInputs.some((prepared) => {
|
|
6960
|
+
const match = existingByPartKey.get(prepared.sourcePartKey ?? "__single__");
|
|
6961
|
+
const contentHash = buildCompositeHash(prepared.payloadBytes, prepared.attachments);
|
|
6962
|
+
return !match || !preparedMatchesManifest(match, prepared, contentHash);
|
|
6963
|
+
}) || existing.some(
|
|
6964
|
+
(manifest) => !preparedInputs.some((prepared) => (prepared.sourcePartKey ?? "") === (manifest.sourcePartKey ?? ""))
|
|
6965
|
+
);
|
|
6515
6966
|
if (changed) {
|
|
6516
6967
|
pendingSemanticRefresh.push({
|
|
6517
6968
|
id: pendingSemanticRefreshId(
|
|
6518
|
-
existing ? "modified" : "added",
|
|
6969
|
+
existing.length ? "modified" : "added",
|
|
6519
6970
|
repoRoot,
|
|
6520
|
-
|
|
6971
|
+
firstPrepared.repoRelativePath ?? toPosix(path12.relative(repoRoot, absolutePath))
|
|
6521
6972
|
),
|
|
6522
6973
|
repoRoot,
|
|
6523
6974
|
path: toPosix(path12.relative(rootDir, absolutePath)),
|
|
6524
|
-
changeType: existing ? "modified" : "added",
|
|
6975
|
+
changeType: existing.length ? "modified" : "added",
|
|
6525
6976
|
detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6526
|
-
sourceId: existing?.sourceId,
|
|
6527
|
-
sourceKind:
|
|
6977
|
+
sourceId: existing[0]?.sourceId,
|
|
6978
|
+
sourceKind: firstPrepared.sourceKind
|
|
6528
6979
|
});
|
|
6529
|
-
|
|
6530
|
-
staleSourceIds.add(
|
|
6980
|
+
for (const manifest of existing) {
|
|
6981
|
+
staleSourceIds.add(manifest.sourceId);
|
|
6531
6982
|
}
|
|
6532
6983
|
}
|
|
6533
6984
|
progress.tick();
|
|
6534
6985
|
continue;
|
|
6535
6986
|
}
|
|
6536
|
-
const result = await
|
|
6537
|
-
|
|
6538
|
-
|
|
6539
|
-
|
|
6540
|
-
updated.push(result.manifest);
|
|
6541
|
-
}
|
|
6987
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
6988
|
+
imported.push(...result.created);
|
|
6989
|
+
updated.push(...result.updated);
|
|
6990
|
+
removed.push(...result.removed);
|
|
6542
6991
|
progress.tick();
|
|
6543
6992
|
}
|
|
6544
6993
|
progress.finish(`repo=${toPosix(path12.relative(rootDir, repoRoot)) || "."}`);
|
|
@@ -6592,7 +7041,7 @@ async function syncTrackedReposForWatch(rootDir, options, repoRoots) {
|
|
|
6592
7041
|
staleSourceIds: [...staleSourceIds]
|
|
6593
7042
|
};
|
|
6594
7043
|
}
|
|
6595
|
-
async function
|
|
7044
|
+
async function prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
6596
7045
|
const payloadBytes = await fs11.readFile(absoluteInput);
|
|
6597
7046
|
const mimeType = guessMimeType(absoluteInput);
|
|
6598
7047
|
const sourceKind = inferKind(mimeType, absoluteInput);
|
|
@@ -6623,6 +7072,94 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6623
7072
|
title = extracted.artifact.metadata?.title?.trim() || title;
|
|
6624
7073
|
extractedText = extracted.extractedText;
|
|
6625
7074
|
extractionArtifact = extracted.artifact;
|
|
7075
|
+
} else if (sourceKind === "csv") {
|
|
7076
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7077
|
+
const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7078
|
+
title = extracted.title?.trim() || title;
|
|
7079
|
+
extractedText = extracted.extractedText;
|
|
7080
|
+
extractionArtifact = extracted.artifact;
|
|
7081
|
+
} else if (sourceKind === "xlsx") {
|
|
7082
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7083
|
+
const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7084
|
+
title = extracted.title?.trim() || title;
|
|
7085
|
+
extractedText = extracted.extractedText;
|
|
7086
|
+
extractionArtifact = extracted.artifact;
|
|
7087
|
+
} else if (sourceKind === "pptx") {
|
|
7088
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7089
|
+
const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7090
|
+
title = extracted.title?.trim() || title;
|
|
7091
|
+
extractedText = extracted.extractedText;
|
|
7092
|
+
extractionArtifact = extracted.artifact;
|
|
7093
|
+
} else if (sourceKind === "epub") {
|
|
7094
|
+
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
7095
|
+
const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: absoluteInput });
|
|
7096
|
+
title = extracted.title?.trim() || title;
|
|
7097
|
+
const groupId = sourceGroupIdFor({
|
|
7098
|
+
title,
|
|
7099
|
+
originType: "file",
|
|
7100
|
+
originalPath: toPosix(absoluteInput)
|
|
7101
|
+
});
|
|
7102
|
+
if (extracted.chapters.length) {
|
|
7103
|
+
return extracted.chapters.map(
|
|
7104
|
+
(chapter, index) => finalizePreparedInput({
|
|
7105
|
+
title: `${title} - ${chapter.title}`,
|
|
7106
|
+
originType: "file",
|
|
7107
|
+
sourceKind: "epub",
|
|
7108
|
+
sourceClass,
|
|
7109
|
+
originalPath: toPosix(absoluteInput),
|
|
7110
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7111
|
+
mimeType: "text/markdown",
|
|
7112
|
+
storedExtension: ".md",
|
|
7113
|
+
payloadBytes: Buffer.from(chapter.markdown, "utf8"),
|
|
7114
|
+
extractedText: chapter.markdown,
|
|
7115
|
+
extractionArtifact: {
|
|
7116
|
+
extractor: "epub_text",
|
|
7117
|
+
sourceKind: "epub",
|
|
7118
|
+
mimeType,
|
|
7119
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7120
|
+
metadata: {
|
|
7121
|
+
...chapter.metadata,
|
|
7122
|
+
chapter_index: String(index + 1),
|
|
7123
|
+
chapter_count: String(extracted.chapters.length)
|
|
7124
|
+
},
|
|
7125
|
+
warnings: extracted.warnings
|
|
7126
|
+
},
|
|
7127
|
+
extractionHash: buildExtractionHash(chapter.markdown, {
|
|
7128
|
+
extractor: "epub_text",
|
|
7129
|
+
sourceKind: "epub",
|
|
7130
|
+
mimeType,
|
|
7131
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7132
|
+
metadata: {
|
|
7133
|
+
...chapter.metadata,
|
|
7134
|
+
chapter_index: String(index + 1),
|
|
7135
|
+
chapter_count: String(extracted.chapters.length)
|
|
7136
|
+
},
|
|
7137
|
+
warnings: extracted.warnings
|
|
7138
|
+
}),
|
|
7139
|
+
sourceGroupId: groupId,
|
|
7140
|
+
sourceGroupTitle: title,
|
|
7141
|
+
sourcePartKey: chapter.partKey,
|
|
7142
|
+
partIndex: index + 1,
|
|
7143
|
+
partCount: extracted.chapters.length,
|
|
7144
|
+
partTitle: chapter.title,
|
|
7145
|
+
details: {
|
|
7146
|
+
book_title: title,
|
|
7147
|
+
chapter_title: chapter.title,
|
|
7148
|
+
chapter_index: String(index + 1),
|
|
7149
|
+
chapter_count: String(extracted.chapters.length),
|
|
7150
|
+
...extracted.author ? { author: extracted.author } : {}
|
|
7151
|
+
}
|
|
7152
|
+
})
|
|
7153
|
+
);
|
|
7154
|
+
}
|
|
7155
|
+
extractedText = void 0;
|
|
7156
|
+
extractionArtifact = {
|
|
7157
|
+
extractor: "epub_text",
|
|
7158
|
+
sourceKind: "epub",
|
|
7159
|
+
mimeType,
|
|
7160
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7161
|
+
warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
|
|
7162
|
+
};
|
|
6626
7163
|
} else if (sourceKind === "image") {
|
|
6627
7164
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
6628
7165
|
const extracted = await extractImageWithVision(rootDir, {
|
|
@@ -6636,23 +7173,33 @@ async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
|
6636
7173
|
} else {
|
|
6637
7174
|
title = path12.basename(absoluteInput, path12.extname(absoluteInput));
|
|
6638
7175
|
}
|
|
6639
|
-
return
|
|
6640
|
-
|
|
6641
|
-
|
|
6642
|
-
|
|
6643
|
-
|
|
6644
|
-
|
|
6645
|
-
|
|
6646
|
-
|
|
6647
|
-
|
|
6648
|
-
|
|
6649
|
-
|
|
6650
|
-
|
|
6651
|
-
|
|
6652
|
-
|
|
6653
|
-
|
|
7176
|
+
return [
|
|
7177
|
+
finalizePreparedInput({
|
|
7178
|
+
title,
|
|
7179
|
+
originType: "file",
|
|
7180
|
+
sourceKind,
|
|
7181
|
+
sourceClass,
|
|
7182
|
+
language,
|
|
7183
|
+
originalPath: toPosix(absoluteInput),
|
|
7184
|
+
repoRelativePath: repoRelativePathFor(absoluteInput, repoRoot),
|
|
7185
|
+
mimeType,
|
|
7186
|
+
storedExtension,
|
|
7187
|
+
payloadBytes,
|
|
7188
|
+
extractedText,
|
|
7189
|
+
extractionArtifact,
|
|
7190
|
+
extractionHash: buildExtractionHash(extractedText, extractionArtifact),
|
|
7191
|
+
details: extractionArtifact?.metadata
|
|
7192
|
+
})
|
|
7193
|
+
];
|
|
6654
7194
|
}
|
|
6655
|
-
async function
|
|
7195
|
+
async function prepareFileInput(rootDir, absoluteInput, repoRoot, sourceClass) {
|
|
7196
|
+
const prepared = await prepareFileInputs(rootDir, absoluteInput, repoRoot, sourceClass);
|
|
7197
|
+
if (!prepared.length) {
|
|
7198
|
+
throw new Error(`No ingestable sources were extracted from ${absoluteInput}.`);
|
|
7199
|
+
}
|
|
7200
|
+
return prepared[0];
|
|
7201
|
+
}
|
|
7202
|
+
async function prepareUrlInputs(rootDir, input, options) {
|
|
6656
7203
|
await validateUrlSafety(input);
|
|
6657
7204
|
const response = await fetch(input);
|
|
6658
7205
|
if (!response.ok) {
|
|
@@ -6747,6 +7294,88 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6747
7294
|
title = extracted.artifact.metadata?.title?.trim() || title;
|
|
6748
7295
|
extractedText = extracted.extractedText;
|
|
6749
7296
|
extractionArtifact = extracted.artifact;
|
|
7297
|
+
} else if (sourceKind === "csv") {
|
|
7298
|
+
const extracted = await extractCsvText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7299
|
+
title = extracted.title?.trim() || title;
|
|
7300
|
+
extractedText = extracted.extractedText;
|
|
7301
|
+
extractionArtifact = extracted.artifact;
|
|
7302
|
+
} else if (sourceKind === "xlsx") {
|
|
7303
|
+
const extracted = await extractXlsxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7304
|
+
title = extracted.title?.trim() || title;
|
|
7305
|
+
extractedText = extracted.extractedText;
|
|
7306
|
+
extractionArtifact = extracted.artifact;
|
|
7307
|
+
} else if (sourceKind === "pptx") {
|
|
7308
|
+
const extracted = await extractPptxText({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7309
|
+
title = extracted.title?.trim() || title;
|
|
7310
|
+
extractedText = extracted.extractedText;
|
|
7311
|
+
extractionArtifact = extracted.artifact;
|
|
7312
|
+
} else if (sourceKind === "epub") {
|
|
7313
|
+
const extracted = await extractEpubChapters({ mimeType, bytes: payloadBytes, fileName: inputUrl.pathname });
|
|
7314
|
+
title = extracted.title?.trim() || title;
|
|
7315
|
+
const groupId = sourceGroupIdFor({
|
|
7316
|
+
title,
|
|
7317
|
+
originType: "url",
|
|
7318
|
+
url: finalUrl
|
|
7319
|
+
});
|
|
7320
|
+
if (extracted.chapters.length) {
|
|
7321
|
+
return extracted.chapters.map(
|
|
7322
|
+
(chapter, index) => finalizePreparedInput({
|
|
7323
|
+
title: `${title} - ${chapter.title}`,
|
|
7324
|
+
originType: "url",
|
|
7325
|
+
sourceKind: "epub",
|
|
7326
|
+
url: finalUrl,
|
|
7327
|
+
mimeType: "text/markdown",
|
|
7328
|
+
storedExtension: ".md",
|
|
7329
|
+
payloadBytes: Buffer.from(chapter.markdown, "utf8"),
|
|
7330
|
+
extractedText: chapter.markdown,
|
|
7331
|
+
extractionArtifact: {
|
|
7332
|
+
extractor: "epub_text",
|
|
7333
|
+
sourceKind: "epub",
|
|
7334
|
+
mimeType,
|
|
7335
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7336
|
+
metadata: {
|
|
7337
|
+
...chapter.metadata,
|
|
7338
|
+
chapter_index: String(index + 1),
|
|
7339
|
+
chapter_count: String(extracted.chapters.length)
|
|
7340
|
+
},
|
|
7341
|
+
warnings: extracted.warnings
|
|
7342
|
+
},
|
|
7343
|
+
extractionHash: buildExtractionHash(chapter.markdown, {
|
|
7344
|
+
extractor: "epub_text",
|
|
7345
|
+
sourceKind: "epub",
|
|
7346
|
+
mimeType,
|
|
7347
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7348
|
+
metadata: {
|
|
7349
|
+
...chapter.metadata,
|
|
7350
|
+
chapter_index: String(index + 1),
|
|
7351
|
+
chapter_count: String(extracted.chapters.length)
|
|
7352
|
+
},
|
|
7353
|
+
warnings: extracted.warnings
|
|
7354
|
+
}),
|
|
7355
|
+
sourceGroupId: groupId,
|
|
7356
|
+
sourceGroupTitle: title,
|
|
7357
|
+
sourcePartKey: chapter.partKey,
|
|
7358
|
+
partIndex: index + 1,
|
|
7359
|
+
partCount: extracted.chapters.length,
|
|
7360
|
+
partTitle: chapter.title,
|
|
7361
|
+
details: {
|
|
7362
|
+
book_title: title,
|
|
7363
|
+
chapter_title: chapter.title,
|
|
7364
|
+
chapter_index: String(index + 1),
|
|
7365
|
+
chapter_count: String(extracted.chapters.length),
|
|
7366
|
+
...extracted.author ? { author: extracted.author } : {}
|
|
7367
|
+
},
|
|
7368
|
+
logDetails
|
|
7369
|
+
})
|
|
7370
|
+
);
|
|
7371
|
+
}
|
|
7372
|
+
extractionArtifact = {
|
|
7373
|
+
extractor: "epub_text",
|
|
7374
|
+
sourceKind: "epub",
|
|
7375
|
+
mimeType,
|
|
7376
|
+
producedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7377
|
+
warnings: extracted.warnings ?? ["EPUB extraction completed but produced no chapter content."]
|
|
7378
|
+
};
|
|
6750
7379
|
} else if (sourceKind === "image") {
|
|
6751
7380
|
const extracted = await extractImageWithVision(rootDir, {
|
|
6752
7381
|
title,
|
|
@@ -6758,22 +7387,32 @@ async function prepareUrlInput(rootDir, input, options) {
|
|
|
6758
7387
|
extractionArtifact = extracted.artifact;
|
|
6759
7388
|
}
|
|
6760
7389
|
}
|
|
6761
|
-
return
|
|
6762
|
-
|
|
6763
|
-
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
|
|
6767
|
-
|
|
6768
|
-
|
|
6769
|
-
|
|
6770
|
-
|
|
6771
|
-
|
|
6772
|
-
|
|
6773
|
-
|
|
6774
|
-
|
|
6775
|
-
|
|
6776
|
-
|
|
7390
|
+
return [
|
|
7391
|
+
finalizePreparedInput({
|
|
7392
|
+
title,
|
|
7393
|
+
originType: "url",
|
|
7394
|
+
sourceKind,
|
|
7395
|
+
language,
|
|
7396
|
+
url: finalUrl,
|
|
7397
|
+
mimeType,
|
|
7398
|
+
storedExtension,
|
|
7399
|
+
payloadBytes,
|
|
7400
|
+
extractedText,
|
|
7401
|
+
extractionArtifact,
|
|
7402
|
+
extractionHash: buildExtractionHash(extractedText, extractionArtifact),
|
|
7403
|
+
attachments,
|
|
7404
|
+
contentHash,
|
|
7405
|
+
details: extractionArtifact?.metadata,
|
|
7406
|
+
logDetails
|
|
7407
|
+
})
|
|
7408
|
+
];
|
|
7409
|
+
}
|
|
7410
|
+
async function prepareUrlInput(rootDir, input, options) {
|
|
7411
|
+
const prepared = await prepareUrlInputs(rootDir, input, options);
|
|
7412
|
+
if (!prepared.length) {
|
|
7413
|
+
throw new Error(`No ingestable sources were extracted from ${input}.`);
|
|
7414
|
+
}
|
|
7415
|
+
return prepared[0];
|
|
6777
7416
|
}
|
|
6778
7417
|
async function collectInboxAttachmentRefs(inputDir, files) {
|
|
6779
7418
|
const refsBySource = /* @__PURE__ */ new Map();
|
|
@@ -6905,18 +7544,23 @@ async function prepareInboxHtmlInput(absolutePath, attachmentRefs) {
|
|
|
6905
7544
|
};
|
|
6906
7545
|
}
|
|
6907
7546
|
function isSupportedInboxKind(sourceKind) {
|
|
6908
|
-
return ["markdown", "text", "html", "pdf", "docx", "image"].includes(sourceKind);
|
|
7547
|
+
return ["markdown", "text", "html", "pdf", "docx", "epub", "csv", "xlsx", "pptx", "image"].includes(sourceKind);
|
|
6909
7548
|
}
|
|
6910
7549
|
async function ingestInputDetailed(rootDir, input, options) {
|
|
6911
7550
|
const { paths } = await initWorkspace(rootDir);
|
|
6912
7551
|
const normalizedOptions = normalizeIngestOptions(options);
|
|
6913
7552
|
const absoluteInput = path12.resolve(rootDir, input);
|
|
6914
7553
|
const repoRoot = isHttpUrl(input) || normalizedOptions.repoRoot ? normalizedOptions.repoRoot : await findNearestGitRoot2(absoluteInput).then((value) => value ?? path12.dirname(absoluteInput));
|
|
6915
|
-
const prepared = isHttpUrl(input) ? await
|
|
6916
|
-
return await
|
|
7554
|
+
const prepared = isHttpUrl(input) ? await prepareUrlInputs(rootDir, input, normalizedOptions) : await prepareFileInputs(rootDir, absoluteInput, repoRoot);
|
|
7555
|
+
return await persistPreparedInputs(rootDir, input, prepared, paths);
|
|
6917
7556
|
}
|
|
6918
7557
|
async function ingestInput(rootDir, input, options) {
|
|
6919
|
-
|
|
7558
|
+
const result = await ingestInputDetailed(rootDir, input, options);
|
|
7559
|
+
const manifest = [...result.created, ...result.updated, ...result.unchanged][0];
|
|
7560
|
+
if (!manifest) {
|
|
7561
|
+
throw new Error(`No source manifests were created or updated for ${input}.`);
|
|
7562
|
+
}
|
|
7563
|
+
return manifest;
|
|
6920
7564
|
}
|
|
6921
7565
|
async function addInput(rootDir, input, options = {}) {
|
|
6922
7566
|
const { paths } = await initWorkspace(rootDir);
|
|
@@ -7014,13 +7658,20 @@ async function ingestDirectory(rootDir, inputDir, options) {
|
|
|
7014
7658
|
const progress = createProgressReporter("ingest", files.length);
|
|
7015
7659
|
for (const absolutePath of files) {
|
|
7016
7660
|
const relativePath = repoRelativePathFor(absolutePath, repoRoot) ?? toPosix(path12.relative(repoRoot, absolutePath));
|
|
7017
|
-
const
|
|
7018
|
-
|
|
7019
|
-
|
|
7020
|
-
|
|
7021
|
-
|
|
7022
|
-
|
|
7023
|
-
|
|
7661
|
+
const preparedInputs = await prepareFileInputs(
|
|
7662
|
+
rootDir,
|
|
7663
|
+
absolutePath,
|
|
7664
|
+
repoRoot,
|
|
7665
|
+
sourceClassForRelativePath(relativePath, normalizedOptions)
|
|
7666
|
+
);
|
|
7667
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, preparedInputs, paths);
|
|
7668
|
+
if (result.created.length) {
|
|
7669
|
+
imported.push(...result.created);
|
|
7670
|
+
}
|
|
7671
|
+
if (result.updated.length) {
|
|
7672
|
+
updated.push(...result.updated);
|
|
7673
|
+
}
|
|
7674
|
+
if (!result.created.length && !result.updated.length && !result.removed.length) {
|
|
7024
7675
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
|
|
7025
7676
|
}
|
|
7026
7677
|
progress.tick();
|
|
@@ -7071,13 +7722,13 @@ async function importInbox(rootDir, inputDir) {
|
|
|
7071
7722
|
continue;
|
|
7072
7723
|
}
|
|
7073
7724
|
const prepared = sourceKind === "markdown" && refsBySource.has(absolutePath) ? await prepareInboxMarkdownInput(absolutePath, refsBySource.get(absolutePath) ?? []) : sourceKind === "html" && refsBySource.has(absolutePath) ? await prepareInboxHtmlInput(absolutePath, refsBySource.get(absolutePath) ?? []) : await prepareFileInput(rootDir, absolutePath);
|
|
7074
|
-
const result = await
|
|
7075
|
-
if (!result.
|
|
7725
|
+
const result = await persistPreparedInputs(rootDir, absolutePath, [prepared], paths);
|
|
7726
|
+
if (!result.created.length) {
|
|
7076
7727
|
skipped.push({ path: toPosix(path12.relative(rootDir, absolutePath)), reason: "duplicate_content" });
|
|
7077
7728
|
continue;
|
|
7078
7729
|
}
|
|
7079
|
-
attachmentCount += result.manifest.attachments?.length ?? 0;
|
|
7080
|
-
imported.push(result.
|
|
7730
|
+
attachmentCount += result.created.reduce((total, manifest) => total + (manifest.attachments?.length ?? 0), 0);
|
|
7731
|
+
imported.push(...result.created);
|
|
7081
7732
|
}
|
|
7082
7733
|
await appendLogEntry(rootDir, "inbox_import", toPosix(path12.relative(rootDir, effectiveInputDir)) || ".", [
|
|
7083
7734
|
`scanned=${files.length}`,
|
|
@@ -9336,9 +9987,19 @@ function buildSourcePage(manifest, analysis, schemaHash, metadata, relatedOutput
|
|
|
9336
9987
|
`# ${analysis.title}`,
|
|
9337
9988
|
"",
|
|
9338
9989
|
`Source ID: \`${manifest.sourceId}\``,
|
|
9990
|
+
`Source Kind: \`${manifest.sourceKind}\``,
|
|
9339
9991
|
manifest.url ? `Source URL: ${manifest.url}` : `Source Path: \`${manifest.originalPath ?? manifest.storedPath}\``,
|
|
9340
9992
|
...manifest.sourceType ? [`Source Type: \`${manifest.sourceType}\``, ""] : [""],
|
|
9341
9993
|
...manifest.sourceClass ? [`Source Class: \`${manifest.sourceClass}\``, ""] : [],
|
|
9994
|
+
...manifest.sourceGroupTitle ? [`Source Group: ${manifest.sourceGroupTitle}`] : [],
|
|
9995
|
+
...manifest.partTitle ? [`Part: ${manifest.partIndex ?? "?"}/${manifest.partCount ?? "?"} - ${manifest.partTitle}`] : [],
|
|
9996
|
+
...manifest.details && Object.keys(manifest.details).length ? [
|
|
9997
|
+
"",
|
|
9998
|
+
"## Source Details",
|
|
9999
|
+
"",
|
|
10000
|
+
...Object.entries(manifest.details).map(([key, value]) => `- ${key.replace(/_/g, " ")}: ${value}`),
|
|
10001
|
+
""
|
|
10002
|
+
] : [],
|
|
9342
10003
|
"",
|
|
9343
10004
|
"## Summary",
|
|
9344
10005
|
"",
|
|
@@ -14987,7 +15648,7 @@ async function bootstrapDemo(rootDir, input) {
|
|
|
14987
15648
|
}
|
|
14988
15649
|
|
|
14989
15650
|
// src/mcp.ts
|
|
14990
|
-
var SERVER_VERSION = "0.
|
|
15651
|
+
var SERVER_VERSION = "0.3.0";
|
|
14991
15652
|
async function createMcpServer(rootDir) {
|
|
14992
15653
|
const server = new McpServer({
|
|
14993
15654
|
name: "swarmvault",
|
|
@@ -15165,8 +15826,8 @@ async function createMcpServer(rootDir) {
|
|
|
15165
15826
|
}
|
|
15166
15827
|
},
|
|
15167
15828
|
async ({ input }) => {
|
|
15168
|
-
const
|
|
15169
|
-
return asToolText(
|
|
15829
|
+
const result = await ingestInputDetailed(rootDir, input);
|
|
15830
|
+
return asToolText(result);
|
|
15170
15831
|
}
|
|
15171
15832
|
);
|
|
15172
15833
|
server.registerTool(
|
|
@@ -15970,12 +16631,11 @@ async function syncCrawlSource(rootDir, entry, options) {
|
|
|
15970
16631
|
let updatedCount = 0;
|
|
15971
16632
|
for (const pageUrl of crawl.pages) {
|
|
15972
16633
|
const persisted = await ingestInputDetailed(rootDir, pageUrl);
|
|
15973
|
-
currentSourceIds.push(persisted.manifest.sourceId);
|
|
15974
|
-
|
|
15975
|
-
|
|
15976
|
-
|
|
15977
|
-
|
|
15978
|
-
}
|
|
16634
|
+
currentSourceIds.push(...persisted.created.map((manifest) => manifest.sourceId));
|
|
16635
|
+
currentSourceIds.push(...persisted.updated.map((manifest) => manifest.sourceId));
|
|
16636
|
+
currentSourceIds.push(...persisted.unchanged.map((manifest) => manifest.sourceId));
|
|
16637
|
+
importedCount += persisted.created.length;
|
|
16638
|
+
updatedCount += persisted.updated.length;
|
|
15979
16639
|
}
|
|
15980
16640
|
let removedCount = 0;
|
|
15981
16641
|
for (const sourceId of previousSourceIds) {
|
|
@@ -17237,6 +17897,7 @@ export {
|
|
|
17237
17897
|
importInbox,
|
|
17238
17898
|
ingestDirectory,
|
|
17239
17899
|
ingestInput,
|
|
17900
|
+
ingestInputDetailed,
|
|
17240
17901
|
initVault,
|
|
17241
17902
|
initWorkspace,
|
|
17242
17903
|
installAgent,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmvaultai/engine",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Core engine for SwarmVault: ingest, compile, query, lint, and provider abstractions.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -44,6 +44,7 @@
|
|
|
44
44
|
"@mozilla/readability": "^0.6.0",
|
|
45
45
|
"@vscode/tree-sitter-wasm": "^0.3.1",
|
|
46
46
|
"chokidar": "^4.0.3",
|
|
47
|
+
"csv-parse": "^6.2.1",
|
|
47
48
|
"fflate": "^0.8.2",
|
|
48
49
|
"gray-matter": "^4.0.3",
|
|
49
50
|
"ignore": "^7.0.5",
|
|
@@ -55,6 +56,7 @@
|
|
|
55
56
|
"tree-sitter-wasms": "^0.1.13",
|
|
56
57
|
"turndown": "^7.2.1",
|
|
57
58
|
"typescript": "^5.9.3",
|
|
59
|
+
"xlsx": "^0.18.5",
|
|
58
60
|
"yaml": "^2.8.1",
|
|
59
61
|
"zod": "^4.1.8"
|
|
60
62
|
},
|