@swarmvaultai/engine 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -3
- package/dist/index.d.ts +6 -2
- package/dist/index.js +252 -17
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -117,9 +117,10 @@ This matters because many "OpenAI-compatible" backends only implement part of th
|
|
|
117
117
|
|
|
118
118
|
### Ingest
|
|
119
119
|
|
|
120
|
-
- `ingestInput(rootDir, input)` ingests a local path or URL
|
|
120
|
+
- `ingestInput(rootDir, input, { includeAssets, maxAssetSize })` ingests a local path or URL
|
|
121
121
|
- `importInbox(rootDir, inputDir?)` recursively imports supported inbox files and browser-clipper style bundles
|
|
122
122
|
- `.js`, `.jsx`, `.ts`, and `.tsx` inputs are treated as code sources and compiled into both source pages and `wiki/code/` module pages
|
|
123
|
+
- HTML and markdown URL ingests localize remote image references into `raw/assets/<sourceId>/` by default and rewrite the stored markdown to local relative paths
|
|
123
124
|
|
|
124
125
|
### Compile + Query
|
|
125
126
|
|
|
@@ -160,8 +161,8 @@ Running the engine produces a local workspace with these main areas:
|
|
|
160
161
|
- `swarmvault.schema.md`: vault-specific compile and query instructions
|
|
161
162
|
- `inbox/`: capture staging area for markdown bundles and imported files
|
|
162
163
|
- `raw/sources/`: immutable source copies
|
|
163
|
-
- `raw/assets/`: copied attachments referenced by ingested markdown bundles
|
|
164
|
-
- `wiki/`: generated markdown pages, staged candidates, saved query outputs, exploration hub pages, and a human-only `insights/` area
|
|
164
|
+
- `raw/assets/`: copied attachments referenced by ingested markdown bundles and remote URL ingests
|
|
165
|
+
- `wiki/`: generated markdown pages, the append-only `log.md` activity trail, staged candidates, saved query outputs, exploration hub pages, and a human-only `insights/` area
|
|
165
166
|
- `wiki/outputs/assets/`: local chart/image artifacts and JSON manifests for saved visual outputs
|
|
166
167
|
- `wiki/code/`: generated module pages for ingested JS/TS sources
|
|
167
168
|
- `wiki/projects/`: generated project rollups over canonical pages
|
|
@@ -178,6 +179,7 @@ Running the engine produces a local workspace with these main areas:
|
|
|
178
179
|
|
|
179
180
|
Saved outputs are indexed immediately into the graph page registry and search index, then linked back into compiled source, concept, and entity pages immediately through the lightweight artifact sync path. New concept and entity pages stage into `wiki/candidates/` first and promote to active pages on the next matching compile. Insight pages are indexed into search and page reads, but compile does not mutate them. Project-scoped pages receive `project_ids`, project tags, and layered root-plus-project schema hashes when all contributing sources resolve to the same configured project.
|
|
180
181
|
JS/TS code sources also emit module and symbol nodes into `state/graph.json`, so local imports, exports, inheritance, and same-module call edges are queryable through the same viewer and search pipeline.
|
|
182
|
+
Ingest, inbox import, compile, query, lint, review, and candidate operations also append human-readable entries to `wiki/log.md`.
|
|
181
183
|
|
|
182
184
|
## Notes
|
|
183
185
|
|
package/dist/index.d.ts
CHANGED
|
@@ -189,6 +189,10 @@ interface SourceAttachment {
|
|
|
189
189
|
mimeType: string;
|
|
190
190
|
originalPath?: string;
|
|
191
191
|
}
|
|
192
|
+
/**
 * Optional settings accepted by `ingestInput` as its third argument.
 * Both fields are optional; the engine normalizes them before use.
 */
interface IngestOptions {
|
|
193
|
+
/** Whether remote image references of URL ingests are downloaded and stored locally. Treated as `true` when omitted. */
includeAssets?: boolean;
|
|
194
|
+
/** Per-asset size cap in bytes. Defaults to 10 MiB when omitted; the value is floored and clamped to >= 0, and 0 skips asset collection. */
maxAssetSize?: number;
|
|
195
|
+
}
|
|
192
196
|
interface SourceManifest {
|
|
193
197
|
sourceId: string;
|
|
194
198
|
title: string;
|
|
@@ -674,7 +678,7 @@ declare function initWorkspace(rootDir: string): Promise<{
|
|
|
674
678
|
paths: ResolvedPaths;
|
|
675
679
|
}>;
|
|
676
680
|
|
|
677
|
-
declare function ingestInput(rootDir: string, input: string): Promise<SourceManifest>;
|
|
681
|
+
declare function ingestInput(rootDir: string, input: string, options?: IngestOptions): Promise<SourceManifest>;
|
|
678
682
|
declare function importInbox(rootDir: string, inputDir?: string): Promise<InboxImportResult>;
|
|
679
683
|
declare function listManifests(rootDir: string): Promise<SourceManifest[]>;
|
|
680
684
|
declare function readExtractedText(rootDir: string, manifest: SourceManifest): Promise<string | undefined>;
|
|
@@ -756,4 +760,4 @@ declare function watchVault(rootDir: string, options?: WatchOptions): Promise<Wa
|
|
|
756
760
|
declare function createWebSearchAdapter(id: string, config: WebSearchProviderConfig, rootDir: string): Promise<WebSearchAdapter>;
|
|
757
761
|
declare function getWebSearchAdapterForTask(rootDir: string, task: "deepLintProvider"): Promise<WebSearchAdapter>;
|
|
758
762
|
|
|
759
|
-
export { type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type ExploreOptions, type ExploreResult, type ExploreStepResult, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GraphArtifact, type GraphEdge, type GraphNode, type GraphPage, type ImageGenerationRequest, type ImageGenerationResponse, type InboxImportResult, type InboxImportSkip, type InitOptions, type LintFinding, type LintOptions, type OrchestrationConfig, type OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceClaim, type SourceKind, type SourceManifest, type VaultConfig, type WatchController, type WatchOptions, type WatchRunRecord, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type WebSearchResult, acceptApproval, 
archiveCandidate, assertProviderCapability, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, exploreVault, exportGraphHtml, getProviderForTask, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestInput, initVault, initWorkspace, installAgent, installConfiguredAgents, lintVault, listApprovals, listCandidates, listManifests, listPages, listSchedules, loadVaultConfig, loadVaultSchema, loadVaultSchemas, promoteCandidate, providerCapabilitySchema, providerTypeSchema, queryVault, readApproval, readExtractedText, readPage, rejectApproval, resolvePaths, runSchedule, searchVault, serveSchedules, startGraphServer, startMcpServer, watchVault, webSearchProviderTypeSchema };
|
|
763
|
+
export { type AnalyzedTerm, type ApprovalChangeType, type ApprovalDetail, type ApprovalEntry, type ApprovalEntryDetail, type ApprovalEntryStatus, type ApprovalManifest, type ApprovalSummary, type CandidateRecord, type ChartDatum, type ChartSpec, type ClaimStatus, type CodeAnalysis, type CodeDiagnostic, type CodeImport, type CodeLanguage, type CodeSymbol, type CodeSymbolKind, type CommandRoleExecutorConfig, type CompileOptions, type CompileResult, type CompileState, type ExploreOptions, type ExploreResult, type ExploreStepResult, type Freshness, type GenerationAttachment, type GenerationRequest, type GenerationResponse, type GraphArtifact, type GraphEdge, type GraphNode, type GraphPage, type ImageGenerationRequest, type ImageGenerationResponse, type InboxImportResult, type InboxImportSkip, type IngestOptions, type InitOptions, type LintFinding, type LintOptions, type OrchestrationConfig, type OrchestrationFinding, type OrchestrationProposal, type OrchestrationRole, type OrchestrationRoleConfig, type OrchestrationRoleResult, type OutputAsset, type OutputAssetRole, type OutputFormat, type OutputOrigin, type PageKind, type PageManager, type PageStatus, type Polarity, type ProviderAdapter, type ProviderCapability, type ProviderConfig, type ProviderRoleExecutorConfig, type ProviderType, type QueryOptions, type QueryResult, type ResolvedPaths, type ReviewActionResult, type RoleExecutorConfig, type SceneElement, type SceneSpec, type ScheduleController, type ScheduleJobConfig, type ScheduleStateRecord, type ScheduleTriggerConfig, type ScheduledCompileTask, type ScheduledExploreTask, type ScheduledLintTask, type ScheduledQueryTask, type ScheduledRunResult, type ScheduledTaskConfig, type SearchResult, type SourceAnalysis, type SourceAttachment, type SourceClaim, type SourceKind, type SourceManifest, type VaultConfig, type WatchController, type WatchOptions, type WatchRunRecord, type WebSearchAdapter, type WebSearchProviderConfig, type WebSearchProviderType, type 
WebSearchResult, acceptApproval, archiveCandidate, assertProviderCapability, bootstrapDemo, compileVault, createMcpServer, createProvider, createWebSearchAdapter, defaultVaultConfig, defaultVaultSchema, exploreVault, exportGraphHtml, getProviderForTask, getWebSearchAdapterForTask, getWorkspaceInfo, importInbox, ingestInput, initVault, initWorkspace, installAgent, installConfiguredAgents, lintVault, listApprovals, listCandidates, listManifests, listPages, listSchedules, loadVaultConfig, loadVaultSchema, loadVaultSchemas, promoteCandidate, providerCapabilitySchema, providerTypeSchema, queryVault, readApproval, readExtractedText, readPage, rejectApproval, resolvePaths, runSchedule, searchVault, serveSchedules, startGraphServer, startMcpServer, watchVault, webSearchProviderTypeSchema };
|
package/dist/index.js
CHANGED
|
@@ -670,6 +670,7 @@ async function appendWatchRun(rootDir, run) {
|
|
|
670
670
|
}
|
|
671
671
|
|
|
672
672
|
// src/ingest.ts
|
|
673
|
+
// Fallback per-asset size cap in bytes (10 MiB), used when the caller does not supply maxAssetSize.
var DEFAULT_MAX_ASSET_SIZE = 10 * 1024 * 1024;
|
|
673
674
|
function inferKind(mimeType, filePath) {
|
|
674
675
|
if (inferCodeLanguage(filePath, mimeType)) {
|
|
675
676
|
return "code";
|
|
@@ -698,6 +699,12 @@ function titleFromText(fallback, content) {
|
|
|
698
699
|
/**
 * Guess a MIME type for a path or file name using `mime.lookup`.
 *
 * @param target Path, file name, or URL pathname to inspect.
 * @returns The looked-up type, or "application/octet-stream" when the lookup yields nothing.
 */
function guessMimeType(target) {
  const detected = mime.lookup(target);
  if (detected) {
    return detected;
  }
  return "application/octet-stream";
}
|
|
702
|
+
/**
 * Fill in defaults for caller-supplied ingest options.
 *
 * @param options Optional `{ includeAssets, maxAssetSize }` object; may be undefined.
 * @returns An object with `includeAssets` (default `true`) and `maxAssetSize`
 *   (default `DEFAULT_MAX_ASSET_SIZE`), floored to an integer and clamped to >= 0.
 */
function normalizeIngestOptions(options) {
  const requested = Number(options?.maxAssetSize ?? DEFAULT_MAX_ASSET_SIZE);
  // A NaN limit would survive Math.floor/Math.max as NaN, and every later
  // "size > limit" comparison would then be false, silently removing the cap.
  // Fall back to the default instead.
  const limit = Number.isNaN(requested) ? DEFAULT_MAX_ASSET_SIZE : requested;
  return {
    includeAssets: options?.includeAssets ?? true,
    maxAssetSize: Math.max(0, Math.floor(limit))
  };
}
|
|
701
708
|
function buildCompositeHash(payloadBytes, attachments = []) {
|
|
702
709
|
if (!attachments.length) {
|
|
703
710
|
return sha256(payloadBytes);
|
|
@@ -742,6 +749,40 @@ function extractMarkdownReferences(content) {
|
|
|
742
749
|
}
|
|
743
750
|
return references;
|
|
744
751
|
}
|
|
752
|
+
/**
 * Turn a raw image reference (markdown link target or HTML `src`) into an
 * absolute http(s) URL string without a fragment.
 *
 * @param value Raw reference text; may carry a markdown title and/or be wrapped in `<...>`.
 * @param baseUrl Base URL used to resolve relative references.
 * @returns The absolute URL string, or `null` when the reference is empty,
 *   unparsable, or not an http/https resource (e.g. `data:` or `mailto:`).
 */
function normalizeRemoteReference(value, baseUrl) {
  const trimmed = value.trim();
  let destination;
  // An angle-bracketed destination ends at its closing ">", so a trailing
  // title must not leave the ">" attached, and spaces inside the brackets
  // belong to the destination.
  const bracketed = /^<([^<>]*)>/.exec(trimmed);
  if (bracketed) {
    destination = bracketed[1];
  } else {
    const unwrapped = trimmed.replace(/^<|>$/g, "");
    // Cut at the first whitespace that is outside double quotes: what follows is the title.
    destination = unwrapped.split(/\s+(?=(?:[^"]*"[^"]*")*[^"]*$)/, 1)[0] ?? "";
  }
  const candidate = destination.split("#")[0]?.trim();
  if (!candidate) {
    return null;
  }
  const lowered = candidate.toLowerCase();
  if (lowered.startsWith("data:") || lowered.startsWith("mailto:") || lowered.startsWith("#")) {
    return null;
  }
  let resolved;
  try {
    resolved = new URL(candidate, baseUrl);
  } catch {
    return null;
  }
  if (!/^https?:$/i.test(resolved.protocol)) {
    return null;
  }
  resolved.hash = "";
  return resolved.toString();
}
|
|
775
|
+
/**
 * Collect the normalized targets of every `![alt](target)` image in a markdown string.
 *
 * @param content Markdown text to scan.
 * @param baseUrl Base URL passed to `normalizeRemoteReference` for relative targets.
 * @returns Normalized references in document order; targets that normalize to a
 *   falsy value are left out. Duplicates are kept.
 */
function extractMarkdownImageReferences(content, baseUrl) {
  const imagePattern = /!\[[^\]]*]\(([^)]+)\)/g;
  return Array.from(
    content.matchAll(imagePattern),
    (match) => normalizeRemoteReference(match[1] ?? "", baseUrl)
  ).filter(Boolean);
}
|
|
745
786
|
async function convertHtmlToMarkdown(html, url) {
|
|
746
787
|
const dom = new JSDOM(html, { url });
|
|
747
788
|
const article = new Readability(dom.window.document).parse();
|
|
@@ -766,6 +807,142 @@ async function readManifestByHash(manifestsDir, contentHash) {
|
|
|
766
807
|
}
|
|
767
808
|
return null;
|
|
768
809
|
}
|
|
810
|
+
/**
 * Decide the MIME type of a fetched URL from its `content-type` header and its path.
 *
 * @param input The URL string that was fetched.
 * @param response The fetch response; only `headers.get("content-type")` is read.
 * @returns The header type (parameters stripped, lowercased), unless the header is
 *   missing or is a generic `text/plain` / `application/octet-stream` while the
 *   URL path yields a more specific guess, in which case the guess is returned.
 */
function resolveUrlMimeType(input, response) {
  // Media types are case-insensitive, so normalize the case before comparing
  // against the lowercase literals used here.
  const headerMimeType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
  const guessedMimeType = guessMimeType(new URL(input).pathname);
  if (!headerMimeType) {
    return guessedMimeType;
  }
  const headerIsGeneric = headerMimeType === "text/plain" || headerMimeType === "application/octet-stream";
  if (headerIsGeneric && guessedMimeType !== "application/octet-stream") {
    return guessedMimeType;
  }
  return headerMimeType;
}
|
|
821
|
+
/**
 * Derive the relative storage path for a downloaded remote asset.
 *
 * The path is built from the sanitized `hostname + pathname` of the URL. The file
 * name gets the first 8 characters of `sha256(assetUrl)` appended, and an
 * extension taken from `mimeType` (or "bin") when the URL path has none.
 *
 * @param assetUrl Absolute URL of the asset.
 * @param mimeType MIME type used to pick an extension when the path lacks one.
 * @returns A POSIX-style relative path.
 */
function buildRemoteAssetRelativePath(assetUrl, mimeType) {
  const { hostname, pathname } = new URL(assetUrl);
  const safePath = sanitizeAssetRelativePath(`${hostname}${pathname || "/asset"}`);
  const existingExtension = path4.posix.extname(safePath);
  const parentDir = path4.posix.dirname(safePath);
  const stem = existingExtension
    ? path4.posix.basename(safePath, existingExtension)
    : path4.posix.basename(safePath);
  const finalExtension = existingExtension || `.${mime.extension(mimeType) || "bin"}`;
  const fileName = `${stem || "asset"}-${sha256(assetUrl).slice(0, 8)}${finalExtension}`;
  if (parentDir === ".") {
    return fileName;
  }
  return path4.posix.join(parentDir, fileName);
}
|
|
831
|
+
async function readResponseBytesWithinLimit(response, maxBytes) {
|
|
832
|
+
const contentLength = Number.parseInt(response.headers.get("content-length") ?? "", 10);
|
|
833
|
+
if (Number.isFinite(contentLength) && contentLength > maxBytes) {
|
|
834
|
+
throw new Error(`asset exceeds max size (${contentLength} > ${maxBytes})`);
|
|
835
|
+
}
|
|
836
|
+
if (!response.body) {
|
|
837
|
+
const bytes = Buffer.from(await response.arrayBuffer());
|
|
838
|
+
if (bytes.length > maxBytes) {
|
|
839
|
+
throw new Error(`asset exceeds max size (${bytes.length} > ${maxBytes})`);
|
|
840
|
+
}
|
|
841
|
+
return bytes;
|
|
842
|
+
}
|
|
843
|
+
const reader = response.body.getReader();
|
|
844
|
+
const chunks = [];
|
|
845
|
+
let total = 0;
|
|
846
|
+
while (true) {
|
|
847
|
+
const { done, value } = await reader.read();
|
|
848
|
+
if (done) {
|
|
849
|
+
break;
|
|
850
|
+
}
|
|
851
|
+
total += value.byteLength;
|
|
852
|
+
if (total > maxBytes) {
|
|
853
|
+
await reader.cancel("asset exceeds configured size limit");
|
|
854
|
+
throw new Error(`asset exceeds max size (${total} > ${maxBytes})`);
|
|
855
|
+
}
|
|
856
|
+
chunks.push(Buffer.from(value));
|
|
857
|
+
}
|
|
858
|
+
return Buffer.concat(chunks);
|
|
859
|
+
}
|
|
860
|
+
/**
 * Download one remote image and describe it as an attachment record.
 *
 * @param assetUrl Absolute URL of the image.
 * @param maxAssetSize Size cap in bytes, enforced by `readResponseBytesWithinLimit`.
 * @returns `{ relativePath, mimeType, originalPath, bytes }`, where `originalPath` is `assetUrl`.
 * @throws Error when the response is not OK, the resolved MIME type does not start
 *   with `image/`, or the body exceeds `maxAssetSize`.
 */
async function fetchRemoteImageAttachment(assetUrl, maxAssetSize) {
  const response = await fetch(assetUrl);
  // Rejected responses still carry an unread body; release it before throwing.
  const discardBody = async () => {
    await response.body?.cancel().catch(() => {});
  };
  if (!response.ok) {
    await discardBody();
    throw new Error(`failed with ${response.status} ${response.statusText}`);
  }
  // Media types are case-insensitive; lowercase so the "image/" prefix test is reliable.
  const headerMimeType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
  const mimeType = headerMimeType || guessMimeType(new URL(assetUrl).pathname);
  if (!mimeType.startsWith("image/")) {
    await discardBody();
    throw new Error(`unsupported mime type ${mimeType}`);
  }
  const bytes = await readResponseBytesWithinLimit(response, maxAssetSize);
  return {
    relativePath: buildRemoteAssetRelativePath(assetUrl, mimeType),
    mimeType,
    originalPath: assetUrl,
    bytes
  };
}
|
|
877
|
+
/**
 * Download every distinct URL in `assetUrls`, one after another, as image attachments.
 *
 * Individual failures are tolerated: a URL whose download throws is counted in
 * `skippedCount` rather than aborting the batch.
 *
 * @param assetUrls Absolute image URLs; duplicates are fetched once.
 * @param options Normalized ingest options (`includeAssets`, `maxAssetSize`).
 * @returns `{ attachments, skippedCount }`; both are empty/zero when assets are
 *   disabled, the size limit is 0, or there are no URLs.
 */
async function collectRemoteImageAttachments(assetUrls, options) {
  const assetsDisabled = !options.includeAssets || options.maxAssetSize === 0;
  if (assetsDisabled || !assetUrls.length) {
    return { attachments: [], skippedCount: 0 };
  }
  const result = { attachments: [], skippedCount: 0 };
  for (const uniqueUrl of new Set(assetUrls)) {
    try {
      const attachment = await fetchRemoteImageAttachment(uniqueUrl, options.maxAssetSize);
      result.attachments.push(attachment);
    } catch {
      result.skippedCount += 1;
    }
  }
  return result;
}
|
|
892
|
+
/**
 * Collect the normalized `src` of every `<img src>` element in an HTML document.
 *
 * @param html HTML source text.
 * @param baseUrl Document URL, used both for parsing and for resolving relative `src` values.
 * @returns Normalized references in document order; images whose `src` is empty
 *   or does not normalize are left out.
 */
function extractHtmlImageReferences(html, baseUrl) {
  const { document } = new JSDOM(html, { url: baseUrl }).window;
  const found = [];
  document.querySelectorAll("img[src]").forEach((image) => {
    const src = image.getAttribute("src");
    const normalized = src ? normalizeRemoteReference(src, baseUrl) : null;
    if (normalized) {
      found.push(normalized);
    }
  });
  return found;
}
|
|
908
|
+
/**
 * Point `<img src>` attributes at replacement paths and return the re-serialized HTML.
 *
 * @param html HTML source text.
 * @param baseUrl Document URL, used for parsing and for normalizing each `src`.
 * @param replacements Map from normalized remote URL to the replacement `src` value.
 * @returns The serialized document; images without a matching replacement keep their `src`.
 */
function rewriteHtmlImageReferences(html, baseUrl, replacements) {
  const dom = new JSDOM(html, { url: baseUrl });
  dom.window.document.querySelectorAll("img[src]").forEach((image) => {
    const src = image.getAttribute("src");
    if (!src) {
      return;
    }
    const key = normalizeRemoteReference(src, baseUrl);
    const localPath = key ? replacements.get(key) : void 0;
    if (localPath) {
      image.setAttribute("src", localPath);
    }
  });
  return dom.serialize();
}
|
|
924
|
+
/**
 * Rewrite markdown image targets that have a replacement, keyed by their normalized URL.
 *
 * @param content Markdown text.
 * @param baseUrl Base URL used to normalize each image target.
 * @param replacements Map from normalized remote URL to the replacement target.
 * @returns The markdown with matching image destinations replaced. Images without a
 *   replacement are left untouched, and an image title after the destination is kept.
 */
function rewriteMarkdownImageReferences(content, baseUrl, replacements) {
  return content.replace(/(!\[[^\]]*]\()([^)]+)(\))/g, (fullMatch, prefix, target, suffix) => {
    const normalized = normalizeRemoteReference(target, baseUrl);
    const replacement = normalized ? replacements.get(normalized) : void 0;
    if (!replacement) {
      return fullMatch;
    }
    // Only the destination is swapped. Whatever follows it (the optional title)
    // is preserved instead of being dropped along with the old URL.
    const trimmed = target.trim();
    const bracketed = /^<[^<>]*>/.exec(trimmed);
    const destinationLength = bracketed
      ? bracketed[0].length
      : (trimmed.split(/\s+(?=(?:[^"]*"[^"]*")*[^"]*$)/, 1)[0] ?? "").length;
    const title = trimmed.slice(destinationLength).trim();
    return `${prefix}${replacement}${title ? ` ${title}` : ""}${suffix}`;
  });
}
|
|
934
|
+
/**
 * Rewrite markdown image targets by exact lookup of the destination text.
 *
 * @param content Markdown text.
 * @param replacements Map from the literal destination (as written, without title or
 *   angle brackets) to the replacement target.
 * @returns The markdown with matching image destinations replaced. Images without a
 *   replacement are left untouched, and an image title after the destination is kept.
 */
function rewriteMarkdownImageTargets(content, replacements) {
  return content.replace(/(!\[[^\]]*]\()([^)]+)(\))/g, (fullMatch, prefix, target, suffix) => {
    const trimmed = target.trim();
    let candidate;
    let remainder;
    // An angle-bracketed destination ends at its ">", so a following title must
    // not leave the ">" glued to the lookup key.
    const bracketed = /^<([^<>]*)>/.exec(trimmed);
    if (bracketed) {
      candidate = bracketed[1].trim();
      remainder = trimmed.slice(bracketed[0].length);
    } else {
      const unwrapped = trimmed.replace(/^<|>$/g, "");
      // Cut at the first whitespace outside double quotes; the rest is the title.
      const withoutTitle = unwrapped.split(/\s+(?=(?:[^"]*"[^"]*")*[^"]*$)/, 1)[0] ?? "";
      candidate = withoutTitle.trim();
      remainder = unwrapped.slice(withoutTitle.length);
    }
    const replacement = replacements.get(candidate);
    if (!replacement) {
      return fullMatch;
    }
    // Keep the title instead of discarding it together with the old destination.
    const title = remainder.trim();
    return `${prefix}${replacement}${title ? ` ${title}` : ""}${suffix}`;
  });
}
|
|
769
946
|
async function persistPreparedInput(rootDir, prepared, paths) {
|
|
770
947
|
await ensureDir(paths.rawSourcesDir);
|
|
771
948
|
await ensureDir(paths.rawAssetsDir);
|
|
@@ -817,7 +994,8 @@ async function persistPreparedInput(rootDir, prepared, paths) {
|
|
|
817
994
|
await appendLogEntry(rootDir, "ingest", prepared.title, [
|
|
818
995
|
`source_id=${sourceId}`,
|
|
819
996
|
`kind=${prepared.sourceKind}`,
|
|
820
|
-
`attachments=${manifestAttachments.length}
|
|
997
|
+
`attachments=${manifestAttachments.length}`,
|
|
998
|
+
...prepared.logDetails ?? []
|
|
821
999
|
]);
|
|
822
1000
|
return { manifest, isNew: true };
|
|
823
1001
|
}
|
|
@@ -847,33 +1025,86 @@ async function prepareFileInput(_rootDir, absoluteInput) {
|
|
|
847
1025
|
extractedText
|
|
848
1026
|
};
|
|
849
1027
|
}
|
|
850
|
-
async function prepareUrlInput(input) {
|
|
1028
|
+
async function prepareUrlInput(input, options) {
|
|
851
1029
|
const response = await fetch(input);
|
|
852
1030
|
if (!response.ok) {
|
|
853
1031
|
throw new Error(`Failed to fetch ${input}: ${response.status} ${response.statusText}`);
|
|
854
1032
|
}
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
let
|
|
858
|
-
|
|
1033
|
+
const inputUrl = new URL(input);
|
|
1034
|
+
const originalPayloadBytes = Buffer.from(await response.arrayBuffer());
|
|
1035
|
+
let payloadBytes = originalPayloadBytes;
|
|
1036
|
+
let mimeType = resolveUrlMimeType(input, response);
|
|
1037
|
+
let sourceKind = inferKind(mimeType, inputUrl.pathname);
|
|
1038
|
+
const language = inferCodeLanguage(inputUrl.pathname, mimeType);
|
|
859
1039
|
let storedExtension = ".bin";
|
|
860
|
-
let title =
|
|
1040
|
+
let title = inputUrl.hostname + inputUrl.pathname;
|
|
861
1041
|
let extractedText;
|
|
1042
|
+
let attachments;
|
|
1043
|
+
let contentHash;
|
|
1044
|
+
const logDetails = [];
|
|
862
1045
|
if (sourceKind === "html" || mimeType.startsWith("text/html")) {
|
|
863
|
-
const html =
|
|
864
|
-
const
|
|
865
|
-
title =
|
|
1046
|
+
const html = originalPayloadBytes.toString("utf8");
|
|
1047
|
+
const initialConversion = await convertHtmlToMarkdown(html, input);
|
|
1048
|
+
title = initialConversion.title;
|
|
1049
|
+
let localizedHtml = html;
|
|
1050
|
+
let localAssetReplacements;
|
|
1051
|
+
if (options.includeAssets) {
|
|
1052
|
+
const { attachments: remoteAttachments, skippedCount } = await collectRemoteImageAttachments(
|
|
1053
|
+
extractHtmlImageReferences(html, input),
|
|
1054
|
+
options
|
|
1055
|
+
);
|
|
1056
|
+
if (remoteAttachments.length) {
|
|
1057
|
+
attachments = remoteAttachments;
|
|
1058
|
+
contentHash = buildCompositeHash(originalPayloadBytes, remoteAttachments);
|
|
1059
|
+
const sourceId = `${slugify(title)}-${contentHash.slice(0, 8)}`;
|
|
1060
|
+
localAssetReplacements = new Map(
|
|
1061
|
+
remoteAttachments.map((attachment) => [attachment.originalPath ?? "", `../assets/${sourceId}/${attachment.relativePath}`])
|
|
1062
|
+
);
|
|
1063
|
+
localizedHtml = rewriteHtmlImageReferences(html, input, localAssetReplacements);
|
|
1064
|
+
logDetails.push(`remote_assets=${remoteAttachments.length}`);
|
|
1065
|
+
}
|
|
1066
|
+
if (skippedCount) {
|
|
1067
|
+
logDetails.push(`remote_asset_skips=${skippedCount}`);
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
1070
|
+
const converted = localizedHtml === html && !attachments?.length ? initialConversion : await convertHtmlToMarkdown(localizedHtml, input);
|
|
866
1071
|
extractedText = converted.markdown;
|
|
867
|
-
|
|
1072
|
+
if (localAssetReplacements?.size) {
|
|
1073
|
+
const absoluteLocalAssetReplacements = new Map(
|
|
1074
|
+
[...localAssetReplacements.values()].map((replacement) => [new URL(replacement, input).toString(), replacement])
|
|
1075
|
+
);
|
|
1076
|
+
extractedText = rewriteMarkdownImageTargets(extractedText, absoluteLocalAssetReplacements);
|
|
1077
|
+
}
|
|
1078
|
+
payloadBytes = Buffer.from(extractedText, "utf8");
|
|
868
1079
|
mimeType = "text/markdown";
|
|
869
1080
|
sourceKind = "markdown";
|
|
870
1081
|
storedExtension = ".md";
|
|
871
1082
|
} else {
|
|
872
|
-
const extension = path4.extname(
|
|
1083
|
+
const extension = path4.extname(inputUrl.pathname);
|
|
873
1084
|
storedExtension = extension || `.${mime.extension(mimeType) || "bin"}`;
|
|
874
1085
|
if (sourceKind === "markdown" || sourceKind === "text" || sourceKind === "code") {
|
|
875
1086
|
extractedText = payloadBytes.toString("utf8");
|
|
876
|
-
title = titleFromText(title ||
|
|
1087
|
+
title = titleFromText(title || inputUrl.hostname, extractedText);
|
|
1088
|
+
if (sourceKind === "markdown" && options.includeAssets) {
|
|
1089
|
+
const { attachments: remoteAttachments, skippedCount } = await collectRemoteImageAttachments(
|
|
1090
|
+
extractMarkdownImageReferences(extractedText, input),
|
|
1091
|
+
options
|
|
1092
|
+
);
|
|
1093
|
+
if (remoteAttachments.length) {
|
|
1094
|
+
attachments = remoteAttachments;
|
|
1095
|
+
contentHash = buildCompositeHash(originalPayloadBytes, remoteAttachments);
|
|
1096
|
+
const sourceId = `${slugify(title)}-${contentHash.slice(0, 8)}`;
|
|
1097
|
+
const replacements = new Map(
|
|
1098
|
+
remoteAttachments.map((attachment) => [attachment.originalPath ?? "", `../assets/${sourceId}/${attachment.relativePath}`])
|
|
1099
|
+
);
|
|
1100
|
+
extractedText = rewriteMarkdownImageReferences(extractedText, input, replacements);
|
|
1101
|
+
payloadBytes = Buffer.from(extractedText, "utf8");
|
|
1102
|
+
logDetails.push(`remote_assets=${remoteAttachments.length}`);
|
|
1103
|
+
}
|
|
1104
|
+
if (skippedCount) {
|
|
1105
|
+
logDetails.push(`remote_asset_skips=${skippedCount}`);
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
877
1108
|
}
|
|
878
1109
|
}
|
|
879
1110
|
return {
|
|
@@ -885,7 +1116,10 @@ async function prepareUrlInput(input) {
|
|
|
885
1116
|
mimeType,
|
|
886
1117
|
storedExtension,
|
|
887
1118
|
payloadBytes,
|
|
888
|
-
extractedText
|
|
1119
|
+
extractedText,
|
|
1120
|
+
attachments,
|
|
1121
|
+
contentHash,
|
|
1122
|
+
logDetails
|
|
889
1123
|
};
|
|
890
1124
|
}
|
|
891
1125
|
async function collectInboxAttachmentRefs(inputDir, files) {
|
|
@@ -975,9 +1209,10 @@ async function prepareInboxMarkdownInput(absolutePath, attachmentRefs) {
|
|
|
975
1209
|
function isSupportedInboxKind(sourceKind) {
|
|
976
1210
|
return ["markdown", "text", "html", "pdf", "image"].includes(sourceKind);
|
|
977
1211
|
}
|
|
978
|
-
async function ingestInput(rootDir, input) {
|
|
1212
|
+
/**
 * Ingest a single input into the workspace rooted at `rootDir`.
 *
 * Inputs matching `http://` or `https://` (case-insensitive) are prepared through
 * `prepareUrlInput` with the normalized options; anything else is resolved against
 * `rootDir` and prepared through `prepareFileInput`. The prepared input is then
 * persisted via `persistPreparedInput`.
 *
 * @param rootDir Workspace root; also passed to `initWorkspace`.
 * @param input URL or path (resolved against `rootDir`).
 * @param options Optional ingest options, normalized with `normalizeIngestOptions`.
 * @returns The `manifest` of the persist result.
 */
async function ingestInput(rootDir, input, options) {
|
|
979
1213
|
const { paths } = await initWorkspace(rootDir);
|
|
980
|
-
const
|
|
1214
|
+
const normalizedOptions = normalizeIngestOptions(options);
|
|
1215
|
+
const prepared = /^https?:\/\//i.test(input) ? await prepareUrlInput(input, normalizedOptions) : await prepareFileInput(rootDir, path4.resolve(rootDir, input));
|
|
981
1216
|
const result = await persistPreparedInput(rootDir, prepared, paths);
|
|
982
1217
|
return result.manifest;
|
|
983
1218
|
}
|
|
@@ -6531,7 +6766,7 @@ async function bootstrapDemo(rootDir, input) {
|
|
|
6531
6766
|
}
|
|
6532
6767
|
|
|
6533
6768
|
// src/mcp.ts
|
|
6534
|
-
var SERVER_VERSION = "0.1.
|
|
6769
|
+
var SERVER_VERSION = "0.1.12";
|
|
6535
6770
|
async function createMcpServer(rootDir) {
|
|
6536
6771
|
const server = new McpServer({
|
|
6537
6772
|
name: "swarmvault",
|