@j0hanz/superfetch 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +243 -494
- package/dist/cache.d.ts +2 -3
- package/dist/cache.js +51 -241
- package/dist/config.d.ts +6 -1
- package/dist/config.js +29 -34
- package/dist/crypto.d.ts +0 -1
- package/dist/crypto.js +0 -1
- package/dist/dom-noise-removal.d.ts +5 -0
- package/dist/dom-noise-removal.js +485 -0
- package/dist/errors.d.ts +0 -1
- package/dist/errors.js +8 -6
- package/dist/fetch.d.ts +0 -1
- package/dist/fetch.js +71 -61
- package/dist/host-normalization.d.ts +1 -0
- package/dist/host-normalization.js +47 -0
- package/dist/http-native.d.ts +5 -0
- package/dist/http-native.js +693 -0
- package/dist/index.d.ts +0 -1
- package/dist/index.js +1 -2
- package/dist/instructions.md +22 -20
- package/dist/json.d.ts +1 -0
- package/dist/json.js +29 -0
- package/dist/language-detection.d.ts +12 -0
- package/dist/language-detection.js +291 -0
- package/dist/markdown-cleanup.d.ts +18 -0
- package/dist/markdown-cleanup.js +283 -0
- package/dist/mcp-validator.d.ts +14 -0
- package/dist/mcp-validator.js +22 -0
- package/dist/mcp.d.ts +0 -1
- package/dist/mcp.js +0 -1
- package/dist/observability.d.ts +1 -1
- package/dist/observability.js +15 -3
- package/dist/server-tuning.d.ts +9 -0
- package/dist/server-tuning.js +30 -0
- package/dist/session.d.ts +36 -0
- package/dist/session.js +159 -0
- package/dist/tools.d.ts +0 -1
- package/dist/tools.js +23 -33
- package/dist/transform-types.d.ts +80 -0
- package/dist/transform-types.js +5 -0
- package/dist/transform.d.ts +7 -53
- package/dist/transform.js +434 -856
- package/dist/type-guards.d.ts +1 -2
- package/dist/type-guards.js +1 -2
- package/dist/workers/transform-worker.d.ts +0 -1
- package/dist/workers/transform-worker.js +52 -43
- package/package.json +11 -12
- package/dist/cache.d.ts.map +0 -1
- package/dist/cache.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/crypto.d.ts.map +0 -1
- package/dist/crypto.js.map +0 -1
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js.map +0 -1
- package/dist/fetch.d.ts.map +0 -1
- package/dist/fetch.js.map +0 -1
- package/dist/http.d.ts +0 -90
- package/dist/http.d.ts.map +0 -1
- package/dist/http.js +0 -1576
- package/dist/http.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/mcp.d.ts.map +0 -1
- package/dist/mcp.js.map +0 -1
- package/dist/observability.d.ts.map +0 -1
- package/dist/observability.js.map +0 -1
- package/dist/tools.d.ts.map +0 -1
- package/dist/tools.js.map +0 -1
- package/dist/transform.d.ts.map +0 -1
- package/dist/transform.js.map +0 -1
- package/dist/type-guards.d.ts.map +0 -1
- package/dist/type-guards.js.map +0 -1
- package/dist/workers/transform-worker.d.ts.map +0 -1
- package/dist/workers/transform-worker.js.map +0 -1
package/dist/tools.js
CHANGED
|
@@ -6,7 +6,7 @@ import { FetchError, getErrorMessage, isSystemError } from './errors.js';
|
|
|
6
6
|
import { fetchNormalizedUrl, normalizeUrl, transformToRawUrl, } from './fetch.js';
|
|
7
7
|
import { getRequestId, logDebug, logError, logWarn, runWithRequestContext, } from './observability.js';
|
|
8
8
|
import { transformHtmlToMarkdown, } from './transform.js';
|
|
9
|
-
import {
|
|
9
|
+
import { isObject } from './type-guards.js';
|
|
10
10
|
const TRUNCATION_MARKER = '...[truncated]';
|
|
11
11
|
const FETCH_PROGRESS_TOTAL = 4;
|
|
12
12
|
const fetchUrlInputSchema = z.strictObject({
|
|
@@ -106,23 +106,16 @@ function buildEmbeddedResource(content, url, title) {
|
|
|
106
106
|
},
|
|
107
107
|
};
|
|
108
108
|
}
|
|
109
|
-
function
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
if (!url)
|
|
119
|
-
return;
|
|
120
|
-
const embeddedResource = buildEmbeddedResource(contentToEmbed, url, title);
|
|
121
|
-
if (embeddedResource) {
|
|
122
|
-
blocks.push(embeddedResource);
|
|
109
|
+
function appendResourceBlocks({ blocks, inlineResult, resourceName, url, title, fullContent, }) {
|
|
110
|
+
const contentToEmbed = config.runtime.httpMode
|
|
111
|
+
? inlineResult.content
|
|
112
|
+
: (fullContent ?? inlineResult.content);
|
|
113
|
+
if (contentToEmbed && url) {
|
|
114
|
+
const embeddedResource = buildEmbeddedResource(contentToEmbed, url, title);
|
|
115
|
+
if (embeddedResource) {
|
|
116
|
+
blocks.push(embeddedResource);
|
|
117
|
+
}
|
|
123
118
|
}
|
|
124
|
-
}
|
|
125
|
-
function maybeAppendResourceLink(blocks, inlineResult, resourceName) {
|
|
126
119
|
const resourceLink = buildResourceLink(inlineResult, resourceName);
|
|
127
120
|
if (resourceLink) {
|
|
128
121
|
blocks.push(resourceLink);
|
|
@@ -136,9 +129,14 @@ function buildTextBlock(structuredContent) {
|
|
|
136
129
|
}
|
|
137
130
|
function buildToolContentBlocks(structuredContent, fromCache, inlineResult, resourceName, cacheKey, fullContent, url, title) {
|
|
138
131
|
const blocks = [buildTextBlock(structuredContent)];
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
132
|
+
appendResourceBlocks({
|
|
133
|
+
blocks,
|
|
134
|
+
inlineResult,
|
|
135
|
+
resourceName,
|
|
136
|
+
url,
|
|
137
|
+
title,
|
|
138
|
+
fullContent,
|
|
139
|
+
});
|
|
142
140
|
return blocks;
|
|
143
141
|
}
|
|
144
142
|
function applyInlineContentLimit(content, cacheKey) {
|
|
@@ -246,7 +244,7 @@ function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
|
|
|
246
244
|
cache.set(cacheKey, serializer(data), metadata);
|
|
247
245
|
}
|
|
248
246
|
function extractTitle(value) {
|
|
249
|
-
if (!
|
|
247
|
+
if (!isObject(value))
|
|
250
248
|
return undefined;
|
|
251
249
|
const { title } = value;
|
|
252
250
|
return typeof title === 'string' ? title : undefined;
|
|
@@ -266,14 +264,6 @@ function logRawUrlTransformation(resolvedUrl) {
|
|
|
266
264
|
original: resolvedUrl.originalUrl,
|
|
267
265
|
});
|
|
268
266
|
}
|
|
269
|
-
function applyOptionalPipelineSerialization(pipelineOptions, options) {
|
|
270
|
-
if (options.serialize !== undefined) {
|
|
271
|
-
pipelineOptions.serialize = options.serialize;
|
|
272
|
-
}
|
|
273
|
-
if (options.deserialize !== undefined) {
|
|
274
|
-
pipelineOptions.deserialize = options.deserialize;
|
|
275
|
-
}
|
|
276
|
-
}
|
|
277
267
|
export async function performSharedFetch(options, deps = {}) {
|
|
278
268
|
const executePipeline = deps.executeFetchPipeline ?? executeFetchPipeline;
|
|
279
269
|
const pipelineOptions = {
|
|
@@ -281,8 +271,9 @@ export async function performSharedFetch(options, deps = {}) {
|
|
|
281
271
|
cacheNamespace: 'markdown',
|
|
282
272
|
...(options.signal === undefined ? {} : { signal: options.signal }),
|
|
283
273
|
transform: options.transform,
|
|
274
|
+
...(options.serialize ? { serialize: options.serialize } : {}),
|
|
275
|
+
...(options.deserialize ? { deserialize: options.deserialize } : {}),
|
|
284
276
|
};
|
|
285
|
-
applyOptionalPipelineSerialization(pipelineOptions, options);
|
|
286
277
|
const pipeline = await executePipeline(pipelineOptions);
|
|
287
278
|
const inlineResult = applyInlineContentLimit(pipeline.data.content, pipeline.cacheKey ?? null);
|
|
288
279
|
return { pipeline, inlineResult };
|
|
@@ -319,7 +310,7 @@ function resolveToolErrorMessage(error, fallbackMessage) {
|
|
|
319
310
|
function parseJsonRecord(input) {
|
|
320
311
|
try {
|
|
321
312
|
const parsed = JSON.parse(input);
|
|
322
|
-
return
|
|
313
|
+
return isObject(parsed) ? parsed : undefined;
|
|
323
314
|
}
|
|
324
315
|
catch {
|
|
325
316
|
return undefined;
|
|
@@ -467,7 +458,7 @@ export function withRequestContextIfMissing(handler) {
|
|
|
467
458
|
};
|
|
468
459
|
}
|
|
469
460
|
function resolveRequestIdFromExtra(extra) {
|
|
470
|
-
if (!
|
|
461
|
+
if (!isObject(extra))
|
|
471
462
|
return undefined;
|
|
472
463
|
const { requestId } = extra;
|
|
473
464
|
if (typeof requestId === 'string')
|
|
@@ -485,4 +476,3 @@ export function registerTools(server) {
|
|
|
485
476
|
annotations: TOOL_DEFINITION.annotations,
|
|
486
477
|
}, withRequestContextIfMissing(TOOL_DEFINITION.handler));
|
|
487
478
|
}
|
|
488
|
-
//# sourceMappingURL=tools.js.map
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the transform pipeline.
|
|
3
|
+
* Extracted to avoid circular dependencies between transform modules.
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Metadata block for attaching source information to markdown output.
|
|
7
|
+
*/
|
|
8
|
+
export interface MetadataBlock {
|
|
9
|
+
type: 'metadata';
|
|
10
|
+
title?: string;
|
|
11
|
+
description?: string;
|
|
12
|
+
author?: string;
|
|
13
|
+
url: string;
|
|
14
|
+
fetchedAt: string;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Article extracted by Readability.
|
|
18
|
+
*/
|
|
19
|
+
export interface ExtractedArticle {
|
|
20
|
+
title?: string;
|
|
21
|
+
byline?: string;
|
|
22
|
+
content: string;
|
|
23
|
+
textContent: string;
|
|
24
|
+
excerpt?: string;
|
|
25
|
+
siteName?: string;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Metadata extracted from HTML meta tags.
|
|
29
|
+
*/
|
|
30
|
+
export interface ExtractedMetadata {
|
|
31
|
+
title?: string;
|
|
32
|
+
description?: string;
|
|
33
|
+
author?: string;
|
|
34
|
+
image?: string;
|
|
35
|
+
publishedAt?: string;
|
|
36
|
+
modifiedAt?: string;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Result of content extraction (article + metadata).
|
|
40
|
+
*/
|
|
41
|
+
export interface ExtractionResult {
|
|
42
|
+
article: ExtractedArticle | null;
|
|
43
|
+
metadata: ExtractedMetadata;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Result of HTML to markdown transformation.
|
|
47
|
+
*/
|
|
48
|
+
export interface MarkdownTransformResult {
|
|
49
|
+
markdown: string;
|
|
50
|
+
title: string | undefined;
|
|
51
|
+
truncated: boolean;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Options for transform operations.
|
|
55
|
+
*/
|
|
56
|
+
export interface TransformOptions {
|
|
57
|
+
includeMetadata: boolean;
|
|
58
|
+
signal?: AbortSignal;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Telemetry event emitted during transform stages.
|
|
62
|
+
*/
|
|
63
|
+
export interface TransformStageEvent {
|
|
64
|
+
v: 1;
|
|
65
|
+
type: 'stage';
|
|
66
|
+
stage: string;
|
|
67
|
+
durationMs: number;
|
|
68
|
+
url: string;
|
|
69
|
+
requestId?: string;
|
|
70
|
+
operationId?: string;
|
|
71
|
+
truncated?: boolean;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Context for tracking transform stage timing.
|
|
75
|
+
*/
|
|
76
|
+
export interface TransformStageContext {
|
|
77
|
+
readonly stage: string;
|
|
78
|
+
readonly startTime: number;
|
|
79
|
+
readonly url: string;
|
|
80
|
+
}
|
package/dist/transform.d.ts
CHANGED
|
@@ -1,52 +1,8 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
url: string;
|
|
7
|
-
fetchedAt: string;
|
|
8
|
-
}
|
|
9
|
-
export interface ExtractedArticle {
|
|
10
|
-
title?: string;
|
|
11
|
-
byline?: string;
|
|
12
|
-
content: string;
|
|
13
|
-
textContent: string;
|
|
14
|
-
excerpt?: string;
|
|
15
|
-
siteName?: string;
|
|
16
|
-
}
|
|
17
|
-
export interface ExtractedMetadata {
|
|
18
|
-
title?: string;
|
|
19
|
-
description?: string;
|
|
20
|
-
author?: string;
|
|
21
|
-
}
|
|
22
|
-
export interface ExtractionResult {
|
|
23
|
-
article: ExtractedArticle | null;
|
|
24
|
-
metadata: ExtractedMetadata;
|
|
25
|
-
}
|
|
26
|
-
export interface MarkdownTransformResult {
|
|
27
|
-
markdown: string;
|
|
28
|
-
title: string | undefined;
|
|
29
|
-
truncated: boolean;
|
|
30
|
-
}
|
|
31
|
-
export interface TransformOptions {
|
|
32
|
-
includeMetadata: boolean;
|
|
33
|
-
signal?: AbortSignal;
|
|
34
|
-
}
|
|
35
|
-
export interface TransformStageEvent {
|
|
36
|
-
v: 1;
|
|
37
|
-
type: 'stage';
|
|
38
|
-
stage: string;
|
|
39
|
-
durationMs: number;
|
|
40
|
-
url: string;
|
|
41
|
-
requestId?: string;
|
|
42
|
-
operationId?: string;
|
|
43
|
-
truncated?: boolean;
|
|
44
|
-
}
|
|
45
|
-
export interface TransformStageContext {
|
|
46
|
-
readonly stage: string;
|
|
47
|
-
readonly startTime: number;
|
|
48
|
-
readonly url: string;
|
|
49
|
-
}
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, ExtractionResult, MarkdownTransformResult, MetadataBlock, TransformOptions, TransformStageContext } from './transform-types.js';
|
|
2
|
+
export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
3
|
+
export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
4
|
+
export { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
5
|
+
export type { MetadataBlock, ExtractedArticle, ExtractedMetadata, ExtractionResult, MarkdownTransformResult, TransformOptions, TransformStageEvent, TransformStageContext, } from './transform-types.js';
|
|
50
6
|
export declare function startTransformStage(url: string, stage: string): TransformStageContext | null;
|
|
51
7
|
export declare function endTransformStage(context: TransformStageContext | null, options?: {
|
|
52
8
|
truncated?: boolean;
|
|
@@ -55,17 +11,15 @@ export declare function extractContent(html: string, url: string, options?: {
|
|
|
55
11
|
extractArticle?: boolean;
|
|
56
12
|
signal?: AbortSignal;
|
|
57
13
|
}): ExtractionResult;
|
|
58
|
-
export declare function detectLanguageFromCode(code: string): string | undefined;
|
|
59
|
-
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
60
14
|
export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock, options?: {
|
|
61
15
|
url?: string;
|
|
62
16
|
signal?: AbortSignal;
|
|
63
17
|
document?: Document;
|
|
18
|
+
skipNoiseRemoval?: boolean;
|
|
64
19
|
}): string;
|
|
65
|
-
export declare function isExtractionSufficient(article: ExtractedArticle | null,
|
|
20
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtmlOrDocument: string | Document): boolean;
|
|
66
21
|
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
67
22
|
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
68
23
|
export declare function transformHtmlToMarkdownInProcess(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
|
69
24
|
export declare function shutdownTransformWorkerPool(): Promise<void>;
|
|
70
25
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
|
|
71
|
-
//# sourceMappingURL=transform.d.ts.map
|