@j0hanz/superfetch 1.2.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -152
- package/dist/config/auth-config.d.ts +16 -0
- package/dist/config/auth-config.js +53 -0
- package/dist/config/constants.d.ts +11 -13
- package/dist/config/constants.js +1 -3
- package/dist/config/env-parsers.d.ts +7 -0
- package/dist/config/env-parsers.js +84 -0
- package/dist/config/formatting.d.ts +2 -2
- package/dist/config/index.d.ts +47 -53
- package/dist/config/index.js +25 -59
- package/dist/config/types/content.d.ts +1 -49
- package/dist/config/types/runtime.d.ts +8 -16
- package/dist/config/types/tools.d.ts +2 -28
- package/dist/http/accept-policy.d.ts +3 -0
- package/dist/http/accept-policy.js +45 -0
- package/dist/http/async-handler.d.ts +2 -0
- package/dist/http/async-handler.js +5 -0
- package/dist/http/auth-introspection.d.ts +2 -0
- package/dist/http/auth-introspection.js +141 -0
- package/dist/http/auth-static.d.ts +2 -0
- package/dist/http/auth-static.js +23 -0
- package/dist/http/auth.d.ts +3 -2
- package/dist/http/auth.js +98 -26
- package/dist/http/cors.d.ts +6 -6
- package/dist/http/cors.js +7 -42
- package/dist/http/download-routes.d.ts +0 -12
- package/dist/http/download-routes.js +21 -58
- package/dist/http/jsonrpc-http.d.ts +2 -0
- package/dist/http/jsonrpc-http.js +10 -0
- package/dist/http/mcp-routes.d.ts +0 -1
- package/dist/http/mcp-routes.js +43 -30
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +1 -1
- package/dist/http/mcp-session-transport.d.ts +7 -0
- package/dist/http/mcp-session-transport.js +57 -0
- package/dist/http/mcp-session.js +60 -73
- package/dist/http/mcp-validation.d.ts +1 -0
- package/dist/http/mcp-validation.js +11 -10
- package/dist/http/protocol-policy.d.ts +2 -0
- package/dist/http/protocol-policy.js +31 -0
- package/dist/http/rate-limit.js +5 -2
- package/dist/http/server-config.d.ts +1 -0
- package/dist/http/server-config.js +40 -0
- package/dist/http/server-middleware.d.ts +2 -9
- package/dist/http/server-middleware.js +96 -43
- package/dist/http/server-shutdown.d.ts +4 -0
- package/dist/http/server-shutdown.js +43 -0
- package/dist/http/server.js +52 -64
- package/dist/http/session-cleanup.js +1 -1
- package/dist/middleware/error-handler.js +1 -3
- package/dist/resources/cached-content.js +50 -108
- package/dist/resources/index.js +0 -82
- package/dist/server.js +51 -30
- package/dist/services/cache-keys.d.ts +7 -0
- package/dist/services/cache-keys.js +57 -0
- package/dist/services/cache.d.ts +1 -7
- package/dist/services/cache.js +53 -119
- package/dist/services/context.d.ts +0 -1
- package/dist/services/context.js +0 -7
- package/dist/services/extractor.js +10 -82
- package/dist/services/fetcher/agents.d.ts +2 -2
- package/dist/services/fetcher/agents.js +34 -95
- package/dist/services/fetcher/dns-selection.d.ts +2 -0
- package/dist/services/fetcher/dns-selection.js +72 -0
- package/dist/services/fetcher/interceptors.d.ts +0 -22
- package/dist/services/fetcher/interceptors.js +30 -13
- package/dist/services/fetcher/redirects.js +4 -3
- package/dist/services/fetcher/response.js +66 -31
- package/dist/services/fetcher.d.ts +1 -3
- package/dist/services/fetcher.js +14 -33
- package/dist/services/fifo-queue.d.ts +8 -0
- package/dist/services/fifo-queue.js +25 -0
- package/dist/services/logger.js +2 -2
- package/dist/services/metadata-collector.d.ts +1 -9
- package/dist/services/metadata-collector.js +71 -2
- package/dist/services/transform-worker-pool.d.ts +4 -14
- package/dist/services/transform-worker-pool.js +177 -129
- package/dist/services/transform-worker-types.d.ts +32 -0
- package/dist/services/transform-worker-types.js +14 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
- package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
- package/dist/tools/handlers/fetch-single.shared.d.ts +1 -20
- package/dist/tools/handlers/fetch-single.shared.js +44 -87
- package/dist/tools/handlers/fetch-url.tool.d.ts +1 -1
- package/dist/tools/handlers/fetch-url.tool.js +46 -123
- package/dist/tools/index.js +21 -40
- package/dist/tools/schemas.d.ts +1 -51
- package/dist/tools/schemas.js +1 -107
- package/dist/tools/utils/cached-markdown.d.ts +5 -0
- package/dist/tools/utils/cached-markdown.js +46 -0
- package/dist/tools/utils/content-shaping.d.ts +4 -0
- package/dist/tools/utils/content-shaping.js +52 -0
- package/dist/tools/utils/content-transform.d.ts +2 -17
- package/dist/tools/utils/content-transform.js +120 -114
- package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
- package/dist/tools/utils/fetch-pipeline.js +65 -62
- package/dist/tools/utils/inline-content.d.ts +1 -2
- package/dist/tools/utils/inline-content.js +4 -7
- package/dist/transformers/markdown.transformer.js +109 -34
- package/dist/utils/cached-payload.d.ts +7 -0
- package/dist/utils/cached-payload.js +36 -0
- package/dist/utils/error-utils.js +1 -1
- package/dist/utils/filename-generator.js +21 -10
- package/dist/utils/guards.d.ts +1 -0
- package/dist/utils/guards.js +3 -0
- package/dist/utils/header-normalizer.d.ts +0 -3
- package/dist/utils/header-normalizer.js +3 -3
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +11 -38
- package/dist/utils/url-transformer.d.ts +7 -0
- package/dist/utils/url-transformer.js +147 -0
- package/dist/utils/url-validator.d.ts +1 -2
- package/dist/utils/url-validator.js +20 -93
- package/dist/workers/content-transform.worker.d.ts +1 -0
- package/dist/workers/content-transform.worker.js +40 -0
- package/package.json +13 -16
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { isRecord } from '../../utils/guards.js';
|
|
2
|
+
/**
 * Decodes `input` as JSON and hands the value back only when `isRecord`
 * accepts it.
 *
 * @param input - Serialized JSON text.
 * @returns The decoded value when it passes `isRecord`; `undefined` when
 *   decoding throws or the decoded value is rejected.
 */
function parseJsonRecord(input) {
    try {
        const decoded = JSON.parse(input);
        if (isRecord(decoded)) {
            return decoded;
        }
    }
    catch {
        // Malformed JSON is reported the same way as a non-record payload.
    }
    return undefined;
}
|
|
11
|
+
/**
 * Picks the markdown text out of a decoded cache record.
 *
 * The `markdown` field wins when it is a string; otherwise a string
 * `content` field is used.
 *
 * @param parsed - Decoded cache record.
 * @returns The markdown string, or `undefined` when neither field is a string.
 */
function resolveMarkdownContent(parsed) {
    const fromMarkdown = parsed.markdown;
    if (typeof fromMarkdown === 'string') {
        return fromMarkdown;
    }
    const fromContent = parsed.content;
    return typeof fromContent === 'string' ? fromContent : undefined;
}
|
|
20
|
+
/**
 * Reads the optional `title` field of a decoded cache record.
 *
 * @param parsed - Decoded cache record.
 * @returns The title when it is a string; `undefined` when the field is
 *   missing or holds any other type.
 */
function resolveOptionalTitle(parsed) {
    const candidate = parsed.title;
    // A missing title is not a string either, so one type check covers both cases.
    if (typeof candidate !== 'string') {
        return undefined;
    }
    return candidate;
}
|
|
26
|
+
/**
 * Reads the `truncated` flag of a decoded cache record.
 *
 * @param parsed - Decoded cache record.
 * @returns `true` only when the field is the boolean `true`; every other
 *   value (including a missing field) yields `false`.
 */
function resolveTruncatedFlag(parsed) {
    return parsed.truncated === true;
}
|
|
30
|
+
/**
 * Rebuilds a markdown result from its serialized cache entry.
 *
 * The entry is rejected (`undefined`) when it does not decode to a record,
 * when it carries no string markdown/content, or when a `title` field is
 * present but is not a string.
 *
 * @param cached - Serialized cache payload.
 * @returns An object with `content`, `markdown` (both set to the resolved
 *   markdown text), `title` and `truncated`, or `undefined` when rejected.
 */
export function parseCachedMarkdownResult(cached) {
    const record = parseJsonRecord(cached);
    if (!record) {
        return undefined;
    }
    const markdown = resolveMarkdownContent(record);
    if (markdown === undefined) {
        return undefined;
    }
    const title = resolveOptionalTitle(record);
    // A title that exists but failed to resolve means the entry is malformed.
    const hasInvalidTitle = record.title !== undefined && title === undefined;
    if (hasInvalidTitle) {
        return undefined;
    }
    const truncated = resolveTruncatedFlag(record);
    return { content: markdown, markdown, title, truncated };
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, MetadataBlock } from '../../config/types/content.js';
|
|
2
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
|
|
3
|
+
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
4
|
+
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/** Minimum share of the page's text that the extracted article must cover. */
const MIN_CONTENT_RATIO = 0.3;
/** Pages with less estimated text than this always pass the gate. */
const MIN_HTML_LENGTH_FOR_GATE = 100;
/**
 * Matches whole `<script>` / `<style>` elements, bodies included. Their
 * contents are code, not readable text, so they must not count as page text.
 */
const NON_TEXT_ELEMENT_PATTERN = /<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi;
/**
 * Collapses every whitespace run to a single space and trims the ends.
 */
function normalizeWhitespace(text) {
    return text.replace(/\s+/g, ' ').trim();
}
/**
 * Estimates how many characters of text an HTML string contains.
 *
 * Script and style elements are dropped together with their bodies, the
 * remaining tags are stripped, and whitespace is normalized before counting.
 *
 * @param html - Raw HTML markup.
 * @returns Length of the approximated text.
 */
function estimateTextLength(html) {
    const withoutTags = html
        // Replace with a space so neighbouring words are not glued together.
        .replace(NON_TEXT_ELEMENT_PATTERN, ' ')
        .replace(/<[^>]*>/g, '');
    return normalizeWhitespace(withoutTags).length;
}
/**
 * Quality gate for an extracted article: checks that it kept a reasonable
 * share of the text found in the original page.
 *
 * Both sides are measured the same way (whitespace-normalized text), so
 * indentation or blank lines in `article.textContent` cannot inflate the ratio.
 *
 * @param article - Extracted article (its `textContent` is read), or a
 *   falsy value when extraction produced nothing.
 * @param originalHtml - HTML the article was extracted from.
 * @returns `false` when there is no article; `true` when the page's estimated
 *   text is shorter than `MIN_HTML_LENGTH_FOR_GATE`; otherwise whether the
 *   article/page text ratio reaches `MIN_CONTENT_RATIO`.
 */
export function isExtractionSufficient(article, originalHtml) {
    if (!article) {
        return false;
    }
    const originalLength = estimateTextLength(originalHtml);
    // Tiny pages give a meaningless ratio, so they are always accepted.
    if (originalLength < MIN_HTML_LENGTH_FOR_GATE) {
        return true;
    }
    const articleLength = normalizeWhitespace(article.textContent).length;
    return articleLength / originalLength >= MIN_CONTENT_RATIO;
}
|
|
18
|
+
/**
 * Tells whether an extracted article is available to act as content source.
 *
 * @param article - Extracted article, or a falsy value when there is none.
 * @returns `true` for any truthy `article`, `false` otherwise.
 */
export function determineContentExtractionSource(article) {
    return Boolean(article);
}
|
|
21
|
+
function applyArticleMetadata(metadata, article) {
|
|
22
|
+
if (article.title !== undefined)
|
|
23
|
+
metadata.title = article.title;
|
|
24
|
+
if (article.byline !== undefined)
|
|
25
|
+
metadata.author = article.byline;
|
|
26
|
+
}
|
|
27
|
+
function applyExtractedMetadata(metadata, extractedMeta) {
|
|
28
|
+
if (extractedMeta.title !== undefined)
|
|
29
|
+
metadata.title = extractedMeta.title;
|
|
30
|
+
if (extractedMeta.description !== undefined) {
|
|
31
|
+
metadata.description = extractedMeta.description;
|
|
32
|
+
}
|
|
33
|
+
if (extractedMeta.author !== undefined) {
|
|
34
|
+
metadata.author = extractedMeta.author;
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
/**
 * Builds the metadata block that accompanies transformed content.
 *
 * @param url - URL recorded in the block.
 * @param article - Extracted article, or a falsy value when there is none.
 * @param extractedMeta - Page-level metadata, used when the article is not.
 * @param shouldExtractFromArticle - Prefer the article's fields when an
 *   article is available.
 * @param includeMetadata - When falsy, no block is produced.
 * @returns `undefined` when `includeMetadata` is falsy; otherwise an object
 *   with `type: 'metadata'`, `url`, `fetchedAt` (current time as an ISO
 *   string) plus whichever fields the selected source supplies.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
    if (!includeMetadata) {
        return undefined;
    }
    const block = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    const useArticle = Boolean(shouldExtractFromArticle && article);
    if (useArticle) {
        applyArticleMetadata(block, article);
    }
    else {
        applyExtractedMetadata(block, extractedMeta);
    }
    return block;
}
|
|
@@ -1,17 +1,2 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
|
|
3
|
-
readonly extractMainContent: boolean;
|
|
4
|
-
readonly includeMetadata: boolean;
|
|
5
|
-
}
|
|
6
|
-
interface ContentLengthOptions {
|
|
7
|
-
readonly maxContentLength?: number;
|
|
8
|
-
}
|
|
9
|
-
interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
|
|
10
|
-
}
|
|
11
|
-
interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
|
|
12
|
-
readonly includeContentBlocks?: boolean;
|
|
13
|
-
}
|
|
14
|
-
export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
15
|
-
export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
|
|
16
|
-
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
|
|
17
|
-
export {};
|
|
1
|
+
import type { MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
|
@@ -1,134 +1,140 @@
|
|
|
1
|
-
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
2
1
|
import { extractContent } from '../../services/extractor.js';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import { toJsonl } from '../../transformers/jsonl.transformer.js';
|
|
2
|
+
import { logDebug } from '../../services/logger.js';
|
|
3
|
+
import { isRawTextContentUrl } from '../../utils/url-transformer.js';
|
|
6
4
|
import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
|
|
7
|
-
import { createContentMetadataBlock, determineContentExtractionSource,
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
5
|
+
import { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-shaping.js';
|
|
6
|
+
/**
 * Describes a content source backed by the extracted article.
 *
 * @param url - URL passed on to the metadata block.
 * @param article - Extracted article; its `content` and `title` are used.
 * @param extractedMeta - Page-level metadata forwarded to the block builder.
 * @param includeMetadata - Forwarded to `createContentMetadataBlock`.
 * @returns `{ sourceHtml, title, metadata }` taken from the article.
 */
function buildArticleContentSource(url, article, extractedMeta, includeMetadata) {
    // `true`: ask for article-derived metadata fields.
    const articleMetadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
    const { content: sourceHtml, title } = article;
    return { sourceHtml, title, metadata: articleMetadata };
}
|
|
14
|
+
/**
 * Describes a content source backed by the complete, unextracted HTML.
 *
 * @param html - Full page HTML, used as `sourceHtml`.
 * @param url - URL passed on to the metadata block.
 * @param article - Extracted article (may be falsy); only forwarded.
 * @param extractedMeta - Page-level metadata; supplies the `title`.
 * @param includeMetadata - Forwarded to `createContentMetadataBlock`.
 * @returns `{ sourceHtml, title, metadata }` for the whole page.
 */
function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMetadata) {
    // `false`: ask for page-level metadata fields rather than the article's.
    const pageMetadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
    const pageTitle = extractedMeta.title;
    return { sourceHtml: html, title: pageTitle, metadata: pageMetadata };
}
|
|
22
|
+
/**
 * Emits a debug log noting that the extracted article failed the quality
 * gate and the full HTML is used instead.
 *
 * @param url - Page URL; only its first 80 characters are logged.
 * @param article - Extracted article; the length of its `textContent` is logged.
 */
function logQualityGateFallback(url, article) {
    const details = {
        url: url.slice(0, 80),
        articleLength: article.textContent.length,
    };
    logDebug('Quality gate: Readability extraction below threshold, using full HTML', details);
}
|
|
28
|
+
/**
 * Tries to use the extracted article as the content source.
 *
 * @param html - Original page HTML, used by the sufficiency check.
 * @param url - Page URL.
 * @param article - Extracted article, or a falsy value when there is none.
 * @param extractedMeta - Page-level metadata.
 * @param options - Transform options; `includeMetadata` is read.
 * @returns The article-backed content source when the article is available
 *   and passes `isExtractionSufficient`; otherwise `null`. An available
 *   article that fails the check is logged before returning.
 */
function tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options) {
    if (!article) {
        return null;
    }
    if (!determineContentExtractionSource(article)) {
        return null;
    }
    if (isExtractionSufficient(article, html)) {
        return buildArticleContentSource(url, article, extractedMeta, options.includeMetadata);
    }
    logQualityGateFallback(url, article);
    return null;
}
|
|
40
|
+
function resolveContentSource(html, url, options) {
|
|
17
41
|
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
18
|
-
extractArticle:
|
|
42
|
+
extractArticle: true,
|
|
19
43
|
});
|
|
20
|
-
const
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
44
|
+
const extracted = tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options);
|
|
45
|
+
if (extracted)
|
|
46
|
+
return extracted;
|
|
47
|
+
return buildFullHtmlContentSource(html, url, article, extractedMeta, options.includeMetadata);
|
|
48
|
+
}
|
|
49
|
+
/**
 * Converts a resolved content source to markdown.
 *
 * @param context - Content source; its `sourceHtml` and `metadata` are
 *   passed to `htmlToMarkdown`.
 * @returns Whatever `htmlToMarkdown` returns for that source.
 */
function buildMarkdownPayload(context) {
    const { sourceHtml, metadata } = context;
    return htmlToMarkdown(sourceHtml, metadata);
}
|
|
52
|
+
/**
 * Prepares raw content that is passed through without HTML conversion.
 *
 * @param rawContent - Raw text as fetched.
 * @param url - Source URL, added to the content when metadata is requested.
 * @param includeMetadata - When truthy, the content is run through
 *   `addSourceToMarkdown`; otherwise it is returned untouched.
 * @returns `{ content, title }`, where `title` comes from
 *   `extractTitleFromRawMarkdown`.
 */
function buildRawMarkdownPayload(rawContent, url, includeMetadata) {
    const title = extractTitleFromRawMarkdown(rawContent);
    if (!includeMetadata) {
        return { content: rawContent, title };
    }
    return { content: addSourceToMarkdown(rawContent, url), title };
}
|
|
59
|
+
function extractTitleFromRawMarkdown(content) {
|
|
60
|
+
const frontmatterMatch = /^---\r?\n([\s\S]*?)\r?\n---/.exec(content);
|
|
61
|
+
if (!frontmatterMatch)
|
|
29
62
|
return undefined;
|
|
30
|
-
const
|
|
31
|
-
const
|
|
32
|
-
return
|
|
33
|
-
}
|
|
34
|
-
function decodeHtmlEntities(value) {
|
|
35
|
-
if (!value.includes('&'))
|
|
36
|
-
return value;
|
|
37
|
-
const basicDecoded = value
|
|
38
|
-
.replace(/&/g, '&')
|
|
39
|
-
.replace(/</g, '<')
|
|
40
|
-
.replace(/>/g, '>')
|
|
41
|
-
.replace(/"/g, '"')
|
|
42
|
-
.replace(/'/g, "'");
|
|
43
|
-
return basicDecoded
|
|
44
|
-
.replace(/&#(\d+);/g, (match, code) => {
|
|
45
|
-
const parsed = Number.parseInt(code, 10);
|
|
46
|
-
return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
|
|
47
|
-
? String.fromCodePoint(parsed)
|
|
48
|
-
: match;
|
|
49
|
-
})
|
|
50
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (match, code) => {
|
|
51
|
-
const parsed = Number.parseInt(code, 16);
|
|
52
|
-
return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
|
|
53
|
-
? String.fromCodePoint(parsed)
|
|
54
|
-
: match;
|
|
55
|
-
});
|
|
63
|
+
const frontmatter = frontmatterMatch[1] ?? '';
|
|
64
|
+
const titleMatch = /^(?:title|name):\s*["']?(.+?)["']?\s*$/im.exec(frontmatter);
|
|
65
|
+
return titleMatch?.[1]?.trim();
|
|
56
66
|
}
|
|
57
|
-
function
|
|
58
|
-
const
|
|
59
|
-
|
|
67
|
+
/**
 * Records the source URL in the front matter of a markdown document.
 *
 * - With no leading front matter, a new block holding only `source` is
 *   placed in front of the content.
 * - With existing front matter that already has a `source:` line, the
 *   content is returned unchanged.
 * - Otherwise a `source` line is appended to the existing front matter,
 *   using the same line ending as the opening `---` delimiter.
 *
 * The URL is written with `JSON.stringify`, so quotes and backslashes are
 * escaped and the value stays a valid double-quoted YAML scalar.
 *
 * @param content - Markdown text, optionally starting with front matter.
 * @param url - Source URL to record.
 * @returns The markdown text with the source recorded.
 */
function addSourceToMarkdown(content, url) {
    const sourceLine = `source: ${JSON.stringify(url)}`;
    const frontmatterMatch = /^(---\r?\n)([\s\S]*?)(\r?\n---)/.exec(content);
    if (!frontmatterMatch) {
        return `---\n${sourceLine}\n---\n\n${content}`;
    }
    const start = frontmatterMatch[1] ?? '---\n';
    const existingFields = frontmatterMatch[2] ?? '';
    const end = frontmatterMatch[3] ?? '\n---';
    if (/^source:/im.test(existingFields)) {
        return content;
    }
    // Reuse the document's own line ending so CRLF files are not left mixed.
    const eol = start.endsWith('\r\n') ? '\r\n' : '\n';
    const rest = content.slice(frontmatterMatch[0].length);
    return `${start}${existingFields}${eol}${sourceLine}${end}${rest}`;
}
|
|
61
|
-
function
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
81
|
+
/**
 * Tells whether already-trimmed text opens like a full HTML document.
 *
 * The check is case-insensitive because HTML tag names and the doctype
 * keyword may be written in any letter case (`<!DOCTYPE`, `<!Doctype`,
 * `<Html`, ...).
 *
 * @param trimmed - Text with leading whitespace already removed.
 * @returns `true` when the text starts with `<!doctype` or `<html` in any
 *   letter case.
 */
function looksLikeHtmlDocument(trimmed) {
    return /^<(?:!doctype|html)/i.test(trimmed);
}
|
|
87
|
+
/**
 * Tells whether already-trimmed text opens with a front matter delimiter:
 * `---` immediately followed by a line break (LF or CRLF).
 *
 * @param trimmed - Text with leading whitespace already removed.
 * @returns `true` when the first line is exactly `---`.
 */
function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}
|
|
90
|
+
/**
 * Counts opening occurrences of a fixed set of structural HTML tags
 * (html, head, body, div, span, script, style, meta, link), in any letter
 * case.
 *
 * @param content - Text to scan.
 * @returns Number of matching opening tags; `0` when there are none.
 */
function countCommonHtmlTags(content) {
    const tagPattern = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
    return Array.from(content.matchAll(tagPattern)).length;
}
|
|
69
|
-
function
|
|
70
|
-
const
|
|
71
|
-
const
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
return {
|
|
80
|
-
content,
|
|
81
|
-
contentBlocks,
|
|
82
|
-
title: parsed.metadata.title,
|
|
83
|
-
...(truncated && { truncated }),
|
|
84
|
-
};
|
|
95
|
+
/**
 * Heuristic check for markdown syntax.
 *
 * @param content - Text to inspect.
 * @returns `true` when the text contains at least one ATX heading line,
 *   one bullet list line (`-`, `*` or `+` followed by whitespace), or a
 *   fenced code block delimited by triple backticks.
 */
function looksLikeMarkdown(content) {
    const markdownSignals = [
        /^#{1,6}\s+/m, // heading
        /^[\s]*[-*+]\s+/m, // bullet list item
        /```[\s\S]*?```/, // fenced code block
    ];
    return markdownSignals.some((signal) => signal.test(content));
}
|
|
101
|
+
function isRawTextContent(content) {
|
|
102
|
+
const trimmed = content.trim();
|
|
103
|
+
if (looksLikeHtmlDocument(trimmed)) {
|
|
104
|
+
return false;
|
|
85
105
|
}
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
106
|
+
if (hasFrontmatter(trimmed)) {
|
|
107
|
+
return true;
|
|
108
|
+
}
|
|
109
|
+
if (countCommonHtmlTags(content) > 2) {
|
|
110
|
+
return false;
|
|
111
|
+
}
|
|
112
|
+
if (looksLikeMarkdown(content)) {
|
|
113
|
+
return true;
|
|
114
|
+
}
|
|
115
|
+
return false;
|
|
94
116
|
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
117
|
+
function tryTransformRawContent(html, url, options) {
|
|
118
|
+
if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
|
|
119
|
+
return null;
|
|
120
|
+
}
|
|
121
|
+
logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
|
|
122
|
+
const { content, title } = buildRawMarkdownPayload(html, url, options.includeMetadata);
|
|
98
123
|
return {
|
|
99
124
|
markdown: content,
|
|
100
|
-
title
|
|
101
|
-
truncated,
|
|
125
|
+
title,
|
|
126
|
+
truncated: false,
|
|
102
127
|
};
|
|
103
128
|
}
|
|
104
|
-
export function
|
|
105
|
-
const
|
|
106
|
-
if (
|
|
107
|
-
|
|
108
|
-
options.includeMetadata) {
|
|
109
|
-
const parsed = parseHtmlWithMetadata(html);
|
|
110
|
-
const context = {
|
|
111
|
-
sourceHtml: html,
|
|
112
|
-
title: parsed.metadata.title,
|
|
113
|
-
metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
|
|
114
|
-
};
|
|
115
|
-
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
116
|
-
return {
|
|
117
|
-
content,
|
|
118
|
-
contentBlocks: parsed.blocks.length,
|
|
119
|
-
title: context.title,
|
|
120
|
-
...(truncated && { truncated }),
|
|
121
|
-
};
|
|
122
|
-
}
|
|
129
|
+
export function transformHtmlToMarkdown(html, url, options) {
|
|
130
|
+
const raw = tryTransformRawContent(html, url, options);
|
|
131
|
+
if (raw)
|
|
132
|
+
return raw;
|
|
123
133
|
const context = resolveContentSource(html, url, options);
|
|
124
|
-
const
|
|
125
|
-
? parseHtml(context.sourceHtml)
|
|
126
|
-
: [];
|
|
127
|
-
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
134
|
+
const content = buildMarkdownPayload(context);
|
|
128
135
|
return {
|
|
129
|
-
content,
|
|
130
|
-
contentBlocks: contentBlocks.length,
|
|
136
|
+
markdown: content,
|
|
131
137
|
title: context.title,
|
|
132
|
-
|
|
138
|
+
truncated: false,
|
|
133
139
|
};
|
|
134
140
|
}
|
|
@@ -1,10 +1,2 @@
|
|
|
1
1
|
import type { FetchPipelineOptions, PipelineResult } from '../../config/types/runtime.js';
|
|
2
|
-
/**
|
|
3
|
-
* Unified fetch pipeline that handles caching, fetching, and transformation.
|
|
4
|
-
* Implements cache-first strategy with automatic serialization.
|
|
5
|
-
*
|
|
6
|
-
* @template T - Type of the transformed result
|
|
7
|
-
* @param options - Pipeline configuration options
|
|
8
|
-
* @returns Promise resolving to the pipeline result
|
|
9
|
-
*/
|
|
10
2
|
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|
|
@@ -1,98 +1,101 @@
|
|
|
1
1
|
import * as cache from '../../services/cache.js';
|
|
2
|
-
import {
|
|
2
|
+
import { createCacheKey } from '../../services/cache-keys.js';
|
|
3
|
+
import { fetchNormalizedUrl } from '../../services/fetcher.js';
|
|
3
4
|
import { logDebug } from '../../services/logger.js';
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
5
|
+
import { isRecord } from '../../utils/guards.js';
|
|
6
|
+
import { transformToRawUrl } from '../../utils/url-transformer.js';
|
|
7
|
+
import { normalizeUrl } from '../../utils/url-validator.js';
|
|
6
8
|
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
7
9
|
if (!cacheKey)
|
|
8
10
|
return null;
|
|
9
11
|
const cached = cache.get(cacheKey);
|
|
10
12
|
if (!cached)
|
|
11
13
|
return null;
|
|
12
|
-
if (!deserialize)
|
|
13
|
-
|
|
14
|
-
namespace: cacheNamespace,
|
|
15
|
-
url: normalizedUrl,
|
|
16
|
-
});
|
|
17
|
-
return null;
|
|
18
|
-
}
|
|
14
|
+
if (!deserialize)
|
|
15
|
+
return logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
|
|
19
16
|
const data = deserialize(cached.content);
|
|
20
|
-
if (data === undefined)
|
|
21
|
-
|
|
22
|
-
namespace: cacheNamespace,
|
|
23
|
-
url: normalizedUrl,
|
|
24
|
-
});
|
|
25
|
-
return null;
|
|
26
|
-
}
|
|
17
|
+
if (data === undefined)
|
|
18
|
+
return logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
|
|
27
19
|
logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
|
|
28
|
-
return
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
};
|
|
20
|
+
return buildCacheHitResult(data, cached.fetchedAt, normalizedUrl, cacheKey);
|
|
21
|
+
}
|
|
22
|
+
/**
 * Normalizes a URL and then applies the raw-content URL transformation.
 *
 * @param url - URL as supplied by the caller.
 * @returns `normalizedUrl` (the URL after `transformToRawUrl`),
 *   `originalUrl` (the URL after `normalizeUrl` only) and `transformed`
 *   (the flag reported by `transformToRawUrl`).
 */
function resolveNormalizedUrl(url) {
    const validatedUrl = normalizeUrl(url).normalizedUrl;
    const rawTarget = transformToRawUrl(validatedUrl);
    return {
        normalizedUrl: rawTarget.url,
        originalUrl: validatedUrl,
        transformed: rawTarget.transformed,
    };
}
|
|
36
|
-
/**
|
|
37
|
-
* Unified fetch pipeline that handles caching, fetching, and transformation.
|
|
38
|
-
* Implements cache-first strategy with automatic serialization.
|
|
39
|
-
*
|
|
40
|
-
* @template T - Type of the transformed result
|
|
41
|
-
* @param options - Pipeline configuration options
|
|
42
|
-
* @returns Promise resolving to the pipeline result
|
|
43
|
-
*/
|
|
44
27
|
export async function executeFetchPipeline(options) {
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
const
|
|
28
|
+
const resolvedUrl = resolveNormalizedUrl(options.url);
|
|
29
|
+
logRawUrlTransformation(resolvedUrl);
|
|
30
|
+
const cacheKey = resolveCacheKey(options, resolvedUrl.normalizedUrl);
|
|
31
|
+
const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, resolvedUrl.normalizedUrl);
|
|
48
32
|
if (cachedResult)
|
|
49
33
|
return cachedResult;
|
|
50
|
-
await
|
|
51
|
-
const fetchOptions = buildFetchOptions(options);
|
|
52
|
-
logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
|
|
53
|
-
const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
|
|
54
|
-
const data = await options.transform(html, normalizedUrl);
|
|
34
|
+
const data = await fetchAndTransform(options, resolvedUrl.normalizedUrl);
|
|
55
35
|
if (cache.isEnabled()) {
|
|
56
|
-
persistCache(cacheKey, data, options.serialize, normalizedUrl);
|
|
36
|
+
persistCache(cacheKey, data, options.serialize, resolvedUrl.normalizedUrl);
|
|
57
37
|
}
|
|
58
|
-
return buildPipelineResult(normalizedUrl, data, cacheKey);
|
|
38
|
+
return buildPipelineResult(resolvedUrl.normalizedUrl, data, cacheKey);
|
|
59
39
|
}
|
|
60
40
|
function resolveCacheKey(options, normalizedUrl) {
|
|
61
|
-
|
|
62
|
-
|
|
41
|
+
return createCacheKey(options.cacheNamespace, normalizedUrl, options.cacheVary);
|
|
42
|
+
}
|
|
43
|
+
/**
 * Fetches the given URL and runs the caller's transform over the response.
 *
 * @param options - Pipeline options; `transform` is invoked with the
 *   fetched body and the URL, and fetch options are derived via
 *   `buildFetchOptions`.
 * @param normalizedUrl - URL to fetch.
 * @returns A promise for the value produced by `options.transform`.
 */
async function fetchAndTransform(options, normalizedUrl) {
    const requestOptions = buildFetchOptions(options);
    logDebug('Fetching URL', { url: normalizedUrl });
    const body = await fetchNormalizedUrl(normalizedUrl, requestOptions);
    return options.transform(body, normalizedUrl);
}
|
|
64
49
|
function buildFetchOptions(options) {
|
|
65
|
-
|
|
66
|
-
if (options.customHeaders !== undefined) {
|
|
67
|
-
fetchOptions.customHeaders = options.customHeaders;
|
|
68
|
-
}
|
|
69
|
-
if (options.signal !== undefined) {
|
|
70
|
-
fetchOptions.signal = options.signal;
|
|
71
|
-
}
|
|
72
|
-
if (options.timeout !== undefined) {
|
|
73
|
-
fetchOptions.timeout = options.timeout;
|
|
74
|
-
}
|
|
75
|
-
return fetchOptions;
|
|
50
|
+
return options.signal === undefined ? {} : { signal: options.signal };
|
|
76
51
|
}
|
|
77
|
-
function
|
|
78
|
-
if (!cacheKey)
|
|
79
|
-
return;
|
|
80
|
-
const serializer = serialize ?? JSON.stringify;
|
|
52
|
+
/**
 * Builds the metadata stored next to a cache entry.
 *
 * @param data - Transformed result; a title is looked up via `extractTitle`.
 * @param normalizedUrl - URL the entry belongs to.
 * @returns `{ url }`, plus `title` when `extractTitle` yields one.
 */
function resolveCacheMetadata(data, normalizedUrl) {
    const title = extractTitle(data);
    if (title === undefined) {
        return { url: normalizedUrl };
    }
    return { url: normalizedUrl, title };
}
|
|
60
|
+
function resolveSerializer(serialize) {
|
|
61
|
+
return serialize ?? JSON.stringify;
|
|
62
|
+
}
|
|
63
|
+
/**
 * Stores a transformed result in the cache.
 *
 * @param cacheKey - Cache key; nothing is stored when it is falsy.
 * @param data - Result to serialize and store.
 * @param serialize - Optional serializer; see `resolveSerializer`.
 * @param normalizedUrl - URL recorded in the entry's metadata.
 */
function persistCache(cacheKey, data, serialize, normalizedUrl) {
    if (!cacheKey) {
        return;
    }
    const entryMetadata = resolveCacheMetadata(data, normalizedUrl);
    const payload = resolveSerializer(serialize)(data);
    cache.set(cacheKey, payload, entryMetadata);
}
|
|
88
70
|
function extractTitle(value) {
|
|
89
|
-
if (!value
|
|
90
|
-
return undefined;
|
|
91
|
-
if (!('title' in value))
|
|
71
|
+
if (!isRecord(value))
|
|
92
72
|
return undefined;
|
|
93
73
|
const { title } = value;
|
|
94
74
|
return typeof title === 'string' ? title : undefined;
|
|
95
75
|
}
|
|
76
|
+
/**
 * Emits a debug log for a cache miss and returns `null`, so callers can
 * `return logCacheMiss(...)` directly.
 *
 * @param reason - Short reason appended to the log message.
 * @param cacheNamespace - Namespace logged with the miss.
 * @param normalizedUrl - URL logged with the miss.
 * @returns Always `null`.
 */
function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
    const details = { namespace: cacheNamespace, url: normalizedUrl };
    logDebug(`Cache miss due to ${reason}`, details);
    return null;
}
|
|
83
|
+
/**
 * Emits a debug log when the requested URL was rewritten to a raw-content
 * URL; does nothing otherwise.
 *
 * @param resolvedUrl - URL resolution result; `transformed` gates the log
 *   and `originalUrl` is the value logged.
 */
function logRawUrlTransformation(resolvedUrl) {
    if (resolvedUrl.transformed) {
        logDebug('Using transformed raw content URL', {
            original: resolvedUrl.originalUrl,
        });
    }
}
|
|
90
|
+
/**
 * Assembles the pipeline result for a cache hit.
 *
 * @param data - Deserialized cached value.
 * @param fetchedAt - Fetch timestamp stored with the cache entry.
 * @param url - URL the entry belongs to.
 * @param cacheKey - Key the entry was found under.
 * @returns The given values plus `fromCache: true`.
 */
function buildCacheHitResult(data, fetchedAt, url, cacheKey) {
    const hit = { data, fromCache: true, url, fetchedAt, cacheKey };
    return hit;
}
|
|
96
99
|
function buildPipelineResult(url, data, cacheKey) {
|
|
97
100
|
return {
|
|
98
101
|
data,
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
type InlineContentFormat = 'jsonl' | 'markdown';
|
|
2
1
|
interface InlineContentResult {
|
|
3
2
|
content?: string;
|
|
4
3
|
contentSize: number;
|
|
@@ -7,5 +6,5 @@ interface InlineContentResult {
|
|
|
7
6
|
error?: string;
|
|
8
7
|
truncated?: boolean;
|
|
9
8
|
}
|
|
10
|
-
export declare function applyInlineContentLimit(content: string, cacheKey: string | null
|
|
9
|
+
export declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
|
|
11
10
|
export {};
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
2
2
|
import { config } from '../../config/index.js';
|
|
3
|
-
import
|
|
4
|
-
export function applyInlineContentLimit(content, cacheKey
|
|
3
|
+
import { toResourceUri } from '../../services/cache-keys.js';
|
|
4
|
+
export function applyInlineContentLimit(content, cacheKey) {
|
|
5
5
|
const contentSize = content.length;
|
|
6
6
|
const inlineLimit = config.constants.maxInlineContentChars;
|
|
7
7
|
if (contentSize <= inlineLimit) {
|
|
@@ -14,16 +14,13 @@ export function applyInlineContentLimit(content, cacheKey, format) {
|
|
|
14
14
|
return {
|
|
15
15
|
contentSize,
|
|
16
16
|
resourceUri,
|
|
17
|
-
resourceMimeType:
|
|
17
|
+
resourceMimeType: 'text/markdown',
|
|
18
18
|
};
|
|
19
19
|
}
|
|
20
20
|
function resolveResourceUri(cacheKey) {
|
|
21
21
|
if (!config.cache.enabled || !cacheKey)
|
|
22
22
|
return null;
|
|
23
|
-
return
|
|
24
|
-
}
|
|
25
|
-
function resolveResourceMimeType(format) {
|
|
26
|
-
return format === 'markdown' ? 'text/markdown' : 'application/jsonl';
|
|
23
|
+
return toResourceUri(cacheKey);
|
|
27
24
|
}
|
|
28
25
|
function buildTruncatedFallback(content, contentSize, inlineLimit) {
|
|
29
26
|
const maxContentLength = Math.max(0, inlineLimit - TRUNCATION_MARKER.length);
|