@j0hanz/superfetch 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -17
- package/dist/config/index.js +11 -6
- package/dist/http/auth.js +161 -2
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +43 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server.d.ts +10 -0
- package/dist/http/server.js +508 -11
- package/dist/http/session-cleanup.js +8 -5
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +1 -1
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/extractor.js +49 -38
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +21 -0
- package/dist/services/fetcher.js +532 -13
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +48 -6
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +110 -96
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +135 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +2 -0
- package/dist/transformers/markdown.js +185 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.d.ts +9 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-validator.js +33 -21
- package/package.json +7 -5
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
2
|
-
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
3
|
-
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
1
|
+
import type { FetchPipelineOptions, PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
4
2
|
interface SharedFetchOptions<T extends {
|
|
5
3
|
content: string;
|
|
6
4
|
}> {
|
|
@@ -20,4 +18,14 @@ export declare function performSharedFetch<T extends {
|
|
|
20
18
|
}>;
|
|
21
19
|
export type InlineResult = ReturnType<typeof applyInlineContentLimit>;
|
|
22
20
|
export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, url?: string, title?: string): ToolContentBlock[];
|
|
21
|
+
interface InlineContentResult {
|
|
22
|
+
content?: string;
|
|
23
|
+
contentSize: number;
|
|
24
|
+
resourceUri?: string;
|
|
25
|
+
resourceMimeType?: string;
|
|
26
|
+
error?: string;
|
|
27
|
+
truncated?: boolean;
|
|
28
|
+
}
|
|
29
|
+
declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
|
|
30
|
+
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|
|
23
31
|
export {};
|
|
@@ -1,7 +1,13 @@
|
|
|
1
|
+
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
1
2
|
import { config } from '../../config/index.js';
|
|
3
|
+
import * as cache from '../../services/cache.js';
|
|
4
|
+
import { createCacheKey, toResourceUri } from '../../services/cache-keys.js';
|
|
5
|
+
import { fetchNormalizedUrl } from '../../services/fetcher.js';
|
|
6
|
+
import { logDebug } from '../../services/logger.js';
|
|
2
7
|
import { generateSafeFilename } from '../../utils/filename-generator.js';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
8
|
+
import { isRecord } from '../../utils/guards.js';
|
|
9
|
+
import { transformToRawUrl } from '../../utils/url-transformer.js';
|
|
10
|
+
import { normalizeUrl } from '../../utils/url-validator.js';
|
|
5
11
|
function applyOptionalPipelineSerialization(pipelineOptions, options) {
|
|
6
12
|
if (options.serialize !== undefined) {
|
|
7
13
|
pipelineOptions.serialize = options.serialize;
|
|
@@ -92,3 +98,126 @@ export function buildToolContentBlocks(structuredContent, fromCache, inlineResul
|
|
|
92
98
|
maybeAppendResourceLink(blocks, inlineResult, resourceName);
|
|
93
99
|
return blocks;
|
|
94
100
|
}
|
|
101
|
+
function applyInlineContentLimit(content, cacheKey) {
|
|
102
|
+
const contentSize = content.length;
|
|
103
|
+
const inlineLimit = config.constants.maxInlineContentChars;
|
|
104
|
+
if (contentSize <= inlineLimit) {
|
|
105
|
+
return { content, contentSize };
|
|
106
|
+
}
|
|
107
|
+
const resourceUri = resolveResourceUri(cacheKey);
|
|
108
|
+
if (!resourceUri) {
|
|
109
|
+
return buildTruncatedFallback(content, contentSize, inlineLimit);
|
|
110
|
+
}
|
|
111
|
+
return {
|
|
112
|
+
contentSize,
|
|
113
|
+
resourceUri,
|
|
114
|
+
resourceMimeType: 'text/markdown',
|
|
115
|
+
};
|
|
116
|
+
}
|
|
117
|
+
function resolveResourceUri(cacheKey) {
|
|
118
|
+
if (!config.cache.enabled || !cacheKey)
|
|
119
|
+
return null;
|
|
120
|
+
return toResourceUri(cacheKey);
|
|
121
|
+
}
|
|
122
|
+
function buildTruncatedFallback(content, contentSize, inlineLimit) {
|
|
123
|
+
const maxContentLength = Math.max(0, inlineLimit - TRUNCATION_MARKER.length);
|
|
124
|
+
const truncatedContent = content.length > inlineLimit
|
|
125
|
+
? `${content.substring(0, maxContentLength)}${TRUNCATION_MARKER}`
|
|
126
|
+
: content;
|
|
127
|
+
return {
|
|
128
|
+
content: truncatedContent,
|
|
129
|
+
contentSize,
|
|
130
|
+
truncated: true,
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
|
|
134
|
+
if (!cacheKey)
|
|
135
|
+
return null;
|
|
136
|
+
const cached = cache.get(cacheKey);
|
|
137
|
+
if (!cached)
|
|
138
|
+
return null;
|
|
139
|
+
if (!deserialize) {
|
|
140
|
+
logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
|
|
141
|
+
return null;
|
|
142
|
+
}
|
|
143
|
+
const data = deserialize(cached.content);
|
|
144
|
+
if (data === undefined) {
|
|
145
|
+
logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
|
|
146
|
+
return null;
|
|
147
|
+
}
|
|
148
|
+
logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
|
|
149
|
+
return {
|
|
150
|
+
data,
|
|
151
|
+
fromCache: true,
|
|
152
|
+
url: normalizedUrl,
|
|
153
|
+
fetchedAt: cached.fetchedAt,
|
|
154
|
+
cacheKey,
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
function resolveNormalizedUrl(url) {
|
|
158
|
+
const { normalizedUrl: validatedUrl } = normalizeUrl(url);
|
|
159
|
+
const { url: normalizedUrl, transformed } = transformToRawUrl(validatedUrl);
|
|
160
|
+
return { normalizedUrl, originalUrl: validatedUrl, transformed };
|
|
161
|
+
}
|
|
162
|
+
export async function executeFetchPipeline(options) {
|
|
163
|
+
const resolvedUrl = resolveNormalizedUrl(options.url);
|
|
164
|
+
logRawUrlTransformation(resolvedUrl);
|
|
165
|
+
const cacheKey = createCacheKey(options.cacheNamespace, resolvedUrl.normalizedUrl, options.cacheVary);
|
|
166
|
+
const cachedResult = attemptCacheRetrieval({
|
|
167
|
+
cacheKey,
|
|
168
|
+
deserialize: options.deserialize,
|
|
169
|
+
cacheNamespace: options.cacheNamespace,
|
|
170
|
+
normalizedUrl: resolvedUrl.normalizedUrl,
|
|
171
|
+
});
|
|
172
|
+
if (cachedResult)
|
|
173
|
+
return cachedResult;
|
|
174
|
+
logDebug('Fetching URL', { url: resolvedUrl.normalizedUrl });
|
|
175
|
+
const fetchOptions = options.signal === undefined ? {} : { signal: options.signal };
|
|
176
|
+
const html = await fetchNormalizedUrl(resolvedUrl.normalizedUrl, fetchOptions);
|
|
177
|
+
const data = await options.transform(html, resolvedUrl.normalizedUrl);
|
|
178
|
+
if (cache.isEnabled()) {
|
|
179
|
+
persistCache({
|
|
180
|
+
cacheKey,
|
|
181
|
+
data,
|
|
182
|
+
serialize: options.serialize,
|
|
183
|
+
normalizedUrl: resolvedUrl.normalizedUrl,
|
|
184
|
+
});
|
|
185
|
+
}
|
|
186
|
+
return {
|
|
187
|
+
data,
|
|
188
|
+
fromCache: false,
|
|
189
|
+
url: resolvedUrl.normalizedUrl,
|
|
190
|
+
fetchedAt: new Date().toISOString(),
|
|
191
|
+
cacheKey,
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
|
|
195
|
+
if (!cacheKey)
|
|
196
|
+
return;
|
|
197
|
+
const serializer = serialize ?? JSON.stringify;
|
|
198
|
+
const title = extractTitle(data);
|
|
199
|
+
const metadata = {
|
|
200
|
+
url: normalizedUrl,
|
|
201
|
+
...(title === undefined ? {} : { title }),
|
|
202
|
+
};
|
|
203
|
+
cache.set(cacheKey, serializer(data), metadata);
|
|
204
|
+
}
|
|
205
|
+
function extractTitle(value) {
|
|
206
|
+
if (!isRecord(value))
|
|
207
|
+
return undefined;
|
|
208
|
+
const { title } = value;
|
|
209
|
+
return typeof title === 'string' ? title : undefined;
|
|
210
|
+
}
|
|
211
|
+
function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
|
|
212
|
+
logDebug(`Cache miss due to ${reason}`, {
|
|
213
|
+
namespace: cacheNamespace,
|
|
214
|
+
url: normalizedUrl,
|
|
215
|
+
});
|
|
216
|
+
}
|
|
217
|
+
function logRawUrlTransformation(resolvedUrl) {
|
|
218
|
+
if (!resolvedUrl.transformed)
|
|
219
|
+
return;
|
|
220
|
+
logDebug('Using transformed raw content URL', {
|
|
221
|
+
original: resolvedUrl.originalUrl,
|
|
222
|
+
});
|
|
223
|
+
}
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
+
import type { MarkdownTransformResult } from '../../config/types/content.js';
|
|
1
2
|
import type { FetchUrlInput, ToolResponseBase } from '../../config/types/tools.js';
|
|
2
3
|
export declare const FETCH_URL_TOOL_NAME = "fetch-url";
|
|
3
4
|
export declare const FETCH_URL_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format";
|
|
5
|
+
type MarkdownPipelineResult = MarkdownTransformResult & {
|
|
6
|
+
readonly content: string;
|
|
7
|
+
};
|
|
8
|
+
export declare function parseCachedMarkdownResult(cached: string): MarkdownPipelineResult | undefined;
|
|
4
9
|
export declare function fetchUrlToolHandler(input: FetchUrlInput): Promise<ToolResponseBase>;
|
|
10
|
+
export {};
|
|
@@ -1,10 +1,55 @@
|
|
|
1
1
|
import { logDebug, logError } from '../../services/logger.js';
|
|
2
|
+
import { isRecord } from '../../utils/guards.js';
|
|
2
3
|
import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
|
|
3
|
-
import { parseCachedMarkdownResult } from '../utils/cached-markdown.js';
|
|
4
4
|
import { transformHtmlToMarkdown } from '../utils/content-transform.js';
|
|
5
5
|
import { buildToolContentBlocks, performSharedFetch, } from './fetch-single.shared.js';
|
|
6
6
|
export const FETCH_URL_TOOL_NAME = 'fetch-url';
|
|
7
7
|
export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format';
|
|
8
|
+
function parseJsonRecord(input) {
|
|
9
|
+
try {
|
|
10
|
+
const parsed = JSON.parse(input);
|
|
11
|
+
return isRecord(parsed) ? parsed : undefined;
|
|
12
|
+
}
|
|
13
|
+
catch {
|
|
14
|
+
return undefined;
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
function resolveMarkdownContent(parsed) {
|
|
18
|
+
const { markdown } = parsed;
|
|
19
|
+
if (typeof markdown === 'string')
|
|
20
|
+
return markdown;
|
|
21
|
+
const { content } = parsed;
|
|
22
|
+
if (typeof content === 'string')
|
|
23
|
+
return content;
|
|
24
|
+
return undefined;
|
|
25
|
+
}
|
|
26
|
+
function resolveOptionalTitle(parsed) {
|
|
27
|
+
const { title } = parsed;
|
|
28
|
+
if (title === undefined)
|
|
29
|
+
return undefined;
|
|
30
|
+
return typeof title === 'string' ? title : undefined;
|
|
31
|
+
}
|
|
32
|
+
function resolveTruncatedFlag(parsed) {
|
|
33
|
+
const { truncated } = parsed;
|
|
34
|
+
return typeof truncated === 'boolean' ? truncated : false;
|
|
35
|
+
}
|
|
36
|
+
export function parseCachedMarkdownResult(cached) {
|
|
37
|
+
const parsed = parseJsonRecord(cached);
|
|
38
|
+
if (!parsed)
|
|
39
|
+
return undefined;
|
|
40
|
+
const resolvedContent = resolveMarkdownContent(parsed);
|
|
41
|
+
if (resolvedContent === undefined)
|
|
42
|
+
return undefined;
|
|
43
|
+
const title = resolveOptionalTitle(parsed);
|
|
44
|
+
if (parsed.title !== undefined && title === undefined)
|
|
45
|
+
return undefined;
|
|
46
|
+
return {
|
|
47
|
+
content: resolvedContent,
|
|
48
|
+
markdown: resolvedContent,
|
|
49
|
+
title,
|
|
50
|
+
truncated: resolveTruncatedFlag(parsed),
|
|
51
|
+
};
|
|
52
|
+
}
|
|
8
53
|
function deserializeMarkdownResult(cached) {
|
|
9
54
|
return parseCachedMarkdownResult(cached);
|
|
10
55
|
}
|
|
@@ -53,13 +98,10 @@ function buildResponse(pipeline, inlineResult) {
|
|
|
53
98
|
};
|
|
54
99
|
}
|
|
55
100
|
export async function fetchUrlToolHandler(input) {
|
|
56
|
-
|
|
57
|
-
return await executeFetch(input);
|
|
58
|
-
}
|
|
59
|
-
catch (error) {
|
|
101
|
+
return executeFetch(input).catch((error) => {
|
|
60
102
|
logError('fetch-url tool error', error instanceof Error ? error : undefined);
|
|
61
103
|
return handleToolError(error, input.url, 'Failed to fetch URL');
|
|
62
|
-
}
|
|
104
|
+
});
|
|
63
105
|
}
|
|
64
106
|
async function executeFetch(input) {
|
|
65
107
|
const { url } = input;
|
|
@@ -1,10 +1,25 @@
|
|
|
1
1
|
const MIN_CONTENT_RATIO = 0.3;
|
|
2
2
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
3
|
+
function stripHtmlTags(html) {
|
|
4
|
+
const parts = [];
|
|
5
|
+
let inTag = false;
|
|
6
|
+
for (const char of html) {
|
|
7
|
+
if (char === '<') {
|
|
8
|
+
inTag = true;
|
|
9
|
+
continue;
|
|
10
|
+
}
|
|
11
|
+
if (char === '>') {
|
|
12
|
+
inTag = false;
|
|
13
|
+
continue;
|
|
14
|
+
}
|
|
15
|
+
if (!inTag) {
|
|
16
|
+
parts.push(char);
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
return parts.join('');
|
|
20
|
+
}
|
|
3
21
|
function estimateTextLength(html) {
|
|
4
|
-
return html
|
|
5
|
-
.replace(/<[^>]*>/g, '')
|
|
6
|
-
.replace(/\s+/g, ' ')
|
|
7
|
-
.trim().length;
|
|
22
|
+
return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
|
|
8
23
|
}
|
|
9
24
|
export function isExtractionSufficient(article, originalHtml) {
|
|
10
25
|
if (!article)
|
|
@@ -1,2 +1,5 @@
|
|
|
1
|
-
import type { MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, MarkdownTransformResult, MetadataBlock, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
|
|
3
|
+
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
4
|
+
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
2
5
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
|
@@ -1,9 +1,75 @@
|
|
|
1
1
|
import { extractContent } from '../../services/extractor.js';
|
|
2
2
|
import { logDebug } from '../../services/logger.js';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
import { htmlToMarkdown } from '../../transformers/markdown.js';
|
|
4
|
+
import { tryTransformRawContent } from './raw-markdown.js';
|
|
5
|
+
const MIN_CONTENT_RATIO = 0.3;
|
|
6
|
+
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
7
|
+
function stripHtmlTags(html) {
|
|
8
|
+
const parts = [];
|
|
9
|
+
let inTag = false;
|
|
10
|
+
for (const char of html) {
|
|
11
|
+
if (char === '<') {
|
|
12
|
+
inTag = true;
|
|
13
|
+
continue;
|
|
14
|
+
}
|
|
15
|
+
if (char === '>') {
|
|
16
|
+
inTag = false;
|
|
17
|
+
continue;
|
|
18
|
+
}
|
|
19
|
+
if (!inTag) {
|
|
20
|
+
parts.push(char);
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
return parts.join('');
|
|
24
|
+
}
|
|
25
|
+
function estimateTextLength(html) {
|
|
26
|
+
return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
|
|
27
|
+
}
|
|
28
|
+
export function isExtractionSufficient(article, originalHtml) {
|
|
29
|
+
if (!article)
|
|
30
|
+
return false;
|
|
31
|
+
const articleLength = article.textContent.length;
|
|
32
|
+
const originalLength = estimateTextLength(originalHtml);
|
|
33
|
+
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
34
|
+
return true;
|
|
35
|
+
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
36
|
+
}
|
|
37
|
+
export function determineContentExtractionSource(article) {
|
|
38
|
+
return !!article;
|
|
39
|
+
}
|
|
40
|
+
function applyArticleMetadata(metadata, article) {
|
|
41
|
+
if (article.title !== undefined)
|
|
42
|
+
metadata.title = article.title;
|
|
43
|
+
if (article.byline !== undefined)
|
|
44
|
+
metadata.author = article.byline;
|
|
45
|
+
}
|
|
46
|
+
function applyExtractedMetadata(metadata, extractedMeta) {
|
|
47
|
+
if (extractedMeta.title !== undefined)
|
|
48
|
+
metadata.title = extractedMeta.title;
|
|
49
|
+
if (extractedMeta.description !== undefined) {
|
|
50
|
+
metadata.description = extractedMeta.description;
|
|
51
|
+
}
|
|
52
|
+
if (extractedMeta.author !== undefined) {
|
|
53
|
+
metadata.author = extractedMeta.author;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
|
|
57
|
+
if (!includeMetadata)
|
|
58
|
+
return undefined;
|
|
59
|
+
const now = new Date().toISOString();
|
|
60
|
+
const metadata = {
|
|
61
|
+
type: 'metadata',
|
|
62
|
+
url,
|
|
63
|
+
fetchedAt: now,
|
|
64
|
+
};
|
|
65
|
+
if (shouldExtractFromArticle && article) {
|
|
66
|
+
applyArticleMetadata(metadata, article);
|
|
67
|
+
return metadata;
|
|
68
|
+
}
|
|
69
|
+
applyExtractedMetadata(metadata, extractedMeta);
|
|
70
|
+
return metadata;
|
|
71
|
+
}
|
|
72
|
+
function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
|
|
7
73
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
|
|
8
74
|
return {
|
|
9
75
|
sourceHtml: article.content,
|
|
@@ -11,7 +77,7 @@ function buildArticleContentSource(url, article, extractedMeta, includeMetadata)
|
|
|
11
77
|
metadata,
|
|
12
78
|
};
|
|
13
79
|
}
|
|
14
|
-
function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMetadata) {
|
|
80
|
+
function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
15
81
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
|
|
16
82
|
return {
|
|
17
83
|
sourceHtml: html,
|
|
@@ -19,119 +85,67 @@ function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMe
|
|
|
19
85
|
metadata,
|
|
20
86
|
};
|
|
21
87
|
}
|
|
22
|
-
function logQualityGateFallback(url,
|
|
88
|
+
function logQualityGateFallback({ url, articleLength, }) {
|
|
23
89
|
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
24
90
|
url: url.substring(0, 80),
|
|
25
|
-
articleLength
|
|
91
|
+
articleLength,
|
|
26
92
|
});
|
|
27
93
|
}
|
|
28
|
-
function tryBuildExtractedArticleContentSource(html, url, article, extractedMeta,
|
|
94
|
+
function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
29
95
|
if (!article)
|
|
30
96
|
return null;
|
|
31
97
|
const shouldExtractFromArticle = determineContentExtractionSource(article);
|
|
32
98
|
if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
|
|
33
|
-
return buildArticleContentSource(
|
|
99
|
+
return buildArticleContentSource({
|
|
100
|
+
url,
|
|
101
|
+
article,
|
|
102
|
+
extractedMeta,
|
|
103
|
+
includeMetadata,
|
|
104
|
+
});
|
|
34
105
|
}
|
|
35
106
|
if (shouldExtractFromArticle) {
|
|
36
|
-
logQualityGateFallback(
|
|
107
|
+
logQualityGateFallback({
|
|
108
|
+
url,
|
|
109
|
+
articleLength: article.textContent.length,
|
|
110
|
+
});
|
|
37
111
|
}
|
|
38
112
|
return null;
|
|
39
113
|
}
|
|
40
|
-
function resolveContentSource(html, url,
|
|
114
|
+
function resolveContentSource({ html, url, includeMetadata, }) {
|
|
41
115
|
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
42
116
|
extractArticle: true,
|
|
43
117
|
});
|
|
44
|
-
const extracted = tryBuildExtractedArticleContentSource(
|
|
118
|
+
const extracted = tryBuildExtractedArticleContentSource({
|
|
119
|
+
html,
|
|
120
|
+
url,
|
|
121
|
+
article,
|
|
122
|
+
extractedMeta,
|
|
123
|
+
includeMetadata,
|
|
124
|
+
});
|
|
45
125
|
if (extracted)
|
|
46
126
|
return extracted;
|
|
47
|
-
return buildFullHtmlContentSource(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
const content = includeMetadata
|
|
55
|
-
? addSourceToMarkdown(rawContent, url)
|
|
56
|
-
: rawContent;
|
|
57
|
-
return { content, title };
|
|
58
|
-
}
|
|
59
|
-
function extractTitleFromRawMarkdown(content) {
|
|
60
|
-
const frontmatterMatch = /^---\r?\n([\s\S]*?)\r?\n---/.exec(content);
|
|
61
|
-
if (!frontmatterMatch)
|
|
62
|
-
return undefined;
|
|
63
|
-
const frontmatter = frontmatterMatch[1] ?? '';
|
|
64
|
-
const titleMatch = /^(?:title|name):\s*["']?(.+?)["']?\s*$/im.exec(frontmatter);
|
|
65
|
-
return titleMatch?.[1]?.trim();
|
|
66
|
-
}
|
|
67
|
-
function addSourceToMarkdown(content, url) {
|
|
68
|
-
const frontmatterMatch = /^(---\r?\n)([\s\S]*?)(\r?\n---)/.exec(content);
|
|
69
|
-
if (frontmatterMatch) {
|
|
70
|
-
const start = frontmatterMatch[1] ?? '---\n';
|
|
71
|
-
const existingFields = frontmatterMatch[2] ?? '';
|
|
72
|
-
const end = frontmatterMatch[3] ?? '\n---';
|
|
73
|
-
const rest = content.slice(frontmatterMatch[0].length);
|
|
74
|
-
if (/^source:/im.test(existingFields)) {
|
|
75
|
-
return content;
|
|
76
|
-
}
|
|
77
|
-
return `${start}${existingFields}\nsource: "${url}"${end}${rest}`;
|
|
78
|
-
}
|
|
79
|
-
return `---\nsource: "${url}"\n---\n\n${content}`;
|
|
80
|
-
}
|
|
81
|
-
function looksLikeHtmlDocument(trimmed) {
|
|
82
|
-
return (trimmed.startsWith('<!DOCTYPE') ||
|
|
83
|
-
trimmed.startsWith('<!doctype') ||
|
|
84
|
-
trimmed.startsWith('<html') ||
|
|
85
|
-
trimmed.startsWith('<HTML'));
|
|
86
|
-
}
|
|
87
|
-
function hasFrontmatter(trimmed) {
|
|
88
|
-
return /^---\r?\n/.test(trimmed);
|
|
89
|
-
}
|
|
90
|
-
function countCommonHtmlTags(content) {
|
|
91
|
-
const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
|
|
92
|
-
[];
|
|
93
|
-
return matches.length;
|
|
94
|
-
}
|
|
95
|
-
function looksLikeMarkdown(content) {
|
|
96
|
-
const hasMarkdownHeadings = /^#{1,6}\s+/m.test(content);
|
|
97
|
-
const hasMarkdownLists = /^[\s]*[-*+]\s+/m.test(content);
|
|
98
|
-
const hasMarkdownCodeBlocks = /```[\s\S]*?```/.test(content);
|
|
99
|
-
return hasMarkdownHeadings || hasMarkdownLists || hasMarkdownCodeBlocks;
|
|
100
|
-
}
|
|
101
|
-
function isRawTextContent(content) {
|
|
102
|
-
const trimmed = content.trim();
|
|
103
|
-
if (looksLikeHtmlDocument(trimmed)) {
|
|
104
|
-
return false;
|
|
105
|
-
}
|
|
106
|
-
if (hasFrontmatter(trimmed)) {
|
|
107
|
-
return true;
|
|
108
|
-
}
|
|
109
|
-
if (countCommonHtmlTags(content) > 2) {
|
|
110
|
-
return false;
|
|
111
|
-
}
|
|
112
|
-
if (looksLikeMarkdown(content)) {
|
|
113
|
-
return true;
|
|
114
|
-
}
|
|
115
|
-
return false;
|
|
116
|
-
}
|
|
117
|
-
function tryTransformRawContent(html, url, options) {
|
|
118
|
-
if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
|
|
119
|
-
return null;
|
|
120
|
-
}
|
|
121
|
-
logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
|
|
122
|
-
const { content, title } = buildRawMarkdownPayload(html, url, options.includeMetadata);
|
|
123
|
-
return {
|
|
124
|
-
markdown: content,
|
|
125
|
-
title,
|
|
126
|
-
truncated: false,
|
|
127
|
-
};
|
|
127
|
+
return buildFullHtmlContentSource({
|
|
128
|
+
html,
|
|
129
|
+
url,
|
|
130
|
+
article,
|
|
131
|
+
extractedMeta,
|
|
132
|
+
includeMetadata,
|
|
133
|
+
});
|
|
128
134
|
}
|
|
129
135
|
export function transformHtmlToMarkdown(html, url, options) {
|
|
130
|
-
const raw = tryTransformRawContent(
|
|
136
|
+
const raw = tryTransformRawContent({
|
|
137
|
+
html,
|
|
138
|
+
url,
|
|
139
|
+
includeMetadata: options.includeMetadata,
|
|
140
|
+
});
|
|
131
141
|
if (raw)
|
|
132
142
|
return raw;
|
|
133
|
-
const context = resolveContentSource(
|
|
134
|
-
|
|
143
|
+
const context = resolveContentSource({
|
|
144
|
+
html,
|
|
145
|
+
url,
|
|
146
|
+
includeMetadata: options.includeMetadata,
|
|
147
|
+
});
|
|
148
|
+
const content = htmlToMarkdown(context.sourceHtml, context.metadata);
|
|
135
149
|
return {
|
|
136
150
|
markdown: content,
|
|
137
151
|
title: context.title,
|
|
@@ -5,19 +5,29 @@ import { logDebug } from '../../services/logger.js';
|
|
|
5
5
|
import { isRecord } from '../../utils/guards.js';
|
|
6
6
|
import { transformToRawUrl } from '../../utils/url-transformer.js';
|
|
7
7
|
import { normalizeUrl } from '../../utils/url-validator.js';
|
|
8
|
-
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
8
|
+
function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
|
|
9
9
|
if (!cacheKey)
|
|
10
10
|
return null;
|
|
11
11
|
const cached = cache.get(cacheKey);
|
|
12
12
|
if (!cached)
|
|
13
13
|
return null;
|
|
14
|
-
if (!deserialize)
|
|
15
|
-
|
|
14
|
+
if (!deserialize) {
|
|
15
|
+
logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
|
|
16
|
+
return null;
|
|
17
|
+
}
|
|
16
18
|
const data = deserialize(cached.content);
|
|
17
|
-
if (data === undefined)
|
|
18
|
-
|
|
19
|
+
if (data === undefined) {
|
|
20
|
+
logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
|
|
21
|
+
return null;
|
|
22
|
+
}
|
|
19
23
|
logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
|
|
20
|
-
return
|
|
24
|
+
return {
|
|
25
|
+
data,
|
|
26
|
+
fromCache: true,
|
|
27
|
+
url: normalizedUrl,
|
|
28
|
+
fetchedAt: cached.fetchedAt,
|
|
29
|
+
cacheKey,
|
|
30
|
+
};
|
|
21
31
|
}
|
|
22
32
|
function resolveNormalizedUrl(url) {
|
|
23
33
|
const { normalizedUrl: validatedUrl } = normalizeUrl(url);
|
|
@@ -27,44 +37,44 @@ function resolveNormalizedUrl(url) {
|
|
|
27
37
|
export async function executeFetchPipeline(options) {
|
|
28
38
|
const resolvedUrl = resolveNormalizedUrl(options.url);
|
|
29
39
|
logRawUrlTransformation(resolvedUrl);
|
|
30
|
-
const cacheKey =
|
|
31
|
-
const cachedResult = attemptCacheRetrieval(
|
|
40
|
+
const cacheKey = createCacheKey(options.cacheNamespace, resolvedUrl.normalizedUrl, options.cacheVary);
|
|
41
|
+
const cachedResult = attemptCacheRetrieval({
|
|
42
|
+
cacheKey,
|
|
43
|
+
deserialize: options.deserialize,
|
|
44
|
+
cacheNamespace: options.cacheNamespace,
|
|
45
|
+
normalizedUrl: resolvedUrl.normalizedUrl,
|
|
46
|
+
});
|
|
32
47
|
if (cachedResult)
|
|
33
48
|
return cachedResult;
|
|
34
|
-
|
|
49
|
+
logDebug('Fetching URL', { url: resolvedUrl.normalizedUrl });
|
|
50
|
+
const fetchOptions = options.signal === undefined ? {} : { signal: options.signal };
|
|
51
|
+
const html = await fetchNormalizedUrl(resolvedUrl.normalizedUrl, fetchOptions);
|
|
52
|
+
const data = await options.transform(html, resolvedUrl.normalizedUrl);
|
|
35
53
|
if (cache.isEnabled()) {
|
|
36
|
-
persistCache(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
}
|
|
43
|
-
async function fetchAndTransform(options, normalizedUrl) {
|
|
44
|
-
const fetchOptions = buildFetchOptions(options);
|
|
45
|
-
logDebug('Fetching URL', { url: normalizedUrl });
|
|
46
|
-
const html = await fetchNormalizedUrl(normalizedUrl, fetchOptions);
|
|
47
|
-
return options.transform(html, normalizedUrl);
|
|
48
|
-
}
|
|
49
|
-
function buildFetchOptions(options) {
|
|
50
|
-
return options.signal === undefined ? {} : { signal: options.signal };
|
|
51
|
-
}
|
|
52
|
-
function resolveCacheMetadata(data, normalizedUrl) {
|
|
53
|
-
const metadata = { url: normalizedUrl };
|
|
54
|
-
const title = extractTitle(data);
|
|
55
|
-
if (title !== undefined) {
|
|
56
|
-
metadata.title = title;
|
|
54
|
+
persistCache({
|
|
55
|
+
cacheKey,
|
|
56
|
+
data,
|
|
57
|
+
serialize: options.serialize,
|
|
58
|
+
normalizedUrl: resolvedUrl.normalizedUrl,
|
|
59
|
+
});
|
|
57
60
|
}
|
|
58
|
-
return
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
return {
|
|
62
|
+
data,
|
|
63
|
+
fromCache: false,
|
|
64
|
+
url: resolvedUrl.normalizedUrl,
|
|
65
|
+
fetchedAt: new Date().toISOString(),
|
|
66
|
+
cacheKey,
|
|
67
|
+
};
|
|
62
68
|
}
|
|
63
|
-
function persistCache(cacheKey, data, serialize, normalizedUrl) {
|
|
69
|
+
function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
|
|
64
70
|
if (!cacheKey)
|
|
65
71
|
return;
|
|
66
|
-
const serializer =
|
|
67
|
-
const
|
|
72
|
+
const serializer = serialize ?? JSON.stringify;
|
|
73
|
+
const title = extractTitle(data);
|
|
74
|
+
const metadata = {
|
|
75
|
+
url: normalizedUrl,
|
|
76
|
+
...(title === undefined ? {} : { title }),
|
|
77
|
+
};
|
|
68
78
|
cache.set(cacheKey, serializer(data), metadata);
|
|
69
79
|
}
|
|
70
80
|
function extractTitle(value) {
|
|
@@ -78,7 +88,6 @@ function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
|
|
|
78
88
|
namespace: cacheNamespace,
|
|
79
89
|
url: normalizedUrl,
|
|
80
90
|
});
|
|
81
|
-
return null;
|
|
82
91
|
}
|
|
83
92
|
function logRawUrlTransformation(resolvedUrl) {
|
|
84
93
|
if (!resolvedUrl.transformed)
|
|
@@ -87,21 +96,3 @@ function logRawUrlTransformation(resolvedUrl) {
|
|
|
87
96
|
original: resolvedUrl.originalUrl,
|
|
88
97
|
});
|
|
89
98
|
}
|
|
90
|
-
function buildCacheHitResult(data, fetchedAt, url, cacheKey) {
|
|
91
|
-
return {
|
|
92
|
-
data,
|
|
93
|
-
fromCache: true,
|
|
94
|
-
url,
|
|
95
|
-
fetchedAt,
|
|
96
|
-
cacheKey,
|
|
97
|
-
};
|
|
98
|
-
}
|
|
99
|
-
function buildPipelineResult(url, data, cacheKey) {
|
|
100
|
-
return {
|
|
101
|
-
data,
|
|
102
|
-
fromCache: false,
|
|
103
|
-
url,
|
|
104
|
-
fetchedAt: new Date().toISOString(),
|
|
105
|
-
cacheKey,
|
|
106
|
-
};
|
|
107
|
-
}
|