@j0hanz/superfetch 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/README.md +28 -17
  2. package/dist/config/index.js +11 -6
  3. package/dist/http/auth.js +161 -2
  4. package/dist/http/host-allowlist.d.ts +3 -0
  5. package/dist/http/host-allowlist.js +117 -0
  6. package/dist/http/mcp-routes.d.ts +8 -2
  7. package/dist/http/mcp-routes.js +101 -8
  8. package/dist/http/mcp-session-eviction.d.ts +3 -0
  9. package/dist/http/mcp-session-eviction.js +24 -0
  10. package/dist/http/mcp-session-init.d.ts +7 -0
  11. package/dist/http/mcp-session-init.js +94 -0
  12. package/dist/http/mcp-session-slots.d.ts +17 -0
  13. package/dist/http/mcp-session-slots.js +55 -0
  14. package/dist/http/mcp-session-transport-init.d.ts +7 -0
  15. package/dist/http/mcp-session-transport-init.js +41 -0
  16. package/dist/http/mcp-session-types.d.ts +5 -0
  17. package/dist/http/mcp-session-types.js +1 -0
  18. package/dist/http/mcp-session.d.ts +9 -9
  19. package/dist/http/mcp-session.js +5 -114
  20. package/dist/http/mcp-sessions.d.ts +43 -0
  21. package/dist/http/mcp-sessions.js +392 -0
  22. package/dist/http/rate-limit.js +2 -2
  23. package/dist/http/server-middleware.d.ts +6 -1
  24. package/dist/http/server-middleware.js +3 -117
  25. package/dist/http/server-shutdown.js +1 -1
  26. package/dist/http/server.d.ts +10 -0
  27. package/dist/http/server.js +508 -11
  28. package/dist/http/session-cleanup.js +8 -5
  29. package/dist/middleware/error-handler.d.ts +1 -1
  30. package/dist/middleware/error-handler.js +31 -30
  31. package/dist/resources/cached-content-params.d.ts +5 -0
  32. package/dist/resources/cached-content-params.js +36 -0
  33. package/dist/resources/cached-content.js +33 -33
  34. package/dist/server.js +1 -1
  35. package/dist/services/cache-events.d.ts +8 -0
  36. package/dist/services/cache-events.js +19 -0
  37. package/dist/services/cache.d.ts +5 -4
  38. package/dist/services/cache.js +49 -45
  39. package/dist/services/extractor.js +49 -38
  40. package/dist/services/fetcher/agents.js +1 -1
  41. package/dist/services/fetcher/dns-selection.js +1 -1
  42. package/dist/services/fetcher/interceptors.js +29 -60
  43. package/dist/services/fetcher/redirects.js +12 -4
  44. package/dist/services/fetcher/response.js +18 -8
  45. package/dist/services/fetcher.d.ts +21 -0
  46. package/dist/services/fetcher.js +532 -13
  47. package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
  48. package/dist/tools/handlers/fetch-single.shared.js +131 -2
  49. package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
  50. package/dist/tools/handlers/fetch-url.tool.js +48 -6
  51. package/dist/tools/utils/content-shaping.js +19 -4
  52. package/dist/tools/utils/content-transform.d.ts +4 -1
  53. package/dist/tools/utils/content-transform.js +110 -96
  54. package/dist/tools/utils/fetch-pipeline.js +47 -56
  55. package/dist/tools/utils/frontmatter.d.ts +3 -0
  56. package/dist/tools/utils/frontmatter.js +73 -0
  57. package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
  58. package/dist/tools/utils/markdown-heuristics.js +19 -0
  59. package/dist/tools/utils/markdown-signals.d.ts +1 -0
  60. package/dist/tools/utils/markdown-signals.js +19 -0
  61. package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
  62. package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
  63. package/dist/tools/utils/raw-markdown.d.ts +6 -0
  64. package/dist/tools/utils/raw-markdown.js +135 -0
  65. package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
  66. package/dist/transformers/markdown/fenced-code-rule.js +38 -0
  67. package/dist/transformers/markdown/frontmatter.d.ts +2 -0
  68. package/dist/transformers/markdown/frontmatter.js +45 -0
  69. package/dist/transformers/markdown/noise-rule.d.ts +2 -0
  70. package/dist/transformers/markdown/noise-rule.js +80 -0
  71. package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
  72. package/dist/transformers/markdown/turndown-instance.js +19 -0
  73. package/dist/transformers/markdown.d.ts +2 -0
  74. package/dist/transformers/markdown.js +185 -0
  75. package/dist/transformers/markdown.transformer.js +2 -189
  76. package/dist/utils/code-language-bash.d.ts +1 -0
  77. package/dist/utils/code-language-bash.js +48 -0
  78. package/dist/utils/code-language-core.d.ts +2 -0
  79. package/dist/utils/code-language-core.js +13 -0
  80. package/dist/utils/code-language-detectors.d.ts +5 -0
  81. package/dist/utils/code-language-detectors.js +142 -0
  82. package/dist/utils/code-language-helpers.d.ts +5 -0
  83. package/dist/utils/code-language-helpers.js +62 -0
  84. package/dist/utils/code-language-parsing.d.ts +5 -0
  85. package/dist/utils/code-language-parsing.js +62 -0
  86. package/dist/utils/code-language.d.ts +9 -0
  87. package/dist/utils/code-language.js +250 -46
  88. package/dist/utils/error-details.d.ts +3 -0
  89. package/dist/utils/error-details.js +12 -0
  90. package/dist/utils/filename-generator.js +14 -3
  91. package/dist/utils/ip-address.d.ts +4 -0
  92. package/dist/utils/ip-address.js +6 -0
  93. package/dist/utils/tool-error-handler.js +12 -17
  94. package/dist/utils/url-validator.js +33 -21
  95. package/package.json +7 -5
@@ -1,6 +1,4 @@
1
- import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
2
- import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
3
- import { applyInlineContentLimit } from '../utils/inline-content.js';
1
+ import type { FetchPipelineOptions, PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
4
2
  interface SharedFetchOptions<T extends {
5
3
  content: string;
6
4
  }> {
@@ -20,4 +18,14 @@ export declare function performSharedFetch<T extends {
20
18
  }>;
21
19
  export type InlineResult = ReturnType<typeof applyInlineContentLimit>;
22
20
  export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, url?: string, title?: string): ToolContentBlock[];
21
+ interface InlineContentResult {
22
+ content?: string;
23
+ contentSize: number;
24
+ resourceUri?: string;
25
+ resourceMimeType?: string;
26
+ error?: string;
27
+ truncated?: boolean;
28
+ }
29
+ declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
30
+ export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
23
31
  export {};
@@ -1,7 +1,13 @@
1
+ import { TRUNCATION_MARKER } from '../../config/formatting.js';
1
2
  import { config } from '../../config/index.js';
3
+ import * as cache from '../../services/cache.js';
4
+ import { createCacheKey, toResourceUri } from '../../services/cache-keys.js';
5
+ import { fetchNormalizedUrl } from '../../services/fetcher.js';
6
+ import { logDebug } from '../../services/logger.js';
2
7
  import { generateSafeFilename } from '../../utils/filename-generator.js';
3
- import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
4
- import { applyInlineContentLimit } from '../utils/inline-content.js';
8
+ import { isRecord } from '../../utils/guards.js';
9
+ import { transformToRawUrl } from '../../utils/url-transformer.js';
10
+ import { normalizeUrl } from '../../utils/url-validator.js';
5
11
  function applyOptionalPipelineSerialization(pipelineOptions, options) {
6
12
  if (options.serialize !== undefined) {
7
13
  pipelineOptions.serialize = options.serialize;
@@ -92,3 +98,126 @@ export function buildToolContentBlocks(structuredContent, fromCache, inlineResul
92
98
  maybeAppendResourceLink(blocks, inlineResult, resourceName);
93
99
  return blocks;
94
100
  }
101
+ function applyInlineContentLimit(content, cacheKey) {
102
+ const contentSize = content.length;
103
+ const inlineLimit = config.constants.maxInlineContentChars;
104
+ if (contentSize <= inlineLimit) {
105
+ return { content, contentSize };
106
+ }
107
+ const resourceUri = resolveResourceUri(cacheKey);
108
+ if (!resourceUri) {
109
+ return buildTruncatedFallback(content, contentSize, inlineLimit);
110
+ }
111
+ return {
112
+ contentSize,
113
+ resourceUri,
114
+ resourceMimeType: 'text/markdown',
115
+ };
116
+ }
117
+ function resolveResourceUri(cacheKey) {
118
+ if (!config.cache.enabled || !cacheKey)
119
+ return null;
120
+ return toResourceUri(cacheKey);
121
+ }
122
+ function buildTruncatedFallback(content, contentSize, inlineLimit) {
123
+ const maxContentLength = Math.max(0, inlineLimit - TRUNCATION_MARKER.length);
124
+ const truncatedContent = content.length > inlineLimit
125
+ ? `${content.substring(0, maxContentLength)}${TRUNCATION_MARKER}`
126
+ : content;
127
+ return {
128
+ content: truncatedContent,
129
+ contentSize,
130
+ truncated: true,
131
+ };
132
+ }
133
+ function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
134
+ if (!cacheKey)
135
+ return null;
136
+ const cached = cache.get(cacheKey);
137
+ if (!cached)
138
+ return null;
139
+ if (!deserialize) {
140
+ logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
141
+ return null;
142
+ }
143
+ const data = deserialize(cached.content);
144
+ if (data === undefined) {
145
+ logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
146
+ return null;
147
+ }
148
+ logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
149
+ return {
150
+ data,
151
+ fromCache: true,
152
+ url: normalizedUrl,
153
+ fetchedAt: cached.fetchedAt,
154
+ cacheKey,
155
+ };
156
+ }
157
+ function resolveNormalizedUrl(url) {
158
+ const { normalizedUrl: validatedUrl } = normalizeUrl(url);
159
+ const { url: normalizedUrl, transformed } = transformToRawUrl(validatedUrl);
160
+ return { normalizedUrl, originalUrl: validatedUrl, transformed };
161
+ }
162
+ export async function executeFetchPipeline(options) {
163
+ const resolvedUrl = resolveNormalizedUrl(options.url);
164
+ logRawUrlTransformation(resolvedUrl);
165
+ const cacheKey = createCacheKey(options.cacheNamespace, resolvedUrl.normalizedUrl, options.cacheVary);
166
+ const cachedResult = attemptCacheRetrieval({
167
+ cacheKey,
168
+ deserialize: options.deserialize,
169
+ cacheNamespace: options.cacheNamespace,
170
+ normalizedUrl: resolvedUrl.normalizedUrl,
171
+ });
172
+ if (cachedResult)
173
+ return cachedResult;
174
+ logDebug('Fetching URL', { url: resolvedUrl.normalizedUrl });
175
+ const fetchOptions = options.signal === undefined ? {} : { signal: options.signal };
176
+ const html = await fetchNormalizedUrl(resolvedUrl.normalizedUrl, fetchOptions);
177
+ const data = await options.transform(html, resolvedUrl.normalizedUrl);
178
+ if (cache.isEnabled()) {
179
+ persistCache({
180
+ cacheKey,
181
+ data,
182
+ serialize: options.serialize,
183
+ normalizedUrl: resolvedUrl.normalizedUrl,
184
+ });
185
+ }
186
+ return {
187
+ data,
188
+ fromCache: false,
189
+ url: resolvedUrl.normalizedUrl,
190
+ fetchedAt: new Date().toISOString(),
191
+ cacheKey,
192
+ };
193
+ }
194
+ function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
195
+ if (!cacheKey)
196
+ return;
197
+ const serializer = serialize ?? JSON.stringify;
198
+ const title = extractTitle(data);
199
+ const metadata = {
200
+ url: normalizedUrl,
201
+ ...(title === undefined ? {} : { title }),
202
+ };
203
+ cache.set(cacheKey, serializer(data), metadata);
204
+ }
205
+ function extractTitle(value) {
206
+ if (!isRecord(value))
207
+ return undefined;
208
+ const { title } = value;
209
+ return typeof title === 'string' ? title : undefined;
210
+ }
211
+ function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
212
+ logDebug(`Cache miss due to ${reason}`, {
213
+ namespace: cacheNamespace,
214
+ url: normalizedUrl,
215
+ });
216
+ }
217
+ function logRawUrlTransformation(resolvedUrl) {
218
+ if (!resolvedUrl.transformed)
219
+ return;
220
+ logDebug('Using transformed raw content URL', {
221
+ original: resolvedUrl.originalUrl,
222
+ });
223
+ }
@@ -1,4 +1,10 @@
1
+ import type { MarkdownTransformResult } from '../../config/types/content.js';
1
2
  import type { FetchUrlInput, ToolResponseBase } from '../../config/types/tools.js';
2
3
  export declare const FETCH_URL_TOOL_NAME = "fetch-url";
3
4
  export declare const FETCH_URL_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format";
5
+ type MarkdownPipelineResult = MarkdownTransformResult & {
6
+ readonly content: string;
7
+ };
8
+ export declare function parseCachedMarkdownResult(cached: string): MarkdownPipelineResult | undefined;
4
9
  export declare function fetchUrlToolHandler(input: FetchUrlInput): Promise<ToolResponseBase>;
10
+ export {};
@@ -1,10 +1,55 @@
1
1
  import { logDebug, logError } from '../../services/logger.js';
2
+ import { isRecord } from '../../utils/guards.js';
2
3
  import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
3
- import { parseCachedMarkdownResult } from '../utils/cached-markdown.js';
4
4
  import { transformHtmlToMarkdown } from '../utils/content-transform.js';
5
5
  import { buildToolContentBlocks, performSharedFetch, } from './fetch-single.shared.js';
6
6
  export const FETCH_URL_TOOL_NAME = 'fetch-url';
7
7
  export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format';
8
+ function parseJsonRecord(input) {
9
+ try {
10
+ const parsed = JSON.parse(input);
11
+ return isRecord(parsed) ? parsed : undefined;
12
+ }
13
+ catch {
14
+ return undefined;
15
+ }
16
+ }
17
+ function resolveMarkdownContent(parsed) {
18
+ const { markdown } = parsed;
19
+ if (typeof markdown === 'string')
20
+ return markdown;
21
+ const { content } = parsed;
22
+ if (typeof content === 'string')
23
+ return content;
24
+ return undefined;
25
+ }
26
+ function resolveOptionalTitle(parsed) {
27
+ const { title } = parsed;
28
+ if (title === undefined)
29
+ return undefined;
30
+ return typeof title === 'string' ? title : undefined;
31
+ }
32
+ function resolveTruncatedFlag(parsed) {
33
+ const { truncated } = parsed;
34
+ return typeof truncated === 'boolean' ? truncated : false;
35
+ }
36
+ export function parseCachedMarkdownResult(cached) {
37
+ const parsed = parseJsonRecord(cached);
38
+ if (!parsed)
39
+ return undefined;
40
+ const resolvedContent = resolveMarkdownContent(parsed);
41
+ if (resolvedContent === undefined)
42
+ return undefined;
43
+ const title = resolveOptionalTitle(parsed);
44
+ if (parsed.title !== undefined && title === undefined)
45
+ return undefined;
46
+ return {
47
+ content: resolvedContent,
48
+ markdown: resolvedContent,
49
+ title,
50
+ truncated: resolveTruncatedFlag(parsed),
51
+ };
52
+ }
8
53
  function deserializeMarkdownResult(cached) {
9
54
  return parseCachedMarkdownResult(cached);
10
55
  }
@@ -53,13 +98,10 @@ function buildResponse(pipeline, inlineResult) {
53
98
  };
54
99
  }
55
100
  export async function fetchUrlToolHandler(input) {
56
- try {
57
- return await executeFetch(input);
58
- }
59
- catch (error) {
101
+ return executeFetch(input).catch((error) => {
60
102
  logError('fetch-url tool error', error instanceof Error ? error : undefined);
61
103
  return handleToolError(error, input.url, 'Failed to fetch URL');
62
- }
104
+ });
63
105
  }
64
106
  async function executeFetch(input) {
65
107
  const { url } = input;
@@ -1,10 +1,25 @@
1
1
  const MIN_CONTENT_RATIO = 0.3;
2
2
  const MIN_HTML_LENGTH_FOR_GATE = 100;
3
+ function stripHtmlTags(html) {
4
+ const parts = [];
5
+ let inTag = false;
6
+ for (const char of html) {
7
+ if (char === '<') {
8
+ inTag = true;
9
+ continue;
10
+ }
11
+ if (char === '>') {
12
+ inTag = false;
13
+ continue;
14
+ }
15
+ if (!inTag) {
16
+ parts.push(char);
17
+ }
18
+ }
19
+ return parts.join('');
20
+ }
3
21
  function estimateTextLength(html) {
4
- return html
5
- .replace(/<[^>]*>/g, '')
6
- .replace(/\s+/g, ' ')
7
- .trim().length;
22
+ return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
8
23
  }
9
24
  export function isExtractionSufficient(article, originalHtml) {
10
25
  if (!article)
@@ -1,2 +1,5 @@
1
- import type { MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
1
+ import type { ExtractedArticle, ExtractedMetadata, MarkdownTransformResult, MetadataBlock, TransformOptions } from '../../config/types/content.js';
2
+ export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
3
+ export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
4
+ export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
2
5
  export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
@@ -1,9 +1,75 @@
1
1
  import { extractContent } from '../../services/extractor.js';
2
2
  import { logDebug } from '../../services/logger.js';
3
- import { isRawTextContentUrl } from '../../utils/url-transformer.js';
4
- import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
5
- import { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-shaping.js';
6
- function buildArticleContentSource(url, article, extractedMeta, includeMetadata) {
3
+ import { htmlToMarkdown } from '../../transformers/markdown.js';
4
+ import { tryTransformRawContent } from './raw-markdown.js';
5
+ const MIN_CONTENT_RATIO = 0.3;
6
+ const MIN_HTML_LENGTH_FOR_GATE = 100;
7
+ function stripHtmlTags(html) {
8
+ const parts = [];
9
+ let inTag = false;
10
+ for (const char of html) {
11
+ if (char === '<') {
12
+ inTag = true;
13
+ continue;
14
+ }
15
+ if (char === '>') {
16
+ inTag = false;
17
+ continue;
18
+ }
19
+ if (!inTag) {
20
+ parts.push(char);
21
+ }
22
+ }
23
+ return parts.join('');
24
+ }
25
+ function estimateTextLength(html) {
26
+ return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
27
+ }
28
+ export function isExtractionSufficient(article, originalHtml) {
29
+ if (!article)
30
+ return false;
31
+ const articleLength = article.textContent.length;
32
+ const originalLength = estimateTextLength(originalHtml);
33
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
34
+ return true;
35
+ return articleLength / originalLength >= MIN_CONTENT_RATIO;
36
+ }
37
+ export function determineContentExtractionSource(article) {
38
+ return !!article;
39
+ }
40
+ function applyArticleMetadata(metadata, article) {
41
+ if (article.title !== undefined)
42
+ metadata.title = article.title;
43
+ if (article.byline !== undefined)
44
+ metadata.author = article.byline;
45
+ }
46
+ function applyExtractedMetadata(metadata, extractedMeta) {
47
+ if (extractedMeta.title !== undefined)
48
+ metadata.title = extractedMeta.title;
49
+ if (extractedMeta.description !== undefined) {
50
+ metadata.description = extractedMeta.description;
51
+ }
52
+ if (extractedMeta.author !== undefined) {
53
+ metadata.author = extractedMeta.author;
54
+ }
55
+ }
56
+ export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
57
+ if (!includeMetadata)
58
+ return undefined;
59
+ const now = new Date().toISOString();
60
+ const metadata = {
61
+ type: 'metadata',
62
+ url,
63
+ fetchedAt: now,
64
+ };
65
+ if (shouldExtractFromArticle && article) {
66
+ applyArticleMetadata(metadata, article);
67
+ return metadata;
68
+ }
69
+ applyExtractedMetadata(metadata, extractedMeta);
70
+ return metadata;
71
+ }
72
+ function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
7
73
  const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
8
74
  return {
9
75
  sourceHtml: article.content,
@@ -11,7 +77,7 @@ function buildArticleContentSource(url, article, extractedMeta, includeMetadata)
11
77
  metadata,
12
78
  };
13
79
  }
14
- function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMetadata) {
80
+ function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
15
81
  const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
16
82
  return {
17
83
  sourceHtml: html,
@@ -19,119 +85,67 @@ function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMe
19
85
  metadata,
20
86
  };
21
87
  }
22
- function logQualityGateFallback(url, article) {
88
+ function logQualityGateFallback({ url, articleLength, }) {
23
89
  logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
24
90
  url: url.substring(0, 80),
25
- articleLength: article.textContent.length,
91
+ articleLength,
26
92
  });
27
93
  }
28
- function tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options) {
94
+ function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
29
95
  if (!article)
30
96
  return null;
31
97
  const shouldExtractFromArticle = determineContentExtractionSource(article);
32
98
  if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
33
- return buildArticleContentSource(url, article, extractedMeta, options.includeMetadata);
99
+ return buildArticleContentSource({
100
+ url,
101
+ article,
102
+ extractedMeta,
103
+ includeMetadata,
104
+ });
34
105
  }
35
106
  if (shouldExtractFromArticle) {
36
- logQualityGateFallback(url, article);
107
+ logQualityGateFallback({
108
+ url,
109
+ articleLength: article.textContent.length,
110
+ });
37
111
  }
38
112
  return null;
39
113
  }
40
- function resolveContentSource(html, url, options) {
114
+ function resolveContentSource({ html, url, includeMetadata, }) {
41
115
  const { article, metadata: extractedMeta } = extractContent(html, url, {
42
116
  extractArticle: true,
43
117
  });
44
- const extracted = tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options);
118
+ const extracted = tryBuildExtractedArticleContentSource({
119
+ html,
120
+ url,
121
+ article,
122
+ extractedMeta,
123
+ includeMetadata,
124
+ });
45
125
  if (extracted)
46
126
  return extracted;
47
- return buildFullHtmlContentSource(html, url, article, extractedMeta, options.includeMetadata);
48
- }
49
- function buildMarkdownPayload(context) {
50
- return htmlToMarkdown(context.sourceHtml, context.metadata);
51
- }
52
- function buildRawMarkdownPayload(rawContent, url, includeMetadata) {
53
- const title = extractTitleFromRawMarkdown(rawContent);
54
- const content = includeMetadata
55
- ? addSourceToMarkdown(rawContent, url)
56
- : rawContent;
57
- return { content, title };
58
- }
59
- function extractTitleFromRawMarkdown(content) {
60
- const frontmatterMatch = /^---\r?\n([\s\S]*?)\r?\n---/.exec(content);
61
- if (!frontmatterMatch)
62
- return undefined;
63
- const frontmatter = frontmatterMatch[1] ?? '';
64
- const titleMatch = /^(?:title|name):\s*["']?(.+?)["']?\s*$/im.exec(frontmatter);
65
- return titleMatch?.[1]?.trim();
66
- }
67
- function addSourceToMarkdown(content, url) {
68
- const frontmatterMatch = /^(---\r?\n)([\s\S]*?)(\r?\n---)/.exec(content);
69
- if (frontmatterMatch) {
70
- const start = frontmatterMatch[1] ?? '---\n';
71
- const existingFields = frontmatterMatch[2] ?? '';
72
- const end = frontmatterMatch[3] ?? '\n---';
73
- const rest = content.slice(frontmatterMatch[0].length);
74
- if (/^source:/im.test(existingFields)) {
75
- return content;
76
- }
77
- return `${start}${existingFields}\nsource: "${url}"${end}${rest}`;
78
- }
79
- return `---\nsource: "${url}"\n---\n\n${content}`;
80
- }
81
- function looksLikeHtmlDocument(trimmed) {
82
- return (trimmed.startsWith('<!DOCTYPE') ||
83
- trimmed.startsWith('<!doctype') ||
84
- trimmed.startsWith('<html') ||
85
- trimmed.startsWith('<HTML'));
86
- }
87
- function hasFrontmatter(trimmed) {
88
- return /^---\r?\n/.test(trimmed);
89
- }
90
- function countCommonHtmlTags(content) {
91
- const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
92
- [];
93
- return matches.length;
94
- }
95
- function looksLikeMarkdown(content) {
96
- const hasMarkdownHeadings = /^#{1,6}\s+/m.test(content);
97
- const hasMarkdownLists = /^[\s]*[-*+]\s+/m.test(content);
98
- const hasMarkdownCodeBlocks = /```[\s\S]*?```/.test(content);
99
- return hasMarkdownHeadings || hasMarkdownLists || hasMarkdownCodeBlocks;
100
- }
101
- function isRawTextContent(content) {
102
- const trimmed = content.trim();
103
- if (looksLikeHtmlDocument(trimmed)) {
104
- return false;
105
- }
106
- if (hasFrontmatter(trimmed)) {
107
- return true;
108
- }
109
- if (countCommonHtmlTags(content) > 2) {
110
- return false;
111
- }
112
- if (looksLikeMarkdown(content)) {
113
- return true;
114
- }
115
- return false;
116
- }
117
- function tryTransformRawContent(html, url, options) {
118
- if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
119
- return null;
120
- }
121
- logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
122
- const { content, title } = buildRawMarkdownPayload(html, url, options.includeMetadata);
123
- return {
124
- markdown: content,
125
- title,
126
- truncated: false,
127
- };
127
+ return buildFullHtmlContentSource({
128
+ html,
129
+ url,
130
+ article,
131
+ extractedMeta,
132
+ includeMetadata,
133
+ });
128
134
  }
129
135
  export function transformHtmlToMarkdown(html, url, options) {
130
- const raw = tryTransformRawContent(html, url, options);
136
+ const raw = tryTransformRawContent({
137
+ html,
138
+ url,
139
+ includeMetadata: options.includeMetadata,
140
+ });
131
141
  if (raw)
132
142
  return raw;
133
- const context = resolveContentSource(html, url, options);
134
- const content = buildMarkdownPayload(context);
143
+ const context = resolveContentSource({
144
+ html,
145
+ url,
146
+ includeMetadata: options.includeMetadata,
147
+ });
148
+ const content = htmlToMarkdown(context.sourceHtml, context.metadata);
135
149
  return {
136
150
  markdown: content,
137
151
  title: context.title,
@@ -5,19 +5,29 @@ import { logDebug } from '../../services/logger.js';
5
5
  import { isRecord } from '../../utils/guards.js';
6
6
  import { transformToRawUrl } from '../../utils/url-transformer.js';
7
7
  import { normalizeUrl } from '../../utils/url-validator.js';
8
- function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
8
+ function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
9
9
  if (!cacheKey)
10
10
  return null;
11
11
  const cached = cache.get(cacheKey);
12
12
  if (!cached)
13
13
  return null;
14
- if (!deserialize)
15
- return logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
14
+ if (!deserialize) {
15
+ logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
16
+ return null;
17
+ }
16
18
  const data = deserialize(cached.content);
17
- if (data === undefined)
18
- return logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
19
+ if (data === undefined) {
20
+ logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
21
+ return null;
22
+ }
19
23
  logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
20
- return buildCacheHitResult(data, cached.fetchedAt, normalizedUrl, cacheKey);
24
+ return {
25
+ data,
26
+ fromCache: true,
27
+ url: normalizedUrl,
28
+ fetchedAt: cached.fetchedAt,
29
+ cacheKey,
30
+ };
21
31
  }
22
32
  function resolveNormalizedUrl(url) {
23
33
  const { normalizedUrl: validatedUrl } = normalizeUrl(url);
@@ -27,44 +37,44 @@ function resolveNormalizedUrl(url) {
27
37
  export async function executeFetchPipeline(options) {
28
38
  const resolvedUrl = resolveNormalizedUrl(options.url);
29
39
  logRawUrlTransformation(resolvedUrl);
30
- const cacheKey = resolveCacheKey(options, resolvedUrl.normalizedUrl);
31
- const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, resolvedUrl.normalizedUrl);
40
+ const cacheKey = createCacheKey(options.cacheNamespace, resolvedUrl.normalizedUrl, options.cacheVary);
41
+ const cachedResult = attemptCacheRetrieval({
42
+ cacheKey,
43
+ deserialize: options.deserialize,
44
+ cacheNamespace: options.cacheNamespace,
45
+ normalizedUrl: resolvedUrl.normalizedUrl,
46
+ });
32
47
  if (cachedResult)
33
48
  return cachedResult;
34
- const data = await fetchAndTransform(options, resolvedUrl.normalizedUrl);
49
+ logDebug('Fetching URL', { url: resolvedUrl.normalizedUrl });
50
+ const fetchOptions = options.signal === undefined ? {} : { signal: options.signal };
51
+ const html = await fetchNormalizedUrl(resolvedUrl.normalizedUrl, fetchOptions);
52
+ const data = await options.transform(html, resolvedUrl.normalizedUrl);
35
53
  if (cache.isEnabled()) {
36
- persistCache(cacheKey, data, options.serialize, resolvedUrl.normalizedUrl);
37
- }
38
- return buildPipelineResult(resolvedUrl.normalizedUrl, data, cacheKey);
39
- }
40
- function resolveCacheKey(options, normalizedUrl) {
41
- return createCacheKey(options.cacheNamespace, normalizedUrl, options.cacheVary);
42
- }
43
- async function fetchAndTransform(options, normalizedUrl) {
44
- const fetchOptions = buildFetchOptions(options);
45
- logDebug('Fetching URL', { url: normalizedUrl });
46
- const html = await fetchNormalizedUrl(normalizedUrl, fetchOptions);
47
- return options.transform(html, normalizedUrl);
48
- }
49
- function buildFetchOptions(options) {
50
- return options.signal === undefined ? {} : { signal: options.signal };
51
- }
52
- function resolveCacheMetadata(data, normalizedUrl) {
53
- const metadata = { url: normalizedUrl };
54
- const title = extractTitle(data);
55
- if (title !== undefined) {
56
- metadata.title = title;
54
+ persistCache({
55
+ cacheKey,
56
+ data,
57
+ serialize: options.serialize,
58
+ normalizedUrl: resolvedUrl.normalizedUrl,
59
+ });
57
60
  }
58
- return metadata;
59
- }
60
- function resolveSerializer(serialize) {
61
- return serialize ?? JSON.stringify;
61
+ return {
62
+ data,
63
+ fromCache: false,
64
+ url: resolvedUrl.normalizedUrl,
65
+ fetchedAt: new Date().toISOString(),
66
+ cacheKey,
67
+ };
62
68
  }
63
- function persistCache(cacheKey, data, serialize, normalizedUrl) {
69
+ function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
64
70
  if (!cacheKey)
65
71
  return;
66
- const serializer = resolveSerializer(serialize);
67
- const metadata = resolveCacheMetadata(data, normalizedUrl);
72
+ const serializer = serialize ?? JSON.stringify;
73
+ const title = extractTitle(data);
74
+ const metadata = {
75
+ url: normalizedUrl,
76
+ ...(title === undefined ? {} : { title }),
77
+ };
68
78
  cache.set(cacheKey, serializer(data), metadata);
69
79
  }
70
80
  function extractTitle(value) {
@@ -78,7 +88,6 @@ function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
78
88
  namespace: cacheNamespace,
79
89
  url: normalizedUrl,
80
90
  });
81
- return null;
82
91
  }
83
92
  function logRawUrlTransformation(resolvedUrl) {
84
93
  if (!resolvedUrl.transformed)
@@ -87,21 +96,3 @@ function logRawUrlTransformation(resolvedUrl) {
87
96
  original: resolvedUrl.originalUrl,
88
97
  });
89
98
  }
90
- function buildCacheHitResult(data, fetchedAt, url, cacheKey) {
91
- return {
92
- data,
93
- fromCache: true,
94
- url,
95
- fetchedAt,
96
- cacheKey,
97
- };
98
- }
99
- function buildPipelineResult(url, data, cacheKey) {
100
- return {
101
- data,
102
- fromCache: false,
103
- url,
104
- fetchedAt: new Date().toISOString(),
105
- cacheKey,
106
- };
107
- }
@@ -0,0 +1,3 @@
1
+ export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
2
+ export declare function addSourceToMarkdown(content: string, url: string): string;
3
+ export declare function hasFrontmatter(trimmed: string): boolean;