@j0hanz/superfetch 1.2.5 → 2.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (116)
  1. package/README.md +116 -152
  2. package/dist/config/auth-config.d.ts +16 -0
  3. package/dist/config/auth-config.js +53 -0
  4. package/dist/config/constants.d.ts +11 -13
  5. package/dist/config/constants.js +1 -3
  6. package/dist/config/env-parsers.d.ts +7 -0
  7. package/dist/config/env-parsers.js +84 -0
  8. package/dist/config/formatting.d.ts +2 -2
  9. package/dist/config/index.d.ts +47 -53
  10. package/dist/config/index.js +25 -59
  11. package/dist/config/types/content.d.ts +1 -49
  12. package/dist/config/types/runtime.d.ts +8 -16
  13. package/dist/config/types/tools.d.ts +2 -28
  14. package/dist/http/accept-policy.d.ts +3 -0
  15. package/dist/http/accept-policy.js +45 -0
  16. package/dist/http/async-handler.d.ts +2 -0
  17. package/dist/http/async-handler.js +5 -0
  18. package/dist/http/auth-introspection.d.ts +2 -0
  19. package/dist/http/auth-introspection.js +141 -0
  20. package/dist/http/auth-static.d.ts +2 -0
  21. package/dist/http/auth-static.js +23 -0
  22. package/dist/http/auth.d.ts +3 -2
  23. package/dist/http/auth.js +98 -26
  24. package/dist/http/cors.d.ts +6 -6
  25. package/dist/http/cors.js +7 -42
  26. package/dist/http/download-routes.d.ts +0 -12
  27. package/dist/http/download-routes.js +21 -58
  28. package/dist/http/jsonrpc-http.d.ts +2 -0
  29. package/dist/http/jsonrpc-http.js +10 -0
  30. package/dist/http/mcp-routes.d.ts +0 -1
  31. package/dist/http/mcp-routes.js +43 -30
  32. package/dist/http/mcp-session-helpers.d.ts +0 -1
  33. package/dist/http/mcp-session-helpers.js +1 -1
  34. package/dist/http/mcp-session-transport.d.ts +7 -0
  35. package/dist/http/mcp-session-transport.js +57 -0
  36. package/dist/http/mcp-session.js +60 -73
  37. package/dist/http/mcp-validation.d.ts +1 -0
  38. package/dist/http/mcp-validation.js +11 -10
  39. package/dist/http/protocol-policy.d.ts +2 -0
  40. package/dist/http/protocol-policy.js +31 -0
  41. package/dist/http/rate-limit.js +5 -2
  42. package/dist/http/server-config.d.ts +1 -0
  43. package/dist/http/server-config.js +40 -0
  44. package/dist/http/server-middleware.d.ts +2 -9
  45. package/dist/http/server-middleware.js +96 -43
  46. package/dist/http/server-shutdown.d.ts +4 -0
  47. package/dist/http/server-shutdown.js +43 -0
  48. package/dist/http/server.js +52 -64
  49. package/dist/http/session-cleanup.js +1 -1
  50. package/dist/middleware/error-handler.js +1 -3
  51. package/dist/resources/cached-content.js +50 -108
  52. package/dist/resources/index.js +0 -82
  53. package/dist/server.js +51 -30
  54. package/dist/services/cache-keys.d.ts +7 -0
  55. package/dist/services/cache-keys.js +57 -0
  56. package/dist/services/cache.d.ts +1 -7
  57. package/dist/services/cache.js +53 -119
  58. package/dist/services/context.d.ts +0 -1
  59. package/dist/services/context.js +0 -7
  60. package/dist/services/extractor.js +10 -82
  61. package/dist/services/fetcher/agents.d.ts +2 -2
  62. package/dist/services/fetcher/agents.js +34 -95
  63. package/dist/services/fetcher/dns-selection.d.ts +2 -0
  64. package/dist/services/fetcher/dns-selection.js +72 -0
  65. package/dist/services/fetcher/interceptors.d.ts +0 -22
  66. package/dist/services/fetcher/interceptors.js +30 -13
  67. package/dist/services/fetcher/redirects.js +4 -3
  68. package/dist/services/fetcher/response.js +66 -31
  69. package/dist/services/fetcher.d.ts +1 -3
  70. package/dist/services/fetcher.js +14 -33
  71. package/dist/services/fifo-queue.d.ts +8 -0
  72. package/dist/services/fifo-queue.js +25 -0
  73. package/dist/services/logger.js +2 -2
  74. package/dist/services/metadata-collector.d.ts +1 -9
  75. package/dist/services/metadata-collector.js +71 -2
  76. package/dist/services/transform-worker-pool.d.ts +4 -14
  77. package/dist/services/transform-worker-pool.js +177 -129
  78. package/dist/services/transform-worker-types.d.ts +32 -0
  79. package/dist/services/transform-worker-types.js +14 -0
  80. package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
  81. package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
  82. package/dist/tools/handlers/fetch-single.shared.d.ts +1 -20
  83. package/dist/tools/handlers/fetch-single.shared.js +44 -87
  84. package/dist/tools/handlers/fetch-url.tool.d.ts +1 -1
  85. package/dist/tools/handlers/fetch-url.tool.js +46 -123
  86. package/dist/tools/index.js +21 -40
  87. package/dist/tools/schemas.d.ts +1 -51
  88. package/dist/tools/schemas.js +1 -107
  89. package/dist/tools/utils/cached-markdown.d.ts +5 -0
  90. package/dist/tools/utils/cached-markdown.js +46 -0
  91. package/dist/tools/utils/content-shaping.d.ts +4 -0
  92. package/dist/tools/utils/content-shaping.js +52 -0
  93. package/dist/tools/utils/content-transform.d.ts +2 -17
  94. package/dist/tools/utils/content-transform.js +120 -114
  95. package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
  96. package/dist/tools/utils/fetch-pipeline.js +65 -62
  97. package/dist/tools/utils/inline-content.d.ts +1 -2
  98. package/dist/tools/utils/inline-content.js +4 -7
  99. package/dist/transformers/markdown.transformer.js +109 -34
  100. package/dist/utils/cached-payload.d.ts +7 -0
  101. package/dist/utils/cached-payload.js +36 -0
  102. package/dist/utils/error-utils.js +1 -1
  103. package/dist/utils/filename-generator.js +21 -10
  104. package/dist/utils/guards.d.ts +1 -0
  105. package/dist/utils/guards.js +3 -0
  106. package/dist/utils/header-normalizer.d.ts +0 -3
  107. package/dist/utils/header-normalizer.js +3 -3
  108. package/dist/utils/tool-error-handler.d.ts +2 -2
  109. package/dist/utils/tool-error-handler.js +11 -38
  110. package/dist/utils/url-transformer.d.ts +7 -0
  111. package/dist/utils/url-transformer.js +147 -0
  112. package/dist/utils/url-validator.d.ts +1 -2
  113. package/dist/utils/url-validator.js +20 -93
  114. package/dist/workers/content-transform.worker.d.ts +1 -0
  115. package/dist/workers/content-transform.worker.js +40 -0
  116. package/package.json +13 -16
@@ -0,0 +1,46 @@
1
+ import { isRecord } from '../../utils/guards.js';
2
+ function parseJsonRecord(input) {
3
+ try {
4
+ const parsed = JSON.parse(input);
5
+ return isRecord(parsed) ? parsed : undefined;
6
+ }
7
+ catch {
8
+ return undefined;
9
+ }
10
+ }
11
+ function resolveMarkdownContent(parsed) {
12
+ const { markdown } = parsed;
13
+ if (typeof markdown === 'string')
14
+ return markdown;
15
+ const { content } = parsed;
16
+ if (typeof content === 'string')
17
+ return content;
18
+ return undefined;
19
+ }
20
+ function resolveOptionalTitle(parsed) {
21
+ const { title } = parsed;
22
+ if (title === undefined)
23
+ return undefined;
24
+ return typeof title === 'string' ? title : undefined;
25
+ }
26
+ function resolveTruncatedFlag(parsed) {
27
+ const { truncated } = parsed;
28
+ return typeof truncated === 'boolean' ? truncated : false;
29
+ }
30
+ export function parseCachedMarkdownResult(cached) {
31
+ const parsed = parseJsonRecord(cached);
32
+ if (!parsed)
33
+ return undefined;
34
+ const resolvedContent = resolveMarkdownContent(parsed);
35
+ if (resolvedContent === undefined)
36
+ return undefined;
37
+ const title = resolveOptionalTitle(parsed);
38
+ if (parsed.title !== undefined && title === undefined)
39
+ return undefined;
40
+ return {
41
+ content: resolvedContent,
42
+ markdown: resolvedContent,
43
+ title,
44
+ truncated: resolveTruncatedFlag(parsed),
45
+ };
46
+ }
@@ -0,0 +1,4 @@
1
+ import type { ExtractedArticle, ExtractedMetadata, MetadataBlock } from '../../config/types/content.js';
2
+ export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
3
+ export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
4
+ export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
@@ -0,0 +1,52 @@
1
+ const MIN_CONTENT_RATIO = 0.3;
2
+ const MIN_HTML_LENGTH_FOR_GATE = 100;
3
+ function estimateTextLength(html) {
4
+ return html
5
+ .replace(/<[^>]*>/g, '')
6
+ .replace(/\s+/g, ' ')
7
+ .trim().length;
8
+ }
9
+ export function isExtractionSufficient(article, originalHtml) {
10
+ if (!article)
11
+ return false;
12
+ const articleLength = article.textContent.length;
13
+ const originalLength = estimateTextLength(originalHtml);
14
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
15
+ return true;
16
+ return articleLength / originalLength >= MIN_CONTENT_RATIO;
17
+ }
18
+ export function determineContentExtractionSource(article) {
19
+ return !!article;
20
+ }
21
+ function applyArticleMetadata(metadata, article) {
22
+ if (article.title !== undefined)
23
+ metadata.title = article.title;
24
+ if (article.byline !== undefined)
25
+ metadata.author = article.byline;
26
+ }
27
+ function applyExtractedMetadata(metadata, extractedMeta) {
28
+ if (extractedMeta.title !== undefined)
29
+ metadata.title = extractedMeta.title;
30
+ if (extractedMeta.description !== undefined) {
31
+ metadata.description = extractedMeta.description;
32
+ }
33
+ if (extractedMeta.author !== undefined) {
34
+ metadata.author = extractedMeta.author;
35
+ }
36
+ }
37
+ export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
38
+ if (!includeMetadata)
39
+ return undefined;
40
+ const now = new Date().toISOString();
41
+ const metadata = {
42
+ type: 'metadata',
43
+ url,
44
+ fetchedAt: now,
45
+ };
46
+ if (shouldExtractFromArticle && article) {
47
+ applyArticleMetadata(metadata, article);
48
+ return metadata;
49
+ }
50
+ applyExtractedMetadata(metadata, extractedMeta);
51
+ return metadata;
52
+ }
@@ -1,17 +1,2 @@
1
- import type { JsonlTransformResult, MarkdownTransformResult } from '../../config/types/content.js';
2
- interface ExtractionOptions {
3
- readonly extractMainContent: boolean;
4
- readonly includeMetadata: boolean;
5
- }
6
- interface ContentLengthOptions {
7
- readonly maxContentLength?: number;
8
- }
9
- interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
10
- }
11
- interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
12
- readonly includeContentBlocks?: boolean;
13
- }
14
- export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
15
- export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
16
- export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
17
- export {};
1
+ import type { MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
2
+ export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
@@ -1,134 +1,140 @@
1
- import { TRUNCATION_MARKER } from '../../config/formatting.js';
2
1
  import { extractContent } from '../../services/extractor.js';
3
- import { parseHtml, parseHtmlWithMetadata } from '../../services/parser.js';
4
- import { sanitizeText } from '../../utils/sanitizer.js';
5
- import { toJsonl } from '../../transformers/jsonl.transformer.js';
2
+ import { logDebug } from '../../services/logger.js';
3
+ import { isRawTextContentUrl } from '../../utils/url-transformer.js';
6
4
  import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
7
- import { createContentMetadataBlock, determineContentExtractionSource, truncateContent, } from './common.js';
8
- const TITLE_PATTERN = /<title[^>]*>([\s\S]*?)<\/title>/i;
9
- function resolveContentSource(html, url, options) {
10
- if (!options.extractMainContent && !options.includeMetadata) {
11
- return {
12
- sourceHtml: html,
13
- title: extractTitleFromHtml(html),
14
- metadata: undefined,
15
- };
5
+ import { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-shaping.js';
6
+ function buildArticleContentSource(url, article, extractedMeta, includeMetadata) {
7
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
8
+ return {
9
+ sourceHtml: article.content,
10
+ title: article.title,
11
+ metadata,
12
+ };
13
+ }
14
+ function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMetadata) {
15
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
16
+ return {
17
+ sourceHtml: html,
18
+ title: extractedMeta.title,
19
+ metadata,
20
+ };
21
+ }
22
+ function logQualityGateFallback(url, article) {
23
+ logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
24
+ url: url.substring(0, 80),
25
+ articleLength: article.textContent.length,
26
+ });
27
+ }
28
+ function tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options) {
29
+ if (!article)
30
+ return null;
31
+ const shouldExtractFromArticle = determineContentExtractionSource(article);
32
+ if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
33
+ return buildArticleContentSource(url, article, extractedMeta, options.includeMetadata);
16
34
  }
35
+ if (shouldExtractFromArticle) {
36
+ logQualityGateFallback(url, article);
37
+ }
38
+ return null;
39
+ }
40
+ function resolveContentSource(html, url, options) {
17
41
  const { article, metadata: extractedMeta } = extractContent(html, url, {
18
- extractArticle: options.extractMainContent,
42
+ extractArticle: true,
19
43
  });
20
- const shouldExtractFromArticle = determineContentExtractionSource(options.extractMainContent, article);
21
- const sourceHtml = shouldExtractFromArticle ? article.content : html;
22
- const metadata = createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, options.includeMetadata);
23
- const title = shouldExtractFromArticle ? article.title : extractedMeta.title;
24
- return { sourceHtml, title, metadata };
25
- }
26
- function extractTitleFromHtml(html) {
27
- const match = TITLE_PATTERN.exec(html);
28
- if (!match?.[1])
44
+ const extracted = tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options);
45
+ if (extracted)
46
+ return extracted;
47
+ return buildFullHtmlContentSource(html, url, article, extractedMeta, options.includeMetadata);
48
+ }
49
+ function buildMarkdownPayload(context) {
50
+ return htmlToMarkdown(context.sourceHtml, context.metadata);
51
+ }
52
+ function buildRawMarkdownPayload(rawContent, url, includeMetadata) {
53
+ const title = extractTitleFromRawMarkdown(rawContent);
54
+ const content = includeMetadata
55
+ ? addSourceToMarkdown(rawContent, url)
56
+ : rawContent;
57
+ return { content, title };
58
+ }
59
+ function extractTitleFromRawMarkdown(content) {
60
+ const frontmatterMatch = /^---\r?\n([\s\S]*?)\r?\n---/.exec(content);
61
+ if (!frontmatterMatch)
29
62
  return undefined;
30
- const decoded = decodeHtmlEntities(match[1]);
31
- const text = sanitizeText(decoded);
32
- return text || undefined;
33
- }
34
- function decodeHtmlEntities(value) {
35
- if (!value.includes('&'))
36
- return value;
37
- const basicDecoded = value
38
- .replace(/&amp;/g, '&')
39
- .replace(/&lt;/g, '<')
40
- .replace(/&gt;/g, '>')
41
- .replace(/&quot;/g, '"')
42
- .replace(/&#39;/g, "'");
43
- return basicDecoded
44
- .replace(/&#(\d+);/g, (match, code) => {
45
- const parsed = Number.parseInt(code, 10);
46
- return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
47
- ? String.fromCodePoint(parsed)
48
- : match;
49
- })
50
- .replace(/&#x([0-9a-fA-F]+);/g, (match, code) => {
51
- const parsed = Number.parseInt(code, 16);
52
- return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
53
- ? String.fromCodePoint(parsed)
54
- : match;
55
- });
63
+ const frontmatter = frontmatterMatch[1] ?? '';
64
+ const titleMatch = /^(?:title|name):\s*["']?(.+?)["']?\s*$/im.exec(frontmatter);
65
+ return titleMatch?.[1]?.trim();
56
66
  }
57
- function buildJsonlPayload(context, maxContentLength) {
58
- const contentBlocks = parseHtml(context.sourceHtml);
59
- return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
67
+ function addSourceToMarkdown(content, url) {
68
+ const frontmatterMatch = /^(---\r?\n)([\s\S]*?)(\r?\n---)/.exec(content);
69
+ if (frontmatterMatch) {
70
+ const start = frontmatterMatch[1] ?? '---\n';
71
+ const existingFields = frontmatterMatch[2] ?? '';
72
+ const end = frontmatterMatch[3] ?? '\n---';
73
+ const rest = content.slice(frontmatterMatch[0].length);
74
+ if (/^source:/im.test(existingFields)) {
75
+ return content;
76
+ }
77
+ return `${start}${existingFields}\nsource: "${url}"${end}${rest}`;
78
+ }
79
+ return `---\nsource: "${url}"\n---\n\n${content}`;
60
80
  }
61
- function buildJsonlPayloadFromBlocks(contentBlocks, metadata, maxContentLength) {
62
- const { content, truncated } = truncateContent(toJsonl(contentBlocks, metadata), maxContentLength);
63
- return {
64
- content,
65
- contentBlocks: contentBlocks.length,
66
- truncated,
67
- };
81
+ function looksLikeHtmlDocument(trimmed) {
82
+ return (trimmed.startsWith('<!DOCTYPE') ||
83
+ trimmed.startsWith('<!doctype') ||
84
+ trimmed.startsWith('<html') ||
85
+ trimmed.startsWith('<HTML'));
86
+ }
87
+ function hasFrontmatter(trimmed) {
88
+ return /^---\r?\n/.test(trimmed);
89
+ }
90
+ function countCommonHtmlTags(content) {
91
+ const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
92
+ [];
93
+ return matches.length;
68
94
  }
69
- function buildMarkdownPayload(context, maxContentLength) {
70
- const markdown = htmlToMarkdown(context.sourceHtml, context.metadata);
71
- const { content, truncated } = truncateContent(markdown, maxContentLength, TRUNCATION_MARKER);
72
- return { content, truncated };
73
- }
74
- export function transformHtmlToJsonl(html, url, options) {
75
- if (!options.extractMainContent && options.includeMetadata) {
76
- const parsed = parseHtmlWithMetadata(html);
77
- const metadataBlock = createContentMetadataBlock(url, null, parsed.metadata, false, true);
78
- const { content, contentBlocks, truncated } = buildJsonlPayloadFromBlocks(parsed.blocks, metadataBlock, options.maxContentLength);
79
- return {
80
- content,
81
- contentBlocks,
82
- title: parsed.metadata.title,
83
- ...(truncated && { truncated }),
84
- };
95
+ function looksLikeMarkdown(content) {
96
+ const hasMarkdownHeadings = /^#{1,6}\s+/m.test(content);
97
+ const hasMarkdownLists = /^[\s]*[-*+]\s+/m.test(content);
98
+ const hasMarkdownCodeBlocks = /```[\s\S]*?```/.test(content);
99
+ return hasMarkdownHeadings || hasMarkdownLists || hasMarkdownCodeBlocks;
100
+ }
101
+ function isRawTextContent(content) {
102
+ const trimmed = content.trim();
103
+ if (looksLikeHtmlDocument(trimmed)) {
104
+ return false;
85
105
  }
86
- const context = resolveContentSource(html, url, options);
87
- const { content, contentBlocks, truncated } = buildJsonlPayload(context, options.maxContentLength);
88
- return {
89
- content,
90
- contentBlocks,
91
- title: context.title,
92
- ...(truncated && { truncated }),
93
- };
106
+ if (hasFrontmatter(trimmed)) {
107
+ return true;
108
+ }
109
+ if (countCommonHtmlTags(content) > 2) {
110
+ return false;
111
+ }
112
+ if (looksLikeMarkdown(content)) {
113
+ return true;
114
+ }
115
+ return false;
94
116
  }
95
- export function transformHtmlToMarkdown(html, url, options) {
96
- const context = resolveContentSource(html, url, options);
97
- const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
117
+ function tryTransformRawContent(html, url, options) {
118
+ if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
119
+ return null;
120
+ }
121
+ logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
122
+ const { content, title } = buildRawMarkdownPayload(html, url, options.includeMetadata);
98
123
  return {
99
124
  markdown: content,
100
- title: context.title,
101
- truncated,
125
+ title,
126
+ truncated: false,
102
127
  };
103
128
  }
104
- export function transformHtmlToMarkdownWithBlocks(html, url, options) {
105
- const includeContentBlocks = options.includeContentBlocks ?? true;
106
- if (includeContentBlocks &&
107
- !options.extractMainContent &&
108
- options.includeMetadata) {
109
- const parsed = parseHtmlWithMetadata(html);
110
- const context = {
111
- sourceHtml: html,
112
- title: parsed.metadata.title,
113
- metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
114
- };
115
- const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
116
- return {
117
- content,
118
- contentBlocks: parsed.blocks.length,
119
- title: context.title,
120
- ...(truncated && { truncated }),
121
- };
122
- }
129
+ export function transformHtmlToMarkdown(html, url, options) {
130
+ const raw = tryTransformRawContent(html, url, options);
131
+ if (raw)
132
+ return raw;
123
133
  const context = resolveContentSource(html, url, options);
124
- const contentBlocks = includeContentBlocks
125
- ? parseHtml(context.sourceHtml)
126
- : [];
127
- const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
134
+ const content = buildMarkdownPayload(context);
128
135
  return {
129
- content,
130
- contentBlocks: contentBlocks.length,
136
+ markdown: content,
131
137
  title: context.title,
132
- ...(truncated && { truncated }),
138
+ truncated: false,
133
139
  };
134
140
  }
@@ -1,10 +1,2 @@
1
1
  import type { FetchPipelineOptions, PipelineResult } from '../../config/types/runtime.js';
2
- /**
3
- * Unified fetch pipeline that handles caching, fetching, and transformation.
4
- * Implements cache-first strategy with automatic serialization.
5
- *
6
- * @template T - Type of the transformed result
7
- * @param options - Pipeline configuration options
8
- * @returns Promise resolving to the pipeline result
9
- */
10
2
  export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
@@ -1,98 +1,101 @@
1
1
  import * as cache from '../../services/cache.js';
2
- import { fetchNormalizedUrlWithRetry } from '../../services/fetcher.js';
2
+ import { createCacheKey } from '../../services/cache-keys.js';
3
+ import { fetchNormalizedUrl } from '../../services/fetcher.js';
3
4
  import { logDebug } from '../../services/logger.js';
4
- import { assertResolvedAddressesAllowed, normalizeUrl, } from '../../utils/url-validator.js';
5
- import { appendHeaderVary } from './cache-vary.js';
5
+ import { isRecord } from '../../utils/guards.js';
6
+ import { transformToRawUrl } from '../../utils/url-transformer.js';
7
+ import { normalizeUrl } from '../../utils/url-validator.js';
6
8
  function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
7
9
  if (!cacheKey)
8
10
  return null;
9
11
  const cached = cache.get(cacheKey);
10
12
  if (!cached)
11
13
  return null;
12
- if (!deserialize) {
13
- logDebug('Cache miss due to missing deserializer', {
14
- namespace: cacheNamespace,
15
- url: normalizedUrl,
16
- });
17
- return null;
18
- }
14
+ if (!deserialize)
15
+ return logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
19
16
  const data = deserialize(cached.content);
20
- if (data === undefined) {
21
- logDebug('Cache miss due to deserialize failure', {
22
- namespace: cacheNamespace,
23
- url: normalizedUrl,
24
- });
25
- return null;
26
- }
17
+ if (data === undefined)
18
+ return logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
27
19
  logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
28
- return {
29
- data,
30
- fromCache: true,
31
- url: normalizedUrl,
32
- fetchedAt: cached.fetchedAt,
33
- cacheKey,
34
- };
20
+ return buildCacheHitResult(data, cached.fetchedAt, normalizedUrl, cacheKey);
21
+ }
22
+ function resolveNormalizedUrl(url) {
23
+ const { normalizedUrl: validatedUrl } = normalizeUrl(url);
24
+ const { url: normalizedUrl, transformed } = transformToRawUrl(validatedUrl);
25
+ return { normalizedUrl, originalUrl: validatedUrl, transformed };
35
26
  }
36
- /**
37
- * Unified fetch pipeline that handles caching, fetching, and transformation.
38
- * Implements cache-first strategy with automatic serialization.
39
- *
40
- * @template T - Type of the transformed result
41
- * @param options - Pipeline configuration options
42
- * @returns Promise resolving to the pipeline result
43
- */
44
27
  export async function executeFetchPipeline(options) {
45
- const { normalizedUrl, hostname } = normalizeUrl(options.url);
46
- const cacheKey = resolveCacheKey(options, normalizedUrl);
47
- const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, normalizedUrl);
28
+ const resolvedUrl = resolveNormalizedUrl(options.url);
29
+ logRawUrlTransformation(resolvedUrl);
30
+ const cacheKey = resolveCacheKey(options, resolvedUrl.normalizedUrl);
31
+ const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, resolvedUrl.normalizedUrl);
48
32
  if (cachedResult)
49
33
  return cachedResult;
50
- await assertResolvedAddressesAllowed(hostname);
51
- const fetchOptions = buildFetchOptions(options);
52
- logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
53
- const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
54
- const data = await options.transform(html, normalizedUrl);
34
+ const data = await fetchAndTransform(options, resolvedUrl.normalizedUrl);
55
35
  if (cache.isEnabled()) {
56
- persistCache(cacheKey, data, options.serialize, normalizedUrl);
36
+ persistCache(cacheKey, data, options.serialize, resolvedUrl.normalizedUrl);
57
37
  }
58
- return buildPipelineResult(normalizedUrl, data, cacheKey);
38
+ return buildPipelineResult(resolvedUrl.normalizedUrl, data, cacheKey);
59
39
  }
60
40
  function resolveCacheKey(options, normalizedUrl) {
61
- const cacheVary = appendHeaderVary(options.cacheVary, options.customHeaders);
62
- return cache.createCacheKey(options.cacheNamespace, normalizedUrl, cacheVary);
41
+ return createCacheKey(options.cacheNamespace, normalizedUrl, options.cacheVary);
42
+ }
43
+ async function fetchAndTransform(options, normalizedUrl) {
44
+ const fetchOptions = buildFetchOptions(options);
45
+ logDebug('Fetching URL', { url: normalizedUrl });
46
+ const html = await fetchNormalizedUrl(normalizedUrl, fetchOptions);
47
+ return options.transform(html, normalizedUrl);
63
48
  }
64
49
  function buildFetchOptions(options) {
65
- const fetchOptions = {};
66
- if (options.customHeaders !== undefined) {
67
- fetchOptions.customHeaders = options.customHeaders;
68
- }
69
- if (options.signal !== undefined) {
70
- fetchOptions.signal = options.signal;
71
- }
72
- if (options.timeout !== undefined) {
73
- fetchOptions.timeout = options.timeout;
74
- }
75
- return fetchOptions;
50
+ return options.signal === undefined ? {} : { signal: options.signal };
76
51
  }
77
- function persistCache(cacheKey, data, serialize, normalizedUrl) {
78
- if (!cacheKey)
79
- return;
80
- const serializer = serialize ?? JSON.stringify;
52
+ function resolveCacheMetadata(data, normalizedUrl) {
81
53
  const metadata = { url: normalizedUrl };
82
54
  const title = extractTitle(data);
83
55
  if (title !== undefined) {
84
56
  metadata.title = title;
85
57
  }
58
+ return metadata;
59
+ }
60
+ function resolveSerializer(serialize) {
61
+ return serialize ?? JSON.stringify;
62
+ }
63
+ function persistCache(cacheKey, data, serialize, normalizedUrl) {
64
+ if (!cacheKey)
65
+ return;
66
+ const serializer = resolveSerializer(serialize);
67
+ const metadata = resolveCacheMetadata(data, normalizedUrl);
86
68
  cache.set(cacheKey, serializer(data), metadata);
87
69
  }
88
70
  function extractTitle(value) {
89
- if (!value || typeof value !== 'object')
90
- return undefined;
91
- if (!('title' in value))
71
+ if (!isRecord(value))
92
72
  return undefined;
93
73
  const { title } = value;
94
74
  return typeof title === 'string' ? title : undefined;
95
75
  }
76
+ function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
77
+ logDebug(`Cache miss due to ${reason}`, {
78
+ namespace: cacheNamespace,
79
+ url: normalizedUrl,
80
+ });
81
+ return null;
82
+ }
83
+ function logRawUrlTransformation(resolvedUrl) {
84
+ if (!resolvedUrl.transformed)
85
+ return;
86
+ logDebug('Using transformed raw content URL', {
87
+ original: resolvedUrl.originalUrl,
88
+ });
89
+ }
90
+ function buildCacheHitResult(data, fetchedAt, url, cacheKey) {
91
+ return {
92
+ data,
93
+ fromCache: true,
94
+ url,
95
+ fetchedAt,
96
+ cacheKey,
97
+ };
98
+ }
96
99
  function buildPipelineResult(url, data, cacheKey) {
97
100
  return {
98
101
  data,
@@ -1,4 +1,3 @@
1
- type InlineContentFormat = 'jsonl' | 'markdown';
2
1
  interface InlineContentResult {
3
2
  content?: string;
4
3
  contentSize: number;
@@ -7,5 +6,5 @@ interface InlineContentResult {
7
6
  error?: string;
8
7
  truncated?: boolean;
9
8
  }
10
- export declare function applyInlineContentLimit(content: string, cacheKey: string | null, format: InlineContentFormat): InlineContentResult;
9
+ export declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
11
10
  export {};
@@ -1,7 +1,7 @@
1
1
  import { TRUNCATION_MARKER } from '../../config/formatting.js';
2
2
  import { config } from '../../config/index.js';
3
- import * as cache from '../../services/cache.js';
4
- export function applyInlineContentLimit(content, cacheKey, format) {
3
+ import { toResourceUri } from '../../services/cache-keys.js';
4
+ export function applyInlineContentLimit(content, cacheKey) {
5
5
  const contentSize = content.length;
6
6
  const inlineLimit = config.constants.maxInlineContentChars;
7
7
  if (contentSize <= inlineLimit) {
@@ -14,16 +14,13 @@ export function applyInlineContentLimit(content, cacheKey, format) {
14
14
  return {
15
15
  contentSize,
16
16
  resourceUri,
17
- resourceMimeType: resolveResourceMimeType(format),
17
+ resourceMimeType: 'text/markdown',
18
18
  };
19
19
  }
20
20
  function resolveResourceUri(cacheKey) {
21
21
  if (!config.cache.enabled || !cacheKey)
22
22
  return null;
23
- return cache.toResourceUri(cacheKey);
24
- }
25
- function resolveResourceMimeType(format) {
26
- return format === 'markdown' ? 'text/markdown' : 'application/jsonl';
23
+ return toResourceUri(cacheKey);
27
24
  }
28
25
  function buildTruncatedFallback(content, contentSize, inlineLimit) {
29
26
  const maxContentLength = Math.max(0, inlineLimit - TRUNCATION_MARKER.length);