@j0hanz/superfetch 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/README.md +139 -46
  2. package/dist/cache.d.ts +42 -0
  3. package/dist/cache.js +565 -0
  4. package/dist/config/env-parsers.d.ts +1 -0
  5. package/dist/config/env-parsers.js +12 -0
  6. package/dist/config/index.d.ts +7 -0
  7. package/dist/config/index.js +20 -8
  8. package/dist/config/types/content.d.ts +1 -0
  9. package/dist/config.d.ts +77 -0
  10. package/dist/config.js +261 -0
  11. package/dist/crypto.d.ts +2 -0
  12. package/dist/crypto.js +32 -0
  13. package/dist/errors.d.ts +10 -0
  14. package/dist/errors.js +28 -0
  15. package/dist/fetch.d.ts +40 -0
  16. package/dist/fetch.js +910 -0
  17. package/dist/http/auth.js +161 -2
  18. package/dist/http/base-middleware.d.ts +7 -0
  19. package/dist/http/base-middleware.js +143 -0
  20. package/dist/http/cors.d.ts +0 -5
  21. package/dist/http/cors.js +0 -6
  22. package/dist/http/download-routes.js +6 -2
  23. package/dist/http/error-handler.d.ts +2 -0
  24. package/dist/http/error-handler.js +55 -0
  25. package/dist/http/host-allowlist.d.ts +3 -0
  26. package/dist/http/host-allowlist.js +117 -0
  27. package/dist/http/mcp-routes.d.ts +8 -2
  28. package/dist/http/mcp-routes.js +101 -8
  29. package/dist/http/mcp-session-eviction.d.ts +3 -0
  30. package/dist/http/mcp-session-eviction.js +24 -0
  31. package/dist/http/mcp-session-init.d.ts +7 -0
  32. package/dist/http/mcp-session-init.js +94 -0
  33. package/dist/http/mcp-session-slots.d.ts +17 -0
  34. package/dist/http/mcp-session-slots.js +55 -0
  35. package/dist/http/mcp-session-transport-init.d.ts +7 -0
  36. package/dist/http/mcp-session-transport-init.js +41 -0
  37. package/dist/http/mcp-session-types.d.ts +5 -0
  38. package/dist/http/mcp-session-types.js +1 -0
  39. package/dist/http/mcp-session.d.ts +9 -9
  40. package/dist/http/mcp-session.js +5 -114
  41. package/dist/http/mcp-sessions.d.ts +41 -0
  42. package/dist/http/mcp-sessions.js +392 -0
  43. package/dist/http/rate-limit.js +2 -2
  44. package/dist/http/server-middleware.d.ts +6 -1
  45. package/dist/http/server-middleware.js +3 -117
  46. package/dist/http/server-shutdown.js +1 -1
  47. package/dist/http/server-tuning.d.ts +9 -0
  48. package/dist/http/server-tuning.js +45 -0
  49. package/dist/http/server.js +206 -9
  50. package/dist/http/session-cleanup.js +8 -5
  51. package/dist/http.d.ts +78 -0
  52. package/dist/http.js +1437 -0
  53. package/dist/index.js +3 -3
  54. package/dist/mcp.d.ts +3 -0
  55. package/dist/mcp.js +94 -0
  56. package/dist/middleware/error-handler.d.ts +1 -1
  57. package/dist/middleware/error-handler.js +31 -30
  58. package/dist/observability.d.ts +16 -0
  59. package/dist/observability.js +78 -0
  60. package/dist/resources/cached-content-params.d.ts +5 -0
  61. package/dist/resources/cached-content-params.js +36 -0
  62. package/dist/resources/cached-content.js +33 -33
  63. package/dist/server.js +21 -6
  64. package/dist/services/cache-events.d.ts +8 -0
  65. package/dist/services/cache-events.js +19 -0
  66. package/dist/services/cache.d.ts +5 -4
  67. package/dist/services/cache.js +49 -45
  68. package/dist/services/context.d.ts +2 -0
  69. package/dist/services/context.js +3 -0
  70. package/dist/services/extractor.d.ts +1 -0
  71. package/dist/services/extractor.js +77 -40
  72. package/dist/services/fetcher/agents.js +1 -1
  73. package/dist/services/fetcher/dns-selection.js +1 -1
  74. package/dist/services/fetcher/interceptors.js +29 -60
  75. package/dist/services/fetcher/redirects.js +12 -4
  76. package/dist/services/fetcher/response.js +18 -8
  77. package/dist/services/fetcher.d.ts +23 -0
  78. package/dist/services/fetcher.js +553 -13
  79. package/dist/services/logger.js +4 -1
  80. package/dist/services/telemetry.d.ts +19 -0
  81. package/dist/services/telemetry.js +43 -0
  82. package/dist/services/transform-worker-pool.d.ts +10 -3
  83. package/dist/services/transform-worker-pool.js +213 -184
  84. package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
  85. package/dist/tools/handlers/fetch-single.shared.js +131 -2
  86. package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
  87. package/dist/tools/handlers/fetch-url.tool.js +56 -12
  88. package/dist/tools/index.d.ts +1 -0
  89. package/dist/tools/index.js +13 -1
  90. package/dist/tools/schemas.d.ts +2 -0
  91. package/dist/tools/schemas.js +8 -0
  92. package/dist/tools/utils/content-shaping.js +19 -4
  93. package/dist/tools/utils/content-transform-core.d.ts +5 -0
  94. package/dist/tools/utils/content-transform-core.js +180 -0
  95. package/dist/tools/utils/content-transform-workers.d.ts +1 -0
  96. package/dist/tools/utils/content-transform-workers.js +1 -0
  97. package/dist/tools/utils/content-transform.d.ts +2 -1
  98. package/dist/tools/utils/content-transform.js +37 -136
  99. package/dist/tools/utils/fetch-pipeline.js +47 -56
  100. package/dist/tools/utils/frontmatter.d.ts +3 -0
  101. package/dist/tools/utils/frontmatter.js +73 -0
  102. package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
  103. package/dist/tools/utils/markdown-heuristics.js +19 -0
  104. package/dist/tools/utils/markdown-signals.d.ts +1 -0
  105. package/dist/tools/utils/markdown-signals.js +19 -0
  106. package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
  107. package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
  108. package/dist/tools/utils/raw-markdown.d.ts +6 -0
  109. package/dist/tools/utils/raw-markdown.js +149 -0
  110. package/dist/tools.d.ts +104 -0
  111. package/dist/tools.js +421 -0
  112. package/dist/transform.d.ts +69 -0
  113. package/dist/transform.js +1509 -0
  114. package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
  115. package/dist/transformers/markdown/fenced-code-rule.js +38 -0
  116. package/dist/transformers/markdown/frontmatter.d.ts +2 -0
  117. package/dist/transformers/markdown/frontmatter.js +45 -0
  118. package/dist/transformers/markdown/noise-rule.d.ts +2 -0
  119. package/dist/transformers/markdown/noise-rule.js +80 -0
  120. package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
  121. package/dist/transformers/markdown/turndown-instance.js +19 -0
  122. package/dist/transformers/markdown.d.ts +5 -0
  123. package/dist/transformers/markdown.js +314 -0
  124. package/dist/transformers/markdown.transformer.js +2 -189
  125. package/dist/utils/cancellation.d.ts +1 -0
  126. package/dist/utils/cancellation.js +18 -0
  127. package/dist/utils/code-language-bash.d.ts +1 -0
  128. package/dist/utils/code-language-bash.js +48 -0
  129. package/dist/utils/code-language-core.d.ts +2 -0
  130. package/dist/utils/code-language-core.js +13 -0
  131. package/dist/utils/code-language-detectors.d.ts +5 -0
  132. package/dist/utils/code-language-detectors.js +142 -0
  133. package/dist/utils/code-language-helpers.d.ts +5 -0
  134. package/dist/utils/code-language-helpers.js +62 -0
  135. package/dist/utils/code-language-parsing.d.ts +5 -0
  136. package/dist/utils/code-language-parsing.js +62 -0
  137. package/dist/utils/code-language.js +250 -46
  138. package/dist/utils/error-details.d.ts +3 -0
  139. package/dist/utils/error-details.js +12 -0
  140. package/dist/utils/filename-generator.js +14 -3
  141. package/dist/utils/host-normalizer.d.ts +1 -0
  142. package/dist/utils/host-normalizer.js +37 -0
  143. package/dist/utils/ip-address.d.ts +4 -0
  144. package/dist/utils/ip-address.js +6 -0
  145. package/dist/utils/tool-error-handler.js +12 -17
  146. package/dist/utils/url-redactor.d.ts +1 -0
  147. package/dist/utils/url-redactor.js +13 -0
  148. package/dist/utils/url-validator.js +35 -20
  149. package/dist/workers/transform-worker.js +82 -38
  150. package/package.json +13 -10
@@ -1,16 +1,61 @@
1
1
  import { logDebug, logError } from '../../services/logger.js';
2
+ import { isRecord } from '../../utils/guards.js';
2
3
  import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
3
- import { parseCachedMarkdownResult } from '../utils/cached-markdown.js';
4
4
  import { transformHtmlToMarkdown } from '../utils/content-transform.js';
5
5
  import { buildToolContentBlocks, performSharedFetch, } from './fetch-single.shared.js';
6
6
  export const FETCH_URL_TOOL_NAME = 'fetch-url';
7
7
  export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format';
8
+ function parseJsonRecord(input) {
9
+ try {
10
+ const parsed = JSON.parse(input);
11
+ return isRecord(parsed) ? parsed : undefined;
12
+ }
13
+ catch {
14
+ return undefined;
15
+ }
16
+ }
17
+ function resolveMarkdownContent(parsed) {
18
+ const { markdown } = parsed;
19
+ if (typeof markdown === 'string')
20
+ return markdown;
21
+ const { content } = parsed;
22
+ if (typeof content === 'string')
23
+ return content;
24
+ return undefined;
25
+ }
26
+ function resolveOptionalTitle(parsed) {
27
+ const { title } = parsed;
28
+ if (title === undefined)
29
+ return undefined;
30
+ return typeof title === 'string' ? title : undefined;
31
+ }
32
+ function resolveTruncatedFlag(parsed) {
33
+ const { truncated } = parsed;
34
+ return typeof truncated === 'boolean' ? truncated : false;
35
+ }
36
+ export function parseCachedMarkdownResult(cached) {
37
+ const parsed = parseJsonRecord(cached);
38
+ if (!parsed)
39
+ return undefined;
40
+ const resolvedContent = resolveMarkdownContent(parsed);
41
+ if (resolvedContent === undefined)
42
+ return undefined;
43
+ const title = resolveOptionalTitle(parsed);
44
+ if (parsed.title !== undefined && title === undefined)
45
+ return undefined;
46
+ return {
47
+ content: resolvedContent,
48
+ markdown: resolvedContent,
49
+ title,
50
+ truncated: resolveTruncatedFlag(parsed),
51
+ };
52
+ }
8
53
  function deserializeMarkdownResult(cached) {
9
54
  return parseCachedMarkdownResult(cached);
10
55
  }
11
56
  function buildMarkdownTransform() {
12
- return (html, url) => {
13
- const result = transformHtmlToMarkdown(html, url, {
57
+ return async (html, url) => {
58
+ const result = await transformHtmlToMarkdown(html, url, {
14
59
  includeMetadata: true,
15
60
  });
16
61
  return { ...result, content: result.markdown };
@@ -23,9 +68,11 @@ function serializeMarkdownResult(result) {
23
68
  truncated: result.truncated,
24
69
  });
25
70
  }
26
- function buildStructuredContent(pipeline, inlineResult) {
71
+ function buildStructuredContent(pipeline, inlineResult, inputUrl) {
27
72
  return {
28
73
  url: pipeline.url,
74
+ resolvedUrl: pipeline.url,
75
+ inputUrl,
29
76
  title: pipeline.data.title,
30
77
  markdown: inlineResult.content,
31
78
  };
@@ -44,8 +91,8 @@ async function fetchPipeline(url) {
44
91
  deserialize: deserializeMarkdownResult,
45
92
  });
46
93
  }
47
- function buildResponse(pipeline, inlineResult) {
48
- const structuredContent = buildStructuredContent(pipeline, inlineResult);
94
+ function buildResponse(pipeline, inlineResult, inputUrl) {
95
+ const structuredContent = buildStructuredContent(pipeline, inlineResult, inputUrl);
49
96
  const content = buildFetchUrlContentBlocks(structuredContent, pipeline, inlineResult);
50
97
  return {
51
98
  content,
@@ -53,13 +100,10 @@ function buildResponse(pipeline, inlineResult) {
53
100
  };
54
101
  }
55
102
  export async function fetchUrlToolHandler(input) {
56
- try {
57
- return await executeFetch(input);
58
- }
59
- catch (error) {
103
+ return executeFetch(input).catch((error) => {
60
104
  logError('fetch-url tool error', error instanceof Error ? error : undefined);
61
105
  return handleToolError(error, input.url, 'Failed to fetch URL');
62
- }
106
+ });
63
107
  }
64
108
  async function executeFetch(input) {
65
109
  const { url } = input;
@@ -71,5 +115,5 @@ async function executeFetch(input) {
71
115
  if (inlineResult.error) {
72
116
  return createToolErrorResponse(inlineResult.error, url);
73
117
  }
74
- return buildResponse(pipeline, inlineResult);
118
+ return buildResponse(pipeline, inlineResult, url);
75
119
  }
@@ -1,2 +1,3 @@
1
1
  import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ export declare function withRequestContextIfMissing<TParams, TResult>(handler: (params: TParams) => Promise<TResult>): (params: TParams) => Promise<TResult>;
2
3
  export declare function registerTools(server: McpServer): void;
@@ -1,3 +1,5 @@
1
+ import { randomUUID } from 'node:crypto';
2
+ import { getRequestId, runWithRequestContext } from '../services/context.js';
1
3
  import { FETCH_URL_TOOL_DESCRIPTION, FETCH_URL_TOOL_NAME, fetchUrlToolHandler, } from './handlers/fetch-url.tool.js';
2
4
  import { fetchUrlInputSchema, fetchUrlOutputSchema } from './schemas.js';
3
5
  const TOOL_DEFINITION = {
@@ -14,6 +16,16 @@ const TOOL_DEFINITION = {
14
16
  openWorldHint: true,
15
17
  },
16
18
  };
19
+ export function withRequestContextIfMissing(handler) {
20
+ return async (params) => {
21
+ const existingRequestId = getRequestId();
22
+ if (existingRequestId) {
23
+ return handler(params);
24
+ }
25
+ const requestId = randomUUID();
26
+ return runWithRequestContext({ requestId, operationId: requestId }, () => handler(params));
27
+ };
28
+ }
17
29
  export function registerTools(server) {
18
30
  server.registerTool(TOOL_DEFINITION.name, {
19
31
  title: TOOL_DEFINITION.title,
@@ -21,5 +33,5 @@ export function registerTools(server) {
21
33
  inputSchema: TOOL_DEFINITION.inputSchema,
22
34
  outputSchema: TOOL_DEFINITION.outputSchema,
23
35
  annotations: TOOL_DEFINITION.annotations,
24
- }, TOOL_DEFINITION.handler);
36
+ }, withRequestContextIfMissing(TOOL_DEFINITION.handler));
25
37
  }
@@ -4,6 +4,8 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
4
4
  }, z.core.$strict>;
5
5
  export declare const fetchUrlOutputSchema: z.ZodObject<{
6
6
  url: z.ZodString;
7
+ inputUrl: z.ZodOptional<z.ZodString>;
8
+ resolvedUrl: z.ZodOptional<z.ZodString>;
7
9
  title: z.ZodOptional<z.ZodString>;
8
10
  markdown: z.ZodOptional<z.ZodString>;
9
11
  error: z.ZodOptional<z.ZodString>;
@@ -4,6 +4,14 @@ export const fetchUrlInputSchema = z.strictObject({
4
4
  });
5
5
  export const fetchUrlOutputSchema = z.strictObject({
6
6
  url: z.string().describe('The fetched URL'),
7
+ inputUrl: z
8
+ .string()
9
+ .optional()
10
+ .describe('The original URL provided by the caller'),
11
+ resolvedUrl: z
12
+ .string()
13
+ .optional()
14
+ .describe('The normalized or transformed URL that was fetched'),
7
15
  title: z.string().optional().describe('Page title'),
8
16
  markdown: z
9
17
  .string()
@@ -1,10 +1,25 @@
1
1
  const MIN_CONTENT_RATIO = 0.3;
2
2
  const MIN_HTML_LENGTH_FOR_GATE = 100;
3
+ function stripHtmlTags(html) {
4
+ const parts = [];
5
+ let inTag = false;
6
+ for (const char of html) {
7
+ if (char === '<') {
8
+ inTag = true;
9
+ continue;
10
+ }
11
+ if (char === '>') {
12
+ inTag = false;
13
+ continue;
14
+ }
15
+ if (!inTag) {
16
+ parts.push(char);
17
+ }
18
+ }
19
+ return parts.join('');
20
+ }
3
21
  function estimateTextLength(html) {
4
- return html
5
- .replace(/<[^>]*>/g, '')
6
- .replace(/\s+/g, ' ')
7
- .trim().length;
22
+ return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
8
23
  }
9
24
  export function isExtractionSufficient(article, originalHtml) {
10
25
  if (!article)
@@ -0,0 +1,5 @@
1
+ import type { ExtractedArticle, ExtractedMetadata, MarkdownTransformResult, MetadataBlock, TransformOptions } from '../../config/types/content.js';
2
+ export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
3
+ export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
4
+ export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
5
+ export declare function transformHtmlToMarkdownInProcess(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
@@ -0,0 +1,180 @@
1
+ import { extractContent } from '../../services/extractor.js';
2
+ import { logDebug } from '../../services/logger.js';
3
+ import { endTransformStage, startTransformStage, } from '../../services/telemetry.js';
4
+ import { throwIfAborted } from '../../utils/cancellation.js';
5
+ import { htmlToMarkdown } from '../../transformers/markdown.js';
6
+ import { tryTransformRawContent } from './raw-markdown.js';
7
+ const MIN_CONTENT_RATIO = 0.3;
8
+ const MIN_HTML_LENGTH_FOR_GATE = 100;
9
+ function stripHtmlTags(html) {
10
+ const parts = [];
11
+ let inTag = false;
12
+ for (const char of html) {
13
+ if (char === '<') {
14
+ inTag = true;
15
+ continue;
16
+ }
17
+ if (char === '>') {
18
+ inTag = false;
19
+ continue;
20
+ }
21
+ if (!inTag) {
22
+ parts.push(char);
23
+ }
24
+ }
25
+ return parts.join('');
26
+ }
27
+ function estimateTextLength(html) {
28
+ return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
29
+ }
30
+ export function isExtractionSufficient(article, originalHtml) {
31
+ if (!article)
32
+ return false;
33
+ const articleLength = article.textContent.length;
34
+ const originalLength = estimateTextLength(originalHtml);
35
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
36
+ return true;
37
+ return articleLength / originalLength >= MIN_CONTENT_RATIO;
38
+ }
39
+ export function determineContentExtractionSource(article) {
40
+ return !!article;
41
+ }
42
+ function applyArticleMetadata(metadata, article) {
43
+ if (article.title !== undefined)
44
+ metadata.title = article.title;
45
+ if (article.byline !== undefined)
46
+ metadata.author = article.byline;
47
+ }
48
+ function applyExtractedMetadata(metadata, extractedMeta) {
49
+ if (extractedMeta.title !== undefined)
50
+ metadata.title = extractedMeta.title;
51
+ if (extractedMeta.description !== undefined) {
52
+ metadata.description = extractedMeta.description;
53
+ }
54
+ if (extractedMeta.author !== undefined) {
55
+ metadata.author = extractedMeta.author;
56
+ }
57
+ }
58
+ export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
59
+ if (!includeMetadata)
60
+ return undefined;
61
+ const now = new Date().toISOString();
62
+ const metadata = {
63
+ type: 'metadata',
64
+ url,
65
+ fetchedAt: now,
66
+ };
67
+ if (shouldExtractFromArticle && article) {
68
+ applyArticleMetadata(metadata, article);
69
+ return metadata;
70
+ }
71
+ applyExtractedMetadata(metadata, extractedMeta);
72
+ return metadata;
73
+ }
74
+ function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
75
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
76
+ return {
77
+ sourceHtml: article.content,
78
+ title: article.title,
79
+ metadata,
80
+ };
81
+ }
82
+ function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
83
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
84
+ return {
85
+ sourceHtml: html,
86
+ title: extractedMeta.title,
87
+ metadata,
88
+ };
89
+ }
90
+ function logQualityGateFallback({ url, articleLength, }) {
91
+ logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
92
+ url: url.substring(0, 80),
93
+ articleLength,
94
+ });
95
+ }
96
+ function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
97
+ if (!article)
98
+ return null;
99
+ const shouldExtractFromArticle = determineContentExtractionSource(article);
100
+ if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
101
+ return buildArticleContentSource({
102
+ url,
103
+ article,
104
+ extractedMeta,
105
+ includeMetadata,
106
+ });
107
+ }
108
+ if (shouldExtractFromArticle) {
109
+ logQualityGateFallback({
110
+ url,
111
+ articleLength: article.textContent.length,
112
+ });
113
+ }
114
+ return null;
115
+ }
116
+ function resolveContentSource({ html, url, includeMetadata, signal, }) {
117
+ const { article, metadata: extractedMeta } = extractContent(html, url, {
118
+ extractArticle: true,
119
+ ...(signal ? { signal } : {}),
120
+ });
121
+ const extracted = tryBuildExtractedArticleContentSource({
122
+ html,
123
+ url,
124
+ article,
125
+ extractedMeta,
126
+ includeMetadata,
127
+ });
128
+ if (extracted)
129
+ return extracted;
130
+ return buildFullHtmlContentSource({
131
+ html,
132
+ url,
133
+ article,
134
+ extractedMeta,
135
+ includeMetadata,
136
+ });
137
+ }
138
+ export function transformHtmlToMarkdownInProcess(html, url, options) {
139
+ const totalStage = startTransformStage(url, 'transform:total');
140
+ let success = false;
141
+ try {
142
+ throwIfAborted(options.signal, url, 'transform:begin');
143
+ const rawStage = startTransformStage(url, 'transform:raw');
144
+ const raw = tryTransformRawContent({
145
+ html,
146
+ url,
147
+ includeMetadata: options.includeMetadata,
148
+ });
149
+ endTransformStage(rawStage);
150
+ if (raw) {
151
+ success = true;
152
+ return raw;
153
+ }
154
+ const extractStage = startTransformStage(url, 'transform:extract');
155
+ const context = resolveContentSource({
156
+ html,
157
+ url,
158
+ includeMetadata: options.includeMetadata,
159
+ ...(options.signal ? { signal: options.signal } : {}),
160
+ });
161
+ endTransformStage(extractStage);
162
+ const markdownStage = startTransformStage(url, 'transform:markdown');
163
+ const content = htmlToMarkdown(context.sourceHtml, context.metadata, {
164
+ url,
165
+ ...(options.signal ? { signal: options.signal } : {}),
166
+ });
167
+ endTransformStage(markdownStage);
168
+ success = true;
169
+ return {
170
+ markdown: content,
171
+ title: context.title,
172
+ truncated: false,
173
+ };
174
+ }
175
+ finally {
176
+ if (success) {
177
+ endTransformStage(totalStage, { truncated: false });
178
+ }
179
+ }
180
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1 @@
1
+ export {};
@@ -1,2 +1,3 @@
1
1
  import type { MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
2
- export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
2
+ export { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-transform-core.js';
3
+ export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
@@ -1,140 +1,41 @@
1
- import { extractContent } from '../../services/extractor.js';
2
- import { logDebug } from '../../services/logger.js';
3
- import { isRawTextContentUrl } from '../../utils/url-transformer.js';
4
- import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
5
- import { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-shaping.js';
6
- function buildArticleContentSource(url, article, extractedMeta, includeMetadata) {
7
- const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
8
- return {
9
- sourceHtml: article.content,
10
- title: article.title,
11
- metadata,
12
- };
13
- }
14
- function buildFullHtmlContentSource(html, url, article, extractedMeta, includeMetadata) {
15
- const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
16
- return {
17
- sourceHtml: html,
18
- title: extractedMeta.title,
19
- metadata,
20
- };
21
- }
22
- function logQualityGateFallback(url, article) {
23
- logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
24
- url: url.substring(0, 80),
25
- articleLength: article.textContent.length,
26
- });
27
- }
28
- function tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options) {
29
- if (!article)
30
- return null;
31
- const shouldExtractFromArticle = determineContentExtractionSource(article);
32
- if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
33
- return buildArticleContentSource(url, article, extractedMeta, options.includeMetadata);
34
- }
35
- if (shouldExtractFromArticle) {
36
- logQualityGateFallback(url, article);
37
- }
38
- return null;
39
- }
40
- function resolveContentSource(html, url, options) {
41
- const { article, metadata: extractedMeta } = extractContent(html, url, {
42
- extractArticle: true,
43
- });
44
- const extracted = tryBuildExtractedArticleContentSource(html, url, article, extractedMeta, options);
45
- if (extracted)
46
- return extracted;
47
- return buildFullHtmlContentSource(html, url, article, extractedMeta, options.includeMetadata);
48
- }
49
- function buildMarkdownPayload(context) {
50
- return htmlToMarkdown(context.sourceHtml, context.metadata);
51
- }
52
- function buildRawMarkdownPayload(rawContent, url, includeMetadata) {
53
- const title = extractTitleFromRawMarkdown(rawContent);
54
- const content = includeMetadata
55
- ? addSourceToMarkdown(rawContent, url)
56
- : rawContent;
57
- return { content, title };
58
- }
59
- function extractTitleFromRawMarkdown(content) {
60
- const frontmatterMatch = /^---\r?\n([\s\S]*?)\r?\n---/.exec(content);
61
- if (!frontmatterMatch)
62
- return undefined;
63
- const frontmatter = frontmatterMatch[1] ?? '';
64
- const titleMatch = /^(?:title|name):\s*["']?(.+?)["']?\s*$/im.exec(frontmatter);
65
- return titleMatch?.[1]?.trim();
66
- }
67
- function addSourceToMarkdown(content, url) {
68
- const frontmatterMatch = /^(---\r?\n)([\s\S]*?)(\r?\n---)/.exec(content);
69
- if (frontmatterMatch) {
70
- const start = frontmatterMatch[1] ?? '---\n';
71
- const existingFields = frontmatterMatch[2] ?? '';
72
- const end = frontmatterMatch[3] ?? '\n---';
73
- const rest = content.slice(frontmatterMatch[0].length);
74
- if (/^source:/im.test(existingFields)) {
75
- return content;
1
+ import { FetchError } from '../../errors/app-error.js';
2
+ import { endTransformStage, startTransformStage, } from '../../services/telemetry.js';
3
+ import { getOrCreateTransformWorkerPool } from '../../services/transform-worker-pool.js';
4
+ import { throwIfAborted } from '../../utils/cancellation.js';
5
+ import { transformHtmlToMarkdownInProcess } from './content-transform-core.js';
6
+ export { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-transform-core.js';
7
+ export async function transformHtmlToMarkdown(html, url, options) {
8
+ const totalStage = startTransformStage(url, 'transform:total');
9
+ let success = false;
10
+ try {
11
+ throwIfAborted(options.signal, url, 'transform:begin');
12
+ const workerStage = startTransformStage(url, 'transform:worker');
13
+ try {
14
+ const pool = getOrCreateTransformWorkerPool();
15
+ const result = await pool.transform(html, url, {
16
+ includeMetadata: options.includeMetadata,
17
+ ...(options.signal ? { signal: options.signal } : {}),
18
+ });
19
+ success = true;
20
+ return result;
21
+ }
22
+ catch (error) {
23
+ if (error instanceof FetchError) {
24
+ throw error;
25
+ }
26
+ // Stability-first: if worker infrastructure fails, fall back to in-process.
27
+ throwIfAborted(options.signal, url, 'transform:worker-fallback');
28
+ const fallback = transformHtmlToMarkdownInProcess(html, url, options);
29
+ success = true;
30
+ return fallback;
31
+ }
32
+ finally {
33
+ endTransformStage(workerStage);
76
34
  }
77
- return `${start}${existingFields}\nsource: "${url}"${end}${rest}`;
78
- }
79
- return `---\nsource: "${url}"\n---\n\n${content}`;
80
- }
81
- function looksLikeHtmlDocument(trimmed) {
82
- return (trimmed.startsWith('<!DOCTYPE') ||
83
- trimmed.startsWith('<!doctype') ||
84
- trimmed.startsWith('<html') ||
85
- trimmed.startsWith('<HTML'));
86
- }
87
- function hasFrontmatter(trimmed) {
88
- return /^---\r?\n/.test(trimmed);
89
- }
90
- function countCommonHtmlTags(content) {
91
- const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
92
- [];
93
- return matches.length;
94
- }
95
- function looksLikeMarkdown(content) {
96
- const hasMarkdownHeadings = /^#{1,6}\s+/m.test(content);
97
- const hasMarkdownLists = /^[\s]*[-*+]\s+/m.test(content);
98
- const hasMarkdownCodeBlocks = /```[\s\S]*?```/.test(content);
99
- return hasMarkdownHeadings || hasMarkdownLists || hasMarkdownCodeBlocks;
100
- }
101
- function isRawTextContent(content) {
102
- const trimmed = content.trim();
103
- if (looksLikeHtmlDocument(trimmed)) {
104
- return false;
105
- }
106
- if (hasFrontmatter(trimmed)) {
107
- return true;
108
- }
109
- if (countCommonHtmlTags(content) > 2) {
110
- return false;
111
- }
112
- if (looksLikeMarkdown(content)) {
113
- return true;
114
35
  }
115
- return false;
116
- }
117
- function tryTransformRawContent(html, url, options) {
118
- if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
119
- return null;
36
+ finally {
37
+ if (success) {
38
+ endTransformStage(totalStage, { truncated: false });
39
+ }
120
40
  }
121
- logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
122
- const { content, title } = buildRawMarkdownPayload(html, url, options.includeMetadata);
123
- return {
124
- markdown: content,
125
- title,
126
- truncated: false,
127
- };
128
- }
129
- export function transformHtmlToMarkdown(html, url, options) {
130
- const raw = tryTransformRawContent(html, url, options);
131
- if (raw)
132
- return raw;
133
- const context = resolveContentSource(html, url, options);
134
- const content = buildMarkdownPayload(context);
135
- return {
136
- markdown: content,
137
- title: context.title,
138
- truncated: false,
139
- };
140
41
  }