@j0hanz/superfetch 1.2.5 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/README.md +131 -156
  2. package/dist/config/auth-config.d.ts +16 -0
  3. package/dist/config/auth-config.js +53 -0
  4. package/dist/config/constants.d.ts +11 -13
  5. package/dist/config/constants.js +1 -3
  6. package/dist/config/env-parsers.d.ts +7 -0
  7. package/dist/config/env-parsers.js +84 -0
  8. package/dist/config/formatting.d.ts +2 -2
  9. package/dist/config/index.d.ts +47 -53
  10. package/dist/config/index.js +35 -64
  11. package/dist/config/types/content.d.ts +1 -49
  12. package/dist/config/types/runtime.d.ts +8 -16
  13. package/dist/config/types/tools.d.ts +2 -28
  14. package/dist/http/accept-policy.d.ts +3 -0
  15. package/dist/http/accept-policy.js +45 -0
  16. package/dist/http/async-handler.d.ts +2 -0
  17. package/dist/http/async-handler.js +5 -0
  18. package/dist/http/auth-introspection.d.ts +2 -0
  19. package/dist/http/auth-introspection.js +141 -0
  20. package/dist/http/auth-static.d.ts +2 -0
  21. package/dist/http/auth-static.js +23 -0
  22. package/dist/http/auth.d.ts +3 -2
  23. package/dist/http/auth.js +254 -23
  24. package/dist/http/cors.d.ts +6 -6
  25. package/dist/http/cors.js +7 -42
  26. package/dist/http/download-routes.d.ts +0 -12
  27. package/dist/http/download-routes.js +21 -58
  28. package/dist/http/host-allowlist.d.ts +3 -0
  29. package/dist/http/host-allowlist.js +117 -0
  30. package/dist/http/jsonrpc-http.d.ts +2 -0
  31. package/dist/http/jsonrpc-http.js +10 -0
  32. package/dist/http/mcp-routes.d.ts +8 -3
  33. package/dist/http/mcp-routes.js +137 -31
  34. package/dist/http/mcp-session-eviction.d.ts +3 -0
  35. package/dist/http/mcp-session-eviction.js +24 -0
  36. package/dist/http/mcp-session-helpers.d.ts +0 -1
  37. package/dist/http/mcp-session-helpers.js +1 -1
  38. package/dist/http/mcp-session-init.d.ts +7 -0
  39. package/dist/http/mcp-session-init.js +94 -0
  40. package/dist/http/mcp-session-slots.d.ts +17 -0
  41. package/dist/http/mcp-session-slots.js +55 -0
  42. package/dist/http/mcp-session-transport-init.d.ts +7 -0
  43. package/dist/http/mcp-session-transport-init.js +41 -0
  44. package/dist/http/mcp-session-transport.d.ts +7 -0
  45. package/dist/http/mcp-session-transport.js +57 -0
  46. package/dist/http/mcp-session-types.d.ts +5 -0
  47. package/dist/http/mcp-session-types.js +1 -0
  48. package/dist/http/mcp-session.d.ts +9 -9
  49. package/dist/http/mcp-session.js +15 -137
  50. package/dist/http/mcp-sessions.d.ts +43 -0
  51. package/dist/http/mcp-sessions.js +392 -0
  52. package/dist/http/mcp-validation.d.ts +1 -0
  53. package/dist/http/mcp-validation.js +11 -10
  54. package/dist/http/protocol-policy.d.ts +2 -0
  55. package/dist/http/protocol-policy.js +31 -0
  56. package/dist/http/rate-limit.js +7 -4
  57. package/dist/http/server-config.d.ts +1 -0
  58. package/dist/http/server-config.js +40 -0
  59. package/dist/http/server-middleware.d.ts +7 -9
  60. package/dist/http/server-middleware.js +9 -70
  61. package/dist/http/server-shutdown.d.ts +4 -0
  62. package/dist/http/server-shutdown.js +43 -0
  63. package/dist/http/server.d.ts +10 -0
  64. package/dist/http/server.js +546 -61
  65. package/dist/http/session-cleanup.js +8 -5
  66. package/dist/middleware/error-handler.d.ts +1 -1
  67. package/dist/middleware/error-handler.js +32 -33
  68. package/dist/resources/cached-content-params.d.ts +5 -0
  69. package/dist/resources/cached-content-params.js +36 -0
  70. package/dist/resources/cached-content.js +67 -125
  71. package/dist/resources/index.js +0 -82
  72. package/dist/server.js +50 -29
  73. package/dist/services/cache-events.d.ts +8 -0
  74. package/dist/services/cache-events.js +19 -0
  75. package/dist/services/cache-keys.d.ts +7 -0
  76. package/dist/services/cache-keys.js +57 -0
  77. package/dist/services/cache.d.ts +4 -9
  78. package/dist/services/cache.js +77 -139
  79. package/dist/services/context.d.ts +0 -1
  80. package/dist/services/context.js +0 -7
  81. package/dist/services/extractor.js +55 -116
  82. package/dist/services/fetcher/agents.d.ts +2 -2
  83. package/dist/services/fetcher/agents.js +35 -96
  84. package/dist/services/fetcher/dns-selection.d.ts +2 -0
  85. package/dist/services/fetcher/dns-selection.js +72 -0
  86. package/dist/services/fetcher/interceptors.d.ts +0 -22
  87. package/dist/services/fetcher/interceptors.js +18 -32
  88. package/dist/services/fetcher/redirects.js +16 -7
  89. package/dist/services/fetcher/response.js +79 -34
  90. package/dist/services/fetcher.d.ts +22 -3
  91. package/dist/services/fetcher.js +544 -44
  92. package/dist/services/fifo-queue.d.ts +8 -0
  93. package/dist/services/fifo-queue.js +25 -0
  94. package/dist/services/logger.js +2 -2
  95. package/dist/services/metadata-collector.d.ts +1 -9
  96. package/dist/services/metadata-collector.js +71 -2
  97. package/dist/services/transform-worker-pool.d.ts +4 -14
  98. package/dist/services/transform-worker-pool.js +177 -129
  99. package/dist/services/transform-worker-types.d.ts +32 -0
  100. package/dist/services/transform-worker-types.js +14 -0
  101. package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
  102. package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
  103. package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
  104. package/dist/tools/handlers/fetch-single.shared.js +175 -89
  105. package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
  106. package/dist/tools/handlers/fetch-url.tool.js +84 -119
  107. package/dist/tools/index.js +21 -40
  108. package/dist/tools/schemas.d.ts +1 -51
  109. package/dist/tools/schemas.js +1 -107
  110. package/dist/tools/utils/cached-markdown.d.ts +5 -0
  111. package/dist/tools/utils/cached-markdown.js +46 -0
  112. package/dist/tools/utils/content-shaping.d.ts +4 -0
  113. package/dist/tools/utils/content-shaping.js +67 -0
  114. package/dist/tools/utils/content-transform.d.ts +5 -17
  115. package/dist/tools/utils/content-transform.js +134 -114
  116. package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
  117. package/dist/tools/utils/fetch-pipeline.js +57 -63
  118. package/dist/tools/utils/frontmatter.d.ts +3 -0
  119. package/dist/tools/utils/frontmatter.js +73 -0
  120. package/dist/tools/utils/inline-content.d.ts +1 -2
  121. package/dist/tools/utils/inline-content.js +4 -7
  122. package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
  123. package/dist/tools/utils/markdown-heuristics.js +19 -0
  124. package/dist/tools/utils/markdown-signals.d.ts +1 -0
  125. package/dist/tools/utils/markdown-signals.js +19 -0
  126. package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
  127. package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
  128. package/dist/tools/utils/raw-markdown.d.ts +6 -0
  129. package/dist/tools/utils/raw-markdown.js +135 -0
  130. package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
  131. package/dist/transformers/markdown/fenced-code-rule.js +38 -0
  132. package/dist/transformers/markdown/frontmatter.d.ts +2 -0
  133. package/dist/transformers/markdown/frontmatter.js +45 -0
  134. package/dist/transformers/markdown/noise-rule.d.ts +2 -0
  135. package/dist/transformers/markdown/noise-rule.js +80 -0
  136. package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
  137. package/dist/transformers/markdown/turndown-instance.js +19 -0
  138. package/dist/transformers/markdown.d.ts +2 -0
  139. package/dist/transformers/markdown.js +185 -0
  140. package/dist/transformers/markdown.transformer.js +5 -117
  141. package/dist/utils/cached-payload.d.ts +7 -0
  142. package/dist/utils/cached-payload.js +36 -0
  143. package/dist/utils/code-language-bash.d.ts +1 -0
  144. package/dist/utils/code-language-bash.js +48 -0
  145. package/dist/utils/code-language-core.d.ts +2 -0
  146. package/dist/utils/code-language-core.js +13 -0
  147. package/dist/utils/code-language-detectors.d.ts +5 -0
  148. package/dist/utils/code-language-detectors.js +142 -0
  149. package/dist/utils/code-language-helpers.d.ts +5 -0
  150. package/dist/utils/code-language-helpers.js +62 -0
  151. package/dist/utils/code-language-parsing.d.ts +5 -0
  152. package/dist/utils/code-language-parsing.js +62 -0
  153. package/dist/utils/code-language.d.ts +9 -0
  154. package/dist/utils/code-language.js +250 -46
  155. package/dist/utils/error-details.d.ts +3 -0
  156. package/dist/utils/error-details.js +12 -0
  157. package/dist/utils/error-utils.js +1 -1
  158. package/dist/utils/filename-generator.js +34 -12
  159. package/dist/utils/guards.d.ts +1 -0
  160. package/dist/utils/guards.js +3 -0
  161. package/dist/utils/header-normalizer.d.ts +0 -3
  162. package/dist/utils/header-normalizer.js +3 -3
  163. package/dist/utils/ip-address.d.ts +4 -0
  164. package/dist/utils/ip-address.js +6 -0
  165. package/dist/utils/tool-error-handler.d.ts +2 -2
  166. package/dist/utils/tool-error-handler.js +14 -46
  167. package/dist/utils/url-transformer.d.ts +7 -0
  168. package/dist/utils/url-transformer.js +147 -0
  169. package/dist/utils/url-validator.d.ts +1 -2
  170. package/dist/utils/url-validator.js +53 -114
  171. package/dist/workers/content-transform.worker.d.ts +1 -0
  172. package/dist/workers/content-transform.worker.js +40 -0
  173. package/package.json +17 -18
@@ -1,60 +1,10 @@
1
1
  import { z } from 'zod';
2
2
  export declare const fetchUrlInputSchema: z.ZodObject<{
3
- format: z.ZodDefault<z.ZodEnum<{
4
- jsonl: "jsonl";
5
- markdown: "markdown";
6
- }>>;
7
- includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
8
- extractMainContent: z.ZodDefault<z.ZodBoolean>;
9
- includeMetadata: z.ZodDefault<z.ZodBoolean>;
10
- maxContentLength: z.ZodOptional<z.ZodNumber>;
11
3
  url: z.ZodURL;
12
- customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
13
- timeout: z.ZodDefault<z.ZodNumber>;
14
- retries: z.ZodDefault<z.ZodNumber>;
15
- }, z.core.$strict>;
16
- export declare const fetchMarkdownInputSchema: z.ZodObject<{
17
- extractMainContent: z.ZodDefault<z.ZodBoolean>;
18
- includeMetadata: z.ZodDefault<z.ZodBoolean>;
19
- maxContentLength: z.ZodOptional<z.ZodNumber>;
20
- url: z.ZodURL;
21
- customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
22
- timeout: z.ZodDefault<z.ZodNumber>;
23
- retries: z.ZodDefault<z.ZodNumber>;
24
4
  }, z.core.$strict>;
25
5
  export declare const fetchUrlOutputSchema: z.ZodObject<{
26
- contentSize: z.ZodOptional<z.ZodNumber>;
27
- resourceUri: z.ZodOptional<z.ZodString>;
28
- resourceMimeType: z.ZodOptional<z.ZodString>;
29
- cached: z.ZodBoolean;
30
- truncated: z.ZodOptional<z.ZodBoolean>;
31
- error: z.ZodOptional<z.ZodString>;
32
- errorCode: z.ZodOptional<z.ZodString>;
33
- url: z.ZodString;
34
- title: z.ZodOptional<z.ZodString>;
35
- contentBlocks: z.ZodNumber;
36
- fetchedAt: z.ZodString;
37
- format: z.ZodEnum<{
38
- jsonl: "jsonl";
39
- markdown: "markdown";
40
- }>;
41
- content: z.ZodOptional<z.ZodString>;
42
- }, z.core.$strict>;
43
- export declare const fetchMarkdownOutputSchema: z.ZodObject<{
44
- contentSize: z.ZodOptional<z.ZodNumber>;
45
- resourceUri: z.ZodOptional<z.ZodString>;
46
- resourceMimeType: z.ZodOptional<z.ZodString>;
47
- cached: z.ZodBoolean;
48
- truncated: z.ZodOptional<z.ZodBoolean>;
49
- error: z.ZodOptional<z.ZodString>;
50
- errorCode: z.ZodOptional<z.ZodString>;
51
6
  url: z.ZodString;
52
7
  title: z.ZodOptional<z.ZodString>;
53
- fetchedAt: z.ZodString;
54
8
  markdown: z.ZodOptional<z.ZodString>;
55
- file: z.ZodOptional<z.ZodObject<{
56
- downloadUrl: z.ZodString;
57
- fileName: z.ZodString;
58
- expiresAt: z.ZodString;
59
- }, z.core.$strip>>;
9
+ error: z.ZodOptional<z.ZodString>;
60
10
  }, z.core.$strict>;
@@ -1,119 +1,13 @@
1
1
  import { z } from 'zod';
2
- import { config } from '../config/index.js';
3
- const MAX_HEADER_NAME_LENGTH = 128;
4
- const MAX_HEADER_VALUE_LENGTH = 2048;
5
- const MAX_HEADER_COUNT = 50;
6
- const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
7
- const customHeadersSchema = z
8
- .record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
9
- .refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
10
- error: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
11
- });
12
- const requestOptionsSchema = z.object({
13
- customHeaders: customHeadersSchema
14
- .optional()
15
- .describe('Custom HTTP headers for the request'),
16
- timeout: z
17
- .number()
18
- .min(1000)
19
- .max(120000)
20
- .default(config.fetcher.timeout)
21
- .describe('Request timeout in milliseconds (1000-120000)'),
22
- retries: z
23
- .number()
24
- .min(1)
25
- .max(10)
26
- .default(3)
27
- .describe('Number of retry attempts (1-10)'),
28
- });
29
- const extractionOptionsSchema = z.object({
30
- extractMainContent: z
31
- .boolean()
32
- .default(true)
33
- .describe('Use Readability to extract main article content'),
34
- includeMetadata: z
35
- .boolean()
36
- .default(true)
37
- .describe('Include page metadata (title, description, etc.)'),
38
- maxContentLength: z
39
- .number()
40
- .positive()
41
- .max(MAX_CONTENT_LENGTH)
42
- .optional()
43
- .describe('Maximum content length in characters'),
44
- });
45
- const formatOptionsSchema = z.object({
46
- format: z
47
- .enum(['jsonl', 'markdown'])
48
- .default('jsonl')
49
- .describe('Output format'),
50
- includeContentBlocks: z
51
- .boolean()
52
- .optional()
53
- .describe('Include content block counts when format=markdown'),
54
- });
55
- const resourceFieldsSchema = z.object({
56
- contentSize: z.number().optional().describe('Content length in characters'),
57
- resourceUri: z
58
- .string()
59
- .optional()
60
- .describe('Resource URI when content is too large to inline'),
61
- resourceMimeType: z
62
- .string()
63
- .optional()
64
- .describe('MIME type for the resource URI'),
65
- cached: z.boolean().describe('Whether the result was served from cache'),
66
- truncated: z
67
- .boolean()
68
- .optional()
69
- .describe('Whether content was truncated by maxContentLength'),
70
- error: z.string().optional().describe('Error message if the request failed'),
71
- errorCode: z.string().optional().describe('Error code if the request failed'),
72
- });
73
- const fileDownloadSchema = z.object({
74
- downloadUrl: z.string().describe('Relative URL to download the .md file'),
75
- fileName: z.string().describe('Suggested filename for download'),
76
- expiresAt: z.string().describe('ISO timestamp when download expires'),
77
- });
78
2
  export const fetchUrlInputSchema = z.strictObject({
79
- ...requestOptionsSchema.shape,
80
- url: z.url({ protocol: /^https?$/i }).describe('The URL to fetch'),
81
- ...extractionOptionsSchema.shape,
82
- ...formatOptionsSchema.shape,
83
- });
84
- export const fetchMarkdownInputSchema = z.strictObject({
85
- ...requestOptionsSchema.shape,
86
3
  url: z.url({ protocol: /^https?$/i }).describe('The URL to fetch'),
87
- ...extractionOptionsSchema.shape,
88
4
  });
89
5
  export const fetchUrlOutputSchema = z.strictObject({
90
6
  url: z.string().describe('The fetched URL'),
91
7
  title: z.string().optional().describe('Page title'),
92
- contentBlocks: z
93
- .number()
94
- .describe('Number of content blocks extracted (JSONL only)'),
95
- fetchedAt: z
96
- .string()
97
- .describe('ISO timestamp of when the content was fetched'),
98
- format: z.enum(['jsonl', 'markdown']).describe('Output format used'),
99
- content: z
100
- .string()
101
- .optional()
102
- .describe('The extracted content in JSONL or Markdown format'),
103
- ...resourceFieldsSchema.shape,
104
- });
105
- export const fetchMarkdownOutputSchema = z.strictObject({
106
- url: z.string().describe('The fetched URL'),
107
- title: z.string().optional().describe('Page title'),
108
- fetchedAt: z
109
- .string()
110
- .describe('ISO timestamp of when the content was fetched'),
111
8
  markdown: z
112
9
  .string()
113
10
  .optional()
114
11
  .describe('The extracted content in Markdown format'),
115
- file: fileDownloadSchema
116
- .optional()
117
- .describe('Download information when content is cached'),
118
- ...resourceFieldsSchema.shape,
12
+ error: z.string().optional().describe('Error message if the request failed'),
119
13
  });
@@ -0,0 +1,5 @@
1
+ import type { MarkdownTransformResult } from '../../config/types/content.js';
2
+ export type CachedMarkdownResult = MarkdownTransformResult & {
3
+ readonly content: string;
4
+ };
5
+ export declare function parseCachedMarkdownResult(cached: string): CachedMarkdownResult | undefined;
@@ -0,0 +1,46 @@
1
+ import { isRecord } from '../../utils/guards.js';
2
+ function parseJsonRecord(input) {
3
+ try {
4
+ const parsed = JSON.parse(input);
5
+ return isRecord(parsed) ? parsed : undefined;
6
+ }
7
+ catch {
8
+ return undefined;
9
+ }
10
+ }
11
+ function resolveMarkdownContent(parsed) {
12
+ const { markdown } = parsed;
13
+ if (typeof markdown === 'string')
14
+ return markdown;
15
+ const { content } = parsed;
16
+ if (typeof content === 'string')
17
+ return content;
18
+ return undefined;
19
+ }
20
+ function resolveOptionalTitle(parsed) {
21
+ const { title } = parsed;
22
+ if (title === undefined)
23
+ return undefined;
24
+ return typeof title === 'string' ? title : undefined;
25
+ }
26
+ function resolveTruncatedFlag(parsed) {
27
+ const { truncated } = parsed;
28
+ return typeof truncated === 'boolean' ? truncated : false;
29
+ }
30
+ export function parseCachedMarkdownResult(cached) {
31
+ const parsed = parseJsonRecord(cached);
32
+ if (!parsed)
33
+ return undefined;
34
+ const resolvedContent = resolveMarkdownContent(parsed);
35
+ if (resolvedContent === undefined)
36
+ return undefined;
37
+ const title = resolveOptionalTitle(parsed);
38
+ if (parsed.title !== undefined && title === undefined)
39
+ return undefined;
40
+ return {
41
+ content: resolvedContent,
42
+ markdown: resolvedContent,
43
+ title,
44
+ truncated: resolveTruncatedFlag(parsed),
45
+ };
46
+ }
@@ -0,0 +1,4 @@
1
+ import type { ExtractedArticle, ExtractedMetadata, MetadataBlock } from '../../config/types/content.js';
2
+ export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
3
+ export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
4
+ export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
@@ -0,0 +1,67 @@
1
+ const MIN_CONTENT_RATIO = 0.3;
2
+ const MIN_HTML_LENGTH_FOR_GATE = 100;
3
+ function stripHtmlTags(html) {
4
+ const parts = [];
5
+ let inTag = false;
6
+ for (const char of html) {
7
+ if (char === '<') {
8
+ inTag = true;
9
+ continue;
10
+ }
11
+ if (char === '>') {
12
+ inTag = false;
13
+ continue;
14
+ }
15
+ if (!inTag) {
16
+ parts.push(char);
17
+ }
18
+ }
19
+ return parts.join('');
20
+ }
21
+ function estimateTextLength(html) {
22
+ return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
23
+ }
24
+ export function isExtractionSufficient(article, originalHtml) {
25
+ if (!article)
26
+ return false;
27
+ const articleLength = article.textContent.length;
28
+ const originalLength = estimateTextLength(originalHtml);
29
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
30
+ return true;
31
+ return articleLength / originalLength >= MIN_CONTENT_RATIO;
32
+ }
33
+ export function determineContentExtractionSource(article) {
34
+ return !!article;
35
+ }
36
+ function applyArticleMetadata(metadata, article) {
37
+ if (article.title !== undefined)
38
+ metadata.title = article.title;
39
+ if (article.byline !== undefined)
40
+ metadata.author = article.byline;
41
+ }
42
+ function applyExtractedMetadata(metadata, extractedMeta) {
43
+ if (extractedMeta.title !== undefined)
44
+ metadata.title = extractedMeta.title;
45
+ if (extractedMeta.description !== undefined) {
46
+ metadata.description = extractedMeta.description;
47
+ }
48
+ if (extractedMeta.author !== undefined) {
49
+ metadata.author = extractedMeta.author;
50
+ }
51
+ }
52
+ export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
53
+ if (!includeMetadata)
54
+ return undefined;
55
+ const now = new Date().toISOString();
56
+ const metadata = {
57
+ type: 'metadata',
58
+ url,
59
+ fetchedAt: now,
60
+ };
61
+ if (shouldExtractFromArticle && article) {
62
+ applyArticleMetadata(metadata, article);
63
+ return metadata;
64
+ }
65
+ applyExtractedMetadata(metadata, extractedMeta);
66
+ return metadata;
67
+ }
@@ -1,17 +1,5 @@
1
- import type { JsonlTransformResult, MarkdownTransformResult } from '../../config/types/content.js';
2
- interface ExtractionOptions {
3
- readonly extractMainContent: boolean;
4
- readonly includeMetadata: boolean;
5
- }
6
- interface ContentLengthOptions {
7
- readonly maxContentLength?: number;
8
- }
9
- interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
10
- }
11
- interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
12
- readonly includeContentBlocks?: boolean;
13
- }
14
- export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
15
- export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
16
- export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
17
- export {};
1
+ import type { ExtractedArticle, ExtractedMetadata, MarkdownTransformResult, MetadataBlock, TransformOptions } from '../../config/types/content.js';
2
+ export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
3
+ export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
4
+ export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
5
+ export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
@@ -1,134 +1,154 @@
1
- import { TRUNCATION_MARKER } from '../../config/formatting.js';
2
1
  import { extractContent } from '../../services/extractor.js';
3
- import { parseHtml, parseHtmlWithMetadata } from '../../services/parser.js';
4
- import { sanitizeText } from '../../utils/sanitizer.js';
5
- import { toJsonl } from '../../transformers/jsonl.transformer.js';
6
- import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
7
- import { createContentMetadataBlock, determineContentExtractionSource, truncateContent, } from './common.js';
8
- const TITLE_PATTERN = /<title[^>]*>([\s\S]*?)<\/title>/i;
9
- function resolveContentSource(html, url, options) {
10
- if (!options.extractMainContent && !options.includeMetadata) {
11
- return {
12
- sourceHtml: html,
13
- title: extractTitleFromHtml(html),
14
- metadata: undefined,
15
- };
2
+ import { logDebug } from '../../services/logger.js';
3
+ import { htmlToMarkdown } from '../../transformers/markdown.js';
4
+ import { tryTransformRawContent } from './raw-markdown.js';
5
+ const MIN_CONTENT_RATIO = 0.3;
6
+ const MIN_HTML_LENGTH_FOR_GATE = 100;
7
+ function stripHtmlTags(html) {
8
+ const parts = [];
9
+ let inTag = false;
10
+ for (const char of html) {
11
+ if (char === '<') {
12
+ inTag = true;
13
+ continue;
14
+ }
15
+ if (char === '>') {
16
+ inTag = false;
17
+ continue;
18
+ }
19
+ if (!inTag) {
20
+ parts.push(char);
21
+ }
16
22
  }
17
- const { article, metadata: extractedMeta } = extractContent(html, url, {
18
- extractArticle: options.extractMainContent,
19
- });
20
- const shouldExtractFromArticle = determineContentExtractionSource(options.extractMainContent, article);
21
- const sourceHtml = shouldExtractFromArticle ? article.content : html;
22
- const metadata = createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, options.includeMetadata);
23
- const title = shouldExtractFromArticle ? article.title : extractedMeta.title;
24
- return { sourceHtml, title, metadata };
23
+ return parts.join('');
25
24
  }
26
- function extractTitleFromHtml(html) {
27
- const match = TITLE_PATTERN.exec(html);
28
- if (!match?.[1])
29
- return undefined;
30
- const decoded = decodeHtmlEntities(match[1]);
31
- const text = sanitizeText(decoded);
32
- return text || undefined;
25
+ function estimateTextLength(html) {
26
+ return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
33
27
  }
34
- function decodeHtmlEntities(value) {
35
- if (!value.includes('&'))
36
- return value;
37
- const basicDecoded = value
38
- .replace(/&amp;/g, '&')
39
- .replace(/&lt;/g, '<')
40
- .replace(/&gt;/g, '>')
41
- .replace(/&quot;/g, '"')
42
- .replace(/&#39;/g, "'");
43
- return basicDecoded
44
- .replace(/&#(\d+);/g, (match, code) => {
45
- const parsed = Number.parseInt(code, 10);
46
- return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
47
- ? String.fromCodePoint(parsed)
48
- : match;
49
- })
50
- .replace(/&#x([0-9a-fA-F]+);/g, (match, code) => {
51
- const parsed = Number.parseInt(code, 16);
52
- return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
53
- ? String.fromCodePoint(parsed)
54
- : match;
55
- });
28
+ export function isExtractionSufficient(article, originalHtml) {
29
+ if (!article)
30
+ return false;
31
+ const articleLength = article.textContent.length;
32
+ const originalLength = estimateTextLength(originalHtml);
33
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
34
+ return true;
35
+ return articleLength / originalLength >= MIN_CONTENT_RATIO;
56
36
  }
57
- function buildJsonlPayload(context, maxContentLength) {
58
- const contentBlocks = parseHtml(context.sourceHtml);
59
- return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
37
+ export function determineContentExtractionSource(article) {
38
+ return !!article;
60
39
  }
61
- function buildJsonlPayloadFromBlocks(contentBlocks, metadata, maxContentLength) {
62
- const { content, truncated } = truncateContent(toJsonl(contentBlocks, metadata), maxContentLength);
63
- return {
64
- content,
65
- contentBlocks: contentBlocks.length,
66
- truncated,
67
- };
40
+ function applyArticleMetadata(metadata, article) {
41
+ if (article.title !== undefined)
42
+ metadata.title = article.title;
43
+ if (article.byline !== undefined)
44
+ metadata.author = article.byline;
68
45
  }
69
- function buildMarkdownPayload(context, maxContentLength) {
70
- const markdown = htmlToMarkdown(context.sourceHtml, context.metadata);
71
- const { content, truncated } = truncateContent(markdown, maxContentLength, TRUNCATION_MARKER);
72
- return { content, truncated };
46
+ function applyExtractedMetadata(metadata, extractedMeta) {
47
+ if (extractedMeta.title !== undefined)
48
+ metadata.title = extractedMeta.title;
49
+ if (extractedMeta.description !== undefined) {
50
+ metadata.description = extractedMeta.description;
51
+ }
52
+ if (extractedMeta.author !== undefined) {
53
+ metadata.author = extractedMeta.author;
54
+ }
73
55
  }
74
- export function transformHtmlToJsonl(html, url, options) {
75
- if (!options.extractMainContent && options.includeMetadata) {
76
- const parsed = parseHtmlWithMetadata(html);
77
- const metadataBlock = createContentMetadataBlock(url, null, parsed.metadata, false, true);
78
- const { content, contentBlocks, truncated } = buildJsonlPayloadFromBlocks(parsed.blocks, metadataBlock, options.maxContentLength);
79
- return {
80
- content,
81
- contentBlocks,
82
- title: parsed.metadata.title,
83
- ...(truncated && { truncated }),
84
- };
56
+ export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
57
+ if (!includeMetadata)
58
+ return undefined;
59
+ const now = new Date().toISOString();
60
+ const metadata = {
61
+ type: 'metadata',
62
+ url,
63
+ fetchedAt: now,
64
+ };
65
+ if (shouldExtractFromArticle && article) {
66
+ applyArticleMetadata(metadata, article);
67
+ return metadata;
85
68
  }
86
- const context = resolveContentSource(html, url, options);
87
- const { content, contentBlocks, truncated } = buildJsonlPayload(context, options.maxContentLength);
69
+ applyExtractedMetadata(metadata, extractedMeta);
70
+ return metadata;
71
+ }
72
+ function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
73
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
88
74
  return {
89
- content,
90
- contentBlocks,
91
- title: context.title,
92
- ...(truncated && { truncated }),
75
+ sourceHtml: article.content,
76
+ title: article.title,
77
+ metadata,
93
78
  };
94
79
  }
95
- export function transformHtmlToMarkdown(html, url, options) {
96
- const context = resolveContentSource(html, url, options);
97
- const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
80
+ function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
81
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
98
82
  return {
99
- markdown: content,
100
- title: context.title,
101
- truncated,
83
+ sourceHtml: html,
84
+ title: extractedMeta.title,
85
+ metadata,
102
86
  };
103
87
  }
104
- export function transformHtmlToMarkdownWithBlocks(html, url, options) {
105
- const includeContentBlocks = options.includeContentBlocks ?? true;
106
- if (includeContentBlocks &&
107
- !options.extractMainContent &&
108
- options.includeMetadata) {
109
- const parsed = parseHtmlWithMetadata(html);
110
- const context = {
111
- sourceHtml: html,
112
- title: parsed.metadata.title,
113
- metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
114
- };
115
- const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
116
- return {
117
- content,
118
- contentBlocks: parsed.blocks.length,
119
- title: context.title,
120
- ...(truncated && { truncated }),
121
- };
88
+ function logQualityGateFallback({ url, articleLength, }) {
89
+ logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
90
+ url: url.substring(0, 80),
91
+ articleLength,
92
+ });
93
+ }
94
+ function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
95
+ if (!article)
96
+ return null;
97
+ const shouldExtractFromArticle = determineContentExtractionSource(article);
98
+ if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
99
+ return buildArticleContentSource({
100
+ url,
101
+ article,
102
+ extractedMeta,
103
+ includeMetadata,
104
+ });
105
+ }
106
+ if (shouldExtractFromArticle) {
107
+ logQualityGateFallback({
108
+ url,
109
+ articleLength: article.textContent.length,
110
+ });
122
111
  }
123
- const context = resolveContentSource(html, url, options);
124
- const contentBlocks = includeContentBlocks
125
- ? parseHtml(context.sourceHtml)
126
- : [];
127
- const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
112
+ return null;
113
+ }
114
+ function resolveContentSource({ html, url, includeMetadata, }) {
115
+ const { article, metadata: extractedMeta } = extractContent(html, url, {
116
+ extractArticle: true,
117
+ });
118
+ const extracted = tryBuildExtractedArticleContentSource({
119
+ html,
120
+ url,
121
+ article,
122
+ extractedMeta,
123
+ includeMetadata,
124
+ });
125
+ if (extracted)
126
+ return extracted;
127
+ return buildFullHtmlContentSource({
128
+ html,
129
+ url,
130
+ article,
131
+ extractedMeta,
132
+ includeMetadata,
133
+ });
134
+ }
135
+ export function transformHtmlToMarkdown(html, url, options) {
136
+ const raw = tryTransformRawContent({
137
+ html,
138
+ url,
139
+ includeMetadata: options.includeMetadata,
140
+ });
141
+ if (raw)
142
+ return raw;
143
+ const context = resolveContentSource({
144
+ html,
145
+ url,
146
+ includeMetadata: options.includeMetadata,
147
+ });
148
+ const content = htmlToMarkdown(context.sourceHtml, context.metadata);
128
149
  return {
129
- content,
130
- contentBlocks: contentBlocks.length,
150
+ markdown: content,
131
151
  title: context.title,
132
- ...(truncated && { truncated }),
152
+ truncated: false,
133
153
  };
134
154
  }
@@ -1,10 +1,2 @@
1
1
  import type { FetchPipelineOptions, PipelineResult } from '../../config/types/runtime.js';
2
- /**
3
- * Unified fetch pipeline that handles caching, fetching, and transformation.
4
- * Implements cache-first strategy with automatic serialization.
5
- *
6
- * @template T - Type of the transformed result
7
- * @param options - Pipeline configuration options
8
- * @returns Promise resolving to the pipeline result
9
- */
10
2
  export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;