@j0hanz/superfetch 1.2.5 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +131 -156
- package/dist/config/auth-config.d.ts +16 -0
- package/dist/config/auth-config.js +53 -0
- package/dist/config/constants.d.ts +11 -13
- package/dist/config/constants.js +1 -3
- package/dist/config/env-parsers.d.ts +7 -0
- package/dist/config/env-parsers.js +84 -0
- package/dist/config/formatting.d.ts +2 -2
- package/dist/config/index.d.ts +47 -53
- package/dist/config/index.js +35 -64
- package/dist/config/types/content.d.ts +1 -49
- package/dist/config/types/runtime.d.ts +8 -16
- package/dist/config/types/tools.d.ts +2 -28
- package/dist/http/accept-policy.d.ts +3 -0
- package/dist/http/accept-policy.js +45 -0
- package/dist/http/async-handler.d.ts +2 -0
- package/dist/http/async-handler.js +5 -0
- package/dist/http/auth-introspection.d.ts +2 -0
- package/dist/http/auth-introspection.js +141 -0
- package/dist/http/auth-static.d.ts +2 -0
- package/dist/http/auth-static.js +23 -0
- package/dist/http/auth.d.ts +3 -2
- package/dist/http/auth.js +254 -23
- package/dist/http/cors.d.ts +6 -6
- package/dist/http/cors.js +7 -42
- package/dist/http/download-routes.d.ts +0 -12
- package/dist/http/download-routes.js +21 -58
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/jsonrpc-http.d.ts +2 -0
- package/dist/http/jsonrpc-http.js +10 -0
- package/dist/http/mcp-routes.d.ts +8 -3
- package/dist/http/mcp-routes.js +137 -31
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +1 -1
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-transport.d.ts +7 -0
- package/dist/http/mcp-session-transport.js +57 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +15 -137
- package/dist/http/mcp-sessions.d.ts +43 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/mcp-validation.d.ts +1 -0
- package/dist/http/mcp-validation.js +11 -10
- package/dist/http/protocol-policy.d.ts +2 -0
- package/dist/http/protocol-policy.js +31 -0
- package/dist/http/rate-limit.js +7 -4
- package/dist/http/server-config.d.ts +1 -0
- package/dist/http/server-config.js +40 -0
- package/dist/http/server-middleware.d.ts +7 -9
- package/dist/http/server-middleware.js +9 -70
- package/dist/http/server-shutdown.d.ts +4 -0
- package/dist/http/server-shutdown.js +43 -0
- package/dist/http/server.d.ts +10 -0
- package/dist/http/server.js +546 -61
- package/dist/http/session-cleanup.js +8 -5
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +32 -33
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +67 -125
- package/dist/resources/index.js +0 -82
- package/dist/server.js +50 -29
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache-keys.d.ts +7 -0
- package/dist/services/cache-keys.js +57 -0
- package/dist/services/cache.d.ts +4 -9
- package/dist/services/cache.js +77 -139
- package/dist/services/context.d.ts +0 -1
- package/dist/services/context.js +0 -7
- package/dist/services/extractor.js +55 -116
- package/dist/services/fetcher/agents.d.ts +2 -2
- package/dist/services/fetcher/agents.js +35 -96
- package/dist/services/fetcher/dns-selection.d.ts +2 -0
- package/dist/services/fetcher/dns-selection.js +72 -0
- package/dist/services/fetcher/interceptors.d.ts +0 -22
- package/dist/services/fetcher/interceptors.js +18 -32
- package/dist/services/fetcher/redirects.js +16 -7
- package/dist/services/fetcher/response.js +79 -34
- package/dist/services/fetcher.d.ts +22 -3
- package/dist/services/fetcher.js +544 -44
- package/dist/services/fifo-queue.d.ts +8 -0
- package/dist/services/fifo-queue.js +25 -0
- package/dist/services/logger.js +2 -2
- package/dist/services/metadata-collector.d.ts +1 -9
- package/dist/services/metadata-collector.js +71 -2
- package/dist/services/transform-worker-pool.d.ts +4 -14
- package/dist/services/transform-worker-pool.js +177 -129
- package/dist/services/transform-worker-types.d.ts +32 -0
- package/dist/services/transform-worker-types.js +14 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
- package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
- package/dist/tools/handlers/fetch-single.shared.js +175 -89
- package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
- package/dist/tools/handlers/fetch-url.tool.js +84 -119
- package/dist/tools/index.js +21 -40
- package/dist/tools/schemas.d.ts +1 -51
- package/dist/tools/schemas.js +1 -107
- package/dist/tools/utils/cached-markdown.d.ts +5 -0
- package/dist/tools/utils/cached-markdown.js +46 -0
- package/dist/tools/utils/content-shaping.d.ts +4 -0
- package/dist/tools/utils/content-shaping.js +67 -0
- package/dist/tools/utils/content-transform.d.ts +5 -17
- package/dist/tools/utils/content-transform.js +134 -114
- package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
- package/dist/tools/utils/fetch-pipeline.js +57 -63
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/inline-content.d.ts +1 -2
- package/dist/tools/utils/inline-content.js +4 -7
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +135 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +2 -0
- package/dist/transformers/markdown.js +185 -0
- package/dist/transformers/markdown.transformer.js +5 -117
- package/dist/utils/cached-payload.d.ts +7 -0
- package/dist/utils/cached-payload.js +36 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.d.ts +9 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/error-utils.js +1 -1
- package/dist/utils/filename-generator.js +34 -12
- package/dist/utils/guards.d.ts +1 -0
- package/dist/utils/guards.js +3 -0
- package/dist/utils/header-normalizer.d.ts +0 -3
- package/dist/utils/header-normalizer.js +3 -3
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +14 -46
- package/dist/utils/url-transformer.d.ts +7 -0
- package/dist/utils/url-transformer.js +147 -0
- package/dist/utils/url-validator.d.ts +1 -2
- package/dist/utils/url-validator.js +53 -114
- package/dist/workers/content-transform.worker.d.ts +1 -0
- package/dist/workers/content-transform.worker.js +40 -0
- package/package.json +17 -18
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -1,60 +1,10 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
3
|
-
format: z.ZodDefault<z.ZodEnum<{
|
|
4
|
-
jsonl: "jsonl";
|
|
5
|
-
markdown: "markdown";
|
|
6
|
-
}>>;
|
|
7
|
-
includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
|
|
8
|
-
extractMainContent: z.ZodDefault<z.ZodBoolean>;
|
|
9
|
-
includeMetadata: z.ZodDefault<z.ZodBoolean>;
|
|
10
|
-
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
11
3
|
url: z.ZodURL;
|
|
12
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
13
|
-
timeout: z.ZodDefault<z.ZodNumber>;
|
|
14
|
-
retries: z.ZodDefault<z.ZodNumber>;
|
|
15
|
-
}, z.core.$strict>;
|
|
16
|
-
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
17
|
-
extractMainContent: z.ZodDefault<z.ZodBoolean>;
|
|
18
|
-
includeMetadata: z.ZodDefault<z.ZodBoolean>;
|
|
19
|
-
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
20
|
-
url: z.ZodURL;
|
|
21
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
22
|
-
timeout: z.ZodDefault<z.ZodNumber>;
|
|
23
|
-
retries: z.ZodDefault<z.ZodNumber>;
|
|
24
4
|
}, z.core.$strict>;
|
|
25
5
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
26
|
-
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
27
|
-
resourceUri: z.ZodOptional<z.ZodString>;
|
|
28
|
-
resourceMimeType: z.ZodOptional<z.ZodString>;
|
|
29
|
-
cached: z.ZodBoolean;
|
|
30
|
-
truncated: z.ZodOptional<z.ZodBoolean>;
|
|
31
|
-
error: z.ZodOptional<z.ZodString>;
|
|
32
|
-
errorCode: z.ZodOptional<z.ZodString>;
|
|
33
|
-
url: z.ZodString;
|
|
34
|
-
title: z.ZodOptional<z.ZodString>;
|
|
35
|
-
contentBlocks: z.ZodNumber;
|
|
36
|
-
fetchedAt: z.ZodString;
|
|
37
|
-
format: z.ZodEnum<{
|
|
38
|
-
jsonl: "jsonl";
|
|
39
|
-
markdown: "markdown";
|
|
40
|
-
}>;
|
|
41
|
-
content: z.ZodOptional<z.ZodString>;
|
|
42
|
-
}, z.core.$strict>;
|
|
43
|
-
export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
44
|
-
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
45
|
-
resourceUri: z.ZodOptional<z.ZodString>;
|
|
46
|
-
resourceMimeType: z.ZodOptional<z.ZodString>;
|
|
47
|
-
cached: z.ZodBoolean;
|
|
48
|
-
truncated: z.ZodOptional<z.ZodBoolean>;
|
|
49
|
-
error: z.ZodOptional<z.ZodString>;
|
|
50
|
-
errorCode: z.ZodOptional<z.ZodString>;
|
|
51
6
|
url: z.ZodString;
|
|
52
7
|
title: z.ZodOptional<z.ZodString>;
|
|
53
|
-
fetchedAt: z.ZodString;
|
|
54
8
|
markdown: z.ZodOptional<z.ZodString>;
|
|
55
|
-
|
|
56
|
-
downloadUrl: z.ZodString;
|
|
57
|
-
fileName: z.ZodString;
|
|
58
|
-
expiresAt: z.ZodString;
|
|
59
|
-
}, z.core.$strip>>;
|
|
9
|
+
error: z.ZodOptional<z.ZodString>;
|
|
60
10
|
}, z.core.$strict>;
|
package/dist/tools/schemas.js
CHANGED
|
@@ -1,119 +1,13 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
|
-
import { config } from '../config/index.js';
|
|
3
|
-
const MAX_HEADER_NAME_LENGTH = 128;
|
|
4
|
-
const MAX_HEADER_VALUE_LENGTH = 2048;
|
|
5
|
-
const MAX_HEADER_COUNT = 50;
|
|
6
|
-
const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
|
|
7
|
-
const customHeadersSchema = z
|
|
8
|
-
.record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
|
|
9
|
-
.refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
|
|
10
|
-
error: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
|
|
11
|
-
});
|
|
12
|
-
const requestOptionsSchema = z.object({
|
|
13
|
-
customHeaders: customHeadersSchema
|
|
14
|
-
.optional()
|
|
15
|
-
.describe('Custom HTTP headers for the request'),
|
|
16
|
-
timeout: z
|
|
17
|
-
.number()
|
|
18
|
-
.min(1000)
|
|
19
|
-
.max(120000)
|
|
20
|
-
.default(config.fetcher.timeout)
|
|
21
|
-
.describe('Request timeout in milliseconds (1000-120000)'),
|
|
22
|
-
retries: z
|
|
23
|
-
.number()
|
|
24
|
-
.min(1)
|
|
25
|
-
.max(10)
|
|
26
|
-
.default(3)
|
|
27
|
-
.describe('Number of retry attempts (1-10)'),
|
|
28
|
-
});
|
|
29
|
-
const extractionOptionsSchema = z.object({
|
|
30
|
-
extractMainContent: z
|
|
31
|
-
.boolean()
|
|
32
|
-
.default(true)
|
|
33
|
-
.describe('Use Readability to extract main article content'),
|
|
34
|
-
includeMetadata: z
|
|
35
|
-
.boolean()
|
|
36
|
-
.default(true)
|
|
37
|
-
.describe('Include page metadata (title, description, etc.)'),
|
|
38
|
-
maxContentLength: z
|
|
39
|
-
.number()
|
|
40
|
-
.positive()
|
|
41
|
-
.max(MAX_CONTENT_LENGTH)
|
|
42
|
-
.optional()
|
|
43
|
-
.describe('Maximum content length in characters'),
|
|
44
|
-
});
|
|
45
|
-
const formatOptionsSchema = z.object({
|
|
46
|
-
format: z
|
|
47
|
-
.enum(['jsonl', 'markdown'])
|
|
48
|
-
.default('jsonl')
|
|
49
|
-
.describe('Output format'),
|
|
50
|
-
includeContentBlocks: z
|
|
51
|
-
.boolean()
|
|
52
|
-
.optional()
|
|
53
|
-
.describe('Include content block counts when format=markdown'),
|
|
54
|
-
});
|
|
55
|
-
const resourceFieldsSchema = z.object({
|
|
56
|
-
contentSize: z.number().optional().describe('Content length in characters'),
|
|
57
|
-
resourceUri: z
|
|
58
|
-
.string()
|
|
59
|
-
.optional()
|
|
60
|
-
.describe('Resource URI when content is too large to inline'),
|
|
61
|
-
resourceMimeType: z
|
|
62
|
-
.string()
|
|
63
|
-
.optional()
|
|
64
|
-
.describe('MIME type for the resource URI'),
|
|
65
|
-
cached: z.boolean().describe('Whether the result was served from cache'),
|
|
66
|
-
truncated: z
|
|
67
|
-
.boolean()
|
|
68
|
-
.optional()
|
|
69
|
-
.describe('Whether content was truncated by maxContentLength'),
|
|
70
|
-
error: z.string().optional().describe('Error message if the request failed'),
|
|
71
|
-
errorCode: z.string().optional().describe('Error code if the request failed'),
|
|
72
|
-
});
|
|
73
|
-
const fileDownloadSchema = z.object({
|
|
74
|
-
downloadUrl: z.string().describe('Relative URL to download the .md file'),
|
|
75
|
-
fileName: z.string().describe('Suggested filename for download'),
|
|
76
|
-
expiresAt: z.string().describe('ISO timestamp when download expires'),
|
|
77
|
-
});
|
|
78
2
|
export const fetchUrlInputSchema = z.strictObject({
|
|
79
|
-
...requestOptionsSchema.shape,
|
|
80
|
-
url: z.url({ protocol: /^https?$/i }).describe('The URL to fetch'),
|
|
81
|
-
...extractionOptionsSchema.shape,
|
|
82
|
-
...formatOptionsSchema.shape,
|
|
83
|
-
});
|
|
84
|
-
export const fetchMarkdownInputSchema = z.strictObject({
|
|
85
|
-
...requestOptionsSchema.shape,
|
|
86
3
|
url: z.url({ protocol: /^https?$/i }).describe('The URL to fetch'),
|
|
87
|
-
...extractionOptionsSchema.shape,
|
|
88
4
|
});
|
|
89
5
|
export const fetchUrlOutputSchema = z.strictObject({
|
|
90
6
|
url: z.string().describe('The fetched URL'),
|
|
91
7
|
title: z.string().optional().describe('Page title'),
|
|
92
|
-
contentBlocks: z
|
|
93
|
-
.number()
|
|
94
|
-
.describe('Number of content blocks extracted (JSONL only)'),
|
|
95
|
-
fetchedAt: z
|
|
96
|
-
.string()
|
|
97
|
-
.describe('ISO timestamp of when the content was fetched'),
|
|
98
|
-
format: z.enum(['jsonl', 'markdown']).describe('Output format used'),
|
|
99
|
-
content: z
|
|
100
|
-
.string()
|
|
101
|
-
.optional()
|
|
102
|
-
.describe('The extracted content in JSONL or Markdown format'),
|
|
103
|
-
...resourceFieldsSchema.shape,
|
|
104
|
-
});
|
|
105
|
-
export const fetchMarkdownOutputSchema = z.strictObject({
|
|
106
|
-
url: z.string().describe('The fetched URL'),
|
|
107
|
-
title: z.string().optional().describe('Page title'),
|
|
108
|
-
fetchedAt: z
|
|
109
|
-
.string()
|
|
110
|
-
.describe('ISO timestamp of when the content was fetched'),
|
|
111
8
|
markdown: z
|
|
112
9
|
.string()
|
|
113
10
|
.optional()
|
|
114
11
|
.describe('The extracted content in Markdown format'),
|
|
115
|
-
|
|
116
|
-
.optional()
|
|
117
|
-
.describe('Download information when content is cached'),
|
|
118
|
-
...resourceFieldsSchema.shape,
|
|
12
|
+
error: z.string().optional().describe('Error message if the request failed'),
|
|
119
13
|
});
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { MarkdownTransformResult } from '../../config/types/content.js';
|
|
2
|
+
export type CachedMarkdownResult = MarkdownTransformResult & {
|
|
3
|
+
readonly content: string;
|
|
4
|
+
};
|
|
5
|
+
export declare function parseCachedMarkdownResult(cached: string): CachedMarkdownResult | undefined;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { isRecord } from '../../utils/guards.js';
|
|
2
|
+
function parseJsonRecord(input) {
|
|
3
|
+
try {
|
|
4
|
+
const parsed = JSON.parse(input);
|
|
5
|
+
return isRecord(parsed) ? parsed : undefined;
|
|
6
|
+
}
|
|
7
|
+
catch {
|
|
8
|
+
return undefined;
|
|
9
|
+
}
|
|
10
|
+
}
|
|
11
|
+
function resolveMarkdownContent(parsed) {
|
|
12
|
+
const { markdown } = parsed;
|
|
13
|
+
if (typeof markdown === 'string')
|
|
14
|
+
return markdown;
|
|
15
|
+
const { content } = parsed;
|
|
16
|
+
if (typeof content === 'string')
|
|
17
|
+
return content;
|
|
18
|
+
return undefined;
|
|
19
|
+
}
|
|
20
|
+
function resolveOptionalTitle(parsed) {
|
|
21
|
+
const { title } = parsed;
|
|
22
|
+
if (title === undefined)
|
|
23
|
+
return undefined;
|
|
24
|
+
return typeof title === 'string' ? title : undefined;
|
|
25
|
+
}
|
|
26
|
+
function resolveTruncatedFlag(parsed) {
|
|
27
|
+
const { truncated } = parsed;
|
|
28
|
+
return typeof truncated === 'boolean' ? truncated : false;
|
|
29
|
+
}
|
|
30
|
+
export function parseCachedMarkdownResult(cached) {
|
|
31
|
+
const parsed = parseJsonRecord(cached);
|
|
32
|
+
if (!parsed)
|
|
33
|
+
return undefined;
|
|
34
|
+
const resolvedContent = resolveMarkdownContent(parsed);
|
|
35
|
+
if (resolvedContent === undefined)
|
|
36
|
+
return undefined;
|
|
37
|
+
const title = resolveOptionalTitle(parsed);
|
|
38
|
+
if (parsed.title !== undefined && title === undefined)
|
|
39
|
+
return undefined;
|
|
40
|
+
return {
|
|
41
|
+
content: resolvedContent,
|
|
42
|
+
markdown: resolvedContent,
|
|
43
|
+
title,
|
|
44
|
+
truncated: resolveTruncatedFlag(parsed),
|
|
45
|
+
};
|
|
46
|
+
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, MetadataBlock } from '../../config/types/content.js';
|
|
2
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
|
|
3
|
+
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
4
|
+
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
const MIN_CONTENT_RATIO = 0.3;
|
|
2
|
+
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
3
|
+
function stripHtmlTags(html) {
|
|
4
|
+
const parts = [];
|
|
5
|
+
let inTag = false;
|
|
6
|
+
for (const char of html) {
|
|
7
|
+
if (char === '<') {
|
|
8
|
+
inTag = true;
|
|
9
|
+
continue;
|
|
10
|
+
}
|
|
11
|
+
if (char === '>') {
|
|
12
|
+
inTag = false;
|
|
13
|
+
continue;
|
|
14
|
+
}
|
|
15
|
+
if (!inTag) {
|
|
16
|
+
parts.push(char);
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
return parts.join('');
|
|
20
|
+
}
|
|
21
|
+
function estimateTextLength(html) {
|
|
22
|
+
return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
|
|
23
|
+
}
|
|
24
|
+
export function isExtractionSufficient(article, originalHtml) {
|
|
25
|
+
if (!article)
|
|
26
|
+
return false;
|
|
27
|
+
const articleLength = article.textContent.length;
|
|
28
|
+
const originalLength = estimateTextLength(originalHtml);
|
|
29
|
+
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
30
|
+
return true;
|
|
31
|
+
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
32
|
+
}
|
|
33
|
+
export function determineContentExtractionSource(article) {
|
|
34
|
+
return !!article;
|
|
35
|
+
}
|
|
36
|
+
function applyArticleMetadata(metadata, article) {
|
|
37
|
+
if (article.title !== undefined)
|
|
38
|
+
metadata.title = article.title;
|
|
39
|
+
if (article.byline !== undefined)
|
|
40
|
+
metadata.author = article.byline;
|
|
41
|
+
}
|
|
42
|
+
function applyExtractedMetadata(metadata, extractedMeta) {
|
|
43
|
+
if (extractedMeta.title !== undefined)
|
|
44
|
+
metadata.title = extractedMeta.title;
|
|
45
|
+
if (extractedMeta.description !== undefined) {
|
|
46
|
+
metadata.description = extractedMeta.description;
|
|
47
|
+
}
|
|
48
|
+
if (extractedMeta.author !== undefined) {
|
|
49
|
+
metadata.author = extractedMeta.author;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
|
|
53
|
+
if (!includeMetadata)
|
|
54
|
+
return undefined;
|
|
55
|
+
const now = new Date().toISOString();
|
|
56
|
+
const metadata = {
|
|
57
|
+
type: 'metadata',
|
|
58
|
+
url,
|
|
59
|
+
fetchedAt: now,
|
|
60
|
+
};
|
|
61
|
+
if (shouldExtractFromArticle && article) {
|
|
62
|
+
applyArticleMetadata(metadata, article);
|
|
63
|
+
return metadata;
|
|
64
|
+
}
|
|
65
|
+
applyExtractedMetadata(metadata, extractedMeta);
|
|
66
|
+
return metadata;
|
|
67
|
+
}
|
|
@@ -1,17 +1,5 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
interface ContentLengthOptions {
|
|
7
|
-
readonly maxContentLength?: number;
|
|
8
|
-
}
|
|
9
|
-
interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
|
|
10
|
-
}
|
|
11
|
-
interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
|
|
12
|
-
readonly includeContentBlocks?: boolean;
|
|
13
|
-
}
|
|
14
|
-
export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
15
|
-
export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
|
|
16
|
-
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
|
|
17
|
-
export {};
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, MarkdownTransformResult, MetadataBlock, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
|
|
3
|
+
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
4
|
+
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
5
|
+
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
|
@@ -1,134 +1,154 @@
|
|
|
1
|
-
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
2
1
|
import { extractContent } from '../../services/extractor.js';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
}
|
|
2
|
+
import { logDebug } from '../../services/logger.js';
|
|
3
|
+
import { htmlToMarkdown } from '../../transformers/markdown.js';
|
|
4
|
+
import { tryTransformRawContent } from './raw-markdown.js';
|
|
5
|
+
const MIN_CONTENT_RATIO = 0.3;
|
|
6
|
+
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
7
|
+
function stripHtmlTags(html) {
|
|
8
|
+
const parts = [];
|
|
9
|
+
let inTag = false;
|
|
10
|
+
for (const char of html) {
|
|
11
|
+
if (char === '<') {
|
|
12
|
+
inTag = true;
|
|
13
|
+
continue;
|
|
14
|
+
}
|
|
15
|
+
if (char === '>') {
|
|
16
|
+
inTag = false;
|
|
17
|
+
continue;
|
|
18
|
+
}
|
|
19
|
+
if (!inTag) {
|
|
20
|
+
parts.push(char);
|
|
21
|
+
}
|
|
16
22
|
}
|
|
17
|
-
|
|
18
|
-
extractArticle: options.extractMainContent,
|
|
19
|
-
});
|
|
20
|
-
const shouldExtractFromArticle = determineContentExtractionSource(options.extractMainContent, article);
|
|
21
|
-
const sourceHtml = shouldExtractFromArticle ? article.content : html;
|
|
22
|
-
const metadata = createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, options.includeMetadata);
|
|
23
|
-
const title = shouldExtractFromArticle ? article.title : extractedMeta.title;
|
|
24
|
-
return { sourceHtml, title, metadata };
|
|
23
|
+
return parts.join('');
|
|
25
24
|
}
|
|
26
|
-
function
|
|
27
|
-
|
|
28
|
-
if (!match?.[1])
|
|
29
|
-
return undefined;
|
|
30
|
-
const decoded = decodeHtmlEntities(match[1]);
|
|
31
|
-
const text = sanitizeText(decoded);
|
|
32
|
-
return text || undefined;
|
|
25
|
+
function estimateTextLength(html) {
|
|
26
|
+
return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
|
|
33
27
|
}
|
|
34
|
-
function
|
|
35
|
-
if (!
|
|
36
|
-
return
|
|
37
|
-
const
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
.replace(/'/g, "'");
|
|
43
|
-
return basicDecoded
|
|
44
|
-
.replace(/&#(\d+);/g, (match, code) => {
|
|
45
|
-
const parsed = Number.parseInt(code, 10);
|
|
46
|
-
return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
|
|
47
|
-
? String.fromCodePoint(parsed)
|
|
48
|
-
: match;
|
|
49
|
-
})
|
|
50
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (match, code) => {
|
|
51
|
-
const parsed = Number.parseInt(code, 16);
|
|
52
|
-
return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
|
|
53
|
-
? String.fromCodePoint(parsed)
|
|
54
|
-
: match;
|
|
55
|
-
});
|
|
28
|
+
export function isExtractionSufficient(article, originalHtml) {
|
|
29
|
+
if (!article)
|
|
30
|
+
return false;
|
|
31
|
+
const articleLength = article.textContent.length;
|
|
32
|
+
const originalLength = estimateTextLength(originalHtml);
|
|
33
|
+
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
34
|
+
return true;
|
|
35
|
+
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
56
36
|
}
|
|
57
|
-
function
|
|
58
|
-
|
|
59
|
-
return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
|
|
37
|
+
export function determineContentExtractionSource(article) {
|
|
38
|
+
return !!article;
|
|
60
39
|
}
|
|
61
|
-
function
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
truncated,
|
|
67
|
-
};
|
|
40
|
+
function applyArticleMetadata(metadata, article) {
|
|
41
|
+
if (article.title !== undefined)
|
|
42
|
+
metadata.title = article.title;
|
|
43
|
+
if (article.byline !== undefined)
|
|
44
|
+
metadata.author = article.byline;
|
|
68
45
|
}
|
|
69
|
-
function
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
46
|
+
function applyExtractedMetadata(metadata, extractedMeta) {
|
|
47
|
+
if (extractedMeta.title !== undefined)
|
|
48
|
+
metadata.title = extractedMeta.title;
|
|
49
|
+
if (extractedMeta.description !== undefined) {
|
|
50
|
+
metadata.description = extractedMeta.description;
|
|
51
|
+
}
|
|
52
|
+
if (extractedMeta.author !== undefined) {
|
|
53
|
+
metadata.author = extractedMeta.author;
|
|
54
|
+
}
|
|
73
55
|
}
|
|
74
|
-
export function
|
|
75
|
-
if (!
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
56
|
+
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
|
|
57
|
+
if (!includeMetadata)
|
|
58
|
+
return undefined;
|
|
59
|
+
const now = new Date().toISOString();
|
|
60
|
+
const metadata = {
|
|
61
|
+
type: 'metadata',
|
|
62
|
+
url,
|
|
63
|
+
fetchedAt: now,
|
|
64
|
+
};
|
|
65
|
+
if (shouldExtractFromArticle && article) {
|
|
66
|
+
applyArticleMetadata(metadata, article);
|
|
67
|
+
return metadata;
|
|
85
68
|
}
|
|
86
|
-
|
|
87
|
-
|
|
69
|
+
applyExtractedMetadata(metadata, extractedMeta);
|
|
70
|
+
return metadata;
|
|
71
|
+
}
|
|
72
|
+
function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
|
|
73
|
+
const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
|
|
88
74
|
return {
|
|
89
|
-
content,
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
...(truncated && { truncated }),
|
|
75
|
+
sourceHtml: article.content,
|
|
76
|
+
title: article.title,
|
|
77
|
+
metadata,
|
|
93
78
|
};
|
|
94
79
|
}
|
|
95
|
-
|
|
96
|
-
const
|
|
97
|
-
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
80
|
+
function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
81
|
+
const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
|
|
98
82
|
return {
|
|
99
|
-
|
|
100
|
-
title:
|
|
101
|
-
|
|
83
|
+
sourceHtml: html,
|
|
84
|
+
title: extractedMeta.title,
|
|
85
|
+
metadata,
|
|
102
86
|
};
|
|
103
87
|
}
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
88
|
+
function logQualityGateFallback({ url, articleLength, }) {
|
|
89
|
+
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
90
|
+
url: url.substring(0, 80),
|
|
91
|
+
articleLength,
|
|
92
|
+
});
|
|
93
|
+
}
|
|
94
|
+
function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
95
|
+
if (!article)
|
|
96
|
+
return null;
|
|
97
|
+
const shouldExtractFromArticle = determineContentExtractionSource(article);
|
|
98
|
+
if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
|
|
99
|
+
return buildArticleContentSource({
|
|
100
|
+
url,
|
|
101
|
+
article,
|
|
102
|
+
extractedMeta,
|
|
103
|
+
includeMetadata,
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
if (shouldExtractFromArticle) {
|
|
107
|
+
logQualityGateFallback({
|
|
108
|
+
url,
|
|
109
|
+
articleLength: article.textContent.length,
|
|
110
|
+
});
|
|
122
111
|
}
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
function resolveContentSource({ html, url, includeMetadata, }) {
|
|
115
|
+
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
116
|
+
extractArticle: true,
|
|
117
|
+
});
|
|
118
|
+
const extracted = tryBuildExtractedArticleContentSource({
|
|
119
|
+
html,
|
|
120
|
+
url,
|
|
121
|
+
article,
|
|
122
|
+
extractedMeta,
|
|
123
|
+
includeMetadata,
|
|
124
|
+
});
|
|
125
|
+
if (extracted)
|
|
126
|
+
return extracted;
|
|
127
|
+
return buildFullHtmlContentSource({
|
|
128
|
+
html,
|
|
129
|
+
url,
|
|
130
|
+
article,
|
|
131
|
+
extractedMeta,
|
|
132
|
+
includeMetadata,
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
export function transformHtmlToMarkdown(html, url, options) {
|
|
136
|
+
const raw = tryTransformRawContent({
|
|
137
|
+
html,
|
|
138
|
+
url,
|
|
139
|
+
includeMetadata: options.includeMetadata,
|
|
140
|
+
});
|
|
141
|
+
if (raw)
|
|
142
|
+
return raw;
|
|
143
|
+
const context = resolveContentSource({
|
|
144
|
+
html,
|
|
145
|
+
url,
|
|
146
|
+
includeMetadata: options.includeMetadata,
|
|
147
|
+
});
|
|
148
|
+
const content = htmlToMarkdown(context.sourceHtml, context.metadata);
|
|
128
149
|
return {
|
|
129
|
-
content,
|
|
130
|
-
contentBlocks: contentBlocks.length,
|
|
150
|
+
markdown: content,
|
|
131
151
|
title: context.title,
|
|
132
|
-
|
|
152
|
+
truncated: false,
|
|
133
153
|
};
|
|
134
154
|
}
|
|
@@ -1,10 +1,2 @@
|
|
|
1
1
|
import type { FetchPipelineOptions, PipelineResult } from '../../config/types/runtime.js';
|
|
2
|
-
/**
|
|
3
|
-
* Unified fetch pipeline that handles caching, fetching, and transformation.
|
|
4
|
-
* Implements cache-first strategy with automatic serialization.
|
|
5
|
-
*
|
|
6
|
-
* @template T - Type of the transformed result
|
|
7
|
-
* @param options - Pipeline configuration options
|
|
8
|
-
* @returns Promise resolving to the pipeline result
|
|
9
|
-
*/
|
|
10
2
|
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|