@j0hanz/superfetch 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -204
- package/dist/config/constants.d.ts +0 -1
- package/dist/config/constants.js +0 -1
- package/dist/config/formatting.d.ts +0 -1
- package/dist/config/formatting.d.ts.map +1 -1
- package/dist/config/formatting.js +1 -3
- package/dist/config/formatting.js.map +1 -1
- package/dist/config/index.d.ts +1 -2
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +5 -11
- package/dist/config/index.js.map +1 -1
- package/dist/config/types/content.d.ts +0 -1
- package/dist/config/types/content.js +0 -1
- package/dist/config/types/runtime.d.ts +0 -1
- package/dist/config/types/runtime.js +0 -1
- package/dist/config/types/tools.d.ts +0 -1
- package/dist/config/types/tools.js +0 -1
- package/dist/errors/app-error.d.ts +0 -1
- package/dist/errors/app-error.js +0 -1
- package/dist/http/auth.d.ts +0 -1
- package/dist/http/auth.js +0 -1
- package/dist/http/cors.d.ts +0 -1
- package/dist/http/cors.js +0 -1
- package/dist/http/download-routes.d.ts +1 -2
- package/dist/http/download-routes.d.ts.map +1 -1
- package/dist/http/download-routes.js +2 -2
- package/dist/http/mcp-routes.d.ts +0 -1
- package/dist/http/mcp-routes.js +0 -1
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +0 -1
- package/dist/http/mcp-session.d.ts +1 -2
- package/dist/http/mcp-session.d.ts.map +1 -1
- package/dist/http/mcp-session.js +0 -1
- package/dist/http/mcp-validation.d.ts +1 -2
- package/dist/http/mcp-validation.d.ts.map +1 -1
- package/dist/http/mcp-validation.js +6 -27
- package/dist/http/mcp-validation.js.map +1 -1
- package/dist/http/rate-limit.d.ts +1 -2
- package/dist/http/rate-limit.d.ts.map +1 -1
- package/dist/http/rate-limit.js +0 -1
- package/dist/http/rate-limit.js.map +1 -1
- package/dist/http/server-middleware.d.ts +0 -1
- package/dist/http/server-middleware.d.ts.map +1 -1
- package/dist/http/server-middleware.js +60 -4
- package/dist/http/server-middleware.js.map +1 -1
- package/dist/http/server.d.ts +0 -1
- package/dist/http/server.js +0 -1
- package/dist/http/session-cleanup.d.ts +0 -1
- package/dist/http/session-cleanup.js +0 -1
- package/dist/http/sessions.d.ts +1 -2
- package/dist/http/sessions.d.ts.map +1 -1
- package/dist/http/sessions.js +0 -1
- package/dist/index.d.ts +0 -1
- package/dist/index.js +0 -1
- package/dist/middleware/error-handler.d.ts +0 -1
- package/dist/middleware/error-handler.js +0 -1
- package/dist/resources/cached-content.d.ts +0 -1
- package/dist/resources/cached-content.js +0 -1
- package/dist/resources/index.d.ts +0 -1
- package/dist/resources/index.js +0 -1
- package/dist/server.d.ts +0 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +8 -3
- package/dist/server.js.map +1 -1
- package/dist/services/cache.d.ts +2 -2
- package/dist/services/cache.d.ts.map +1 -1
- package/dist/services/cache.js +4 -2
- package/dist/services/context.d.ts +2 -1
- package/dist/services/context.js +10 -1
- package/dist/services/extractor.d.ts +1 -2
- package/dist/services/extractor.d.ts.map +1 -1
- package/dist/services/extractor.js +0 -1
- package/dist/services/fetcher/agents.d.ts +0 -1
- package/dist/services/fetcher/agents.js +55 -2
- package/dist/services/fetcher/errors.d.ts +0 -1
- package/dist/services/fetcher/errors.js +0 -1
- package/dist/services/fetcher/interceptors.d.ts +0 -1
- package/dist/services/fetcher/interceptors.js +44 -24
- package/dist/services/fetcher/redirects.d.ts +0 -1
- package/dist/services/fetcher/redirects.js +2 -3
- package/dist/services/fetcher/response.d.ts +1 -2
- package/dist/services/fetcher/response.js +28 -16
- package/dist/services/fetcher/retry-policy.d.ts +0 -1
- package/dist/services/fetcher/retry-policy.js +0 -1
- package/dist/services/fetcher.d.ts +2 -2
- package/dist/services/fetcher.d.ts.map +1 -1
- package/dist/services/fetcher.js +9 -7
- package/dist/services/fetcher.js.map +1 -1
- package/dist/services/logger.d.ts +1 -2
- package/dist/services/logger.d.ts.map +1 -1
- package/dist/services/logger.js +0 -1
- package/dist/services/parser.d.ts +5 -2
- package/dist/services/parser.d.ts.map +1 -1
- package/dist/services/parser.js +72 -3
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +1 -2
- package/dist/tools/handlers/fetch-markdown.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +34 -3
- package/dist/tools/handlers/fetch-markdown.tool.js.map +1 -1
- package/dist/tools/handlers/fetch-single.shared.d.ts +5 -3
- package/dist/tools/handlers/fetch-single.shared.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-single.shared.js +8 -4
- package/dist/tools/handlers/fetch-single.shared.js.map +1 -1
- package/dist/tools/handlers/fetch-url.tool.d.ts +1 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts.map +1 -1
- package/dist/tools/handlers/fetch-url.tool.js +36 -3
- package/dist/tools/index.d.ts +0 -1
- package/dist/tools/index.js +0 -1
- package/dist/tools/schemas.d.ts +32 -33
- package/dist/tools/schemas.js +21 -5
- package/dist/tools/utils/cache-vary.d.ts +0 -1
- package/dist/tools/utils/cache-vary.d.ts.map +1 -1
- package/dist/tools/utils/cache-vary.js +7 -25
- package/dist/tools/utils/cache-vary.js.map +1 -1
- package/dist/tools/utils/common.d.ts +2 -3
- package/dist/tools/utils/common.d.ts.map +1 -1
- package/dist/tools/utils/common.js +6 -7
- package/dist/tools/utils/common.js.map +1 -1
- package/dist/tools/utils/content-transform.d.ts +1 -3
- package/dist/tools/utils/content-transform.d.ts.map +1 -1
- package/dist/tools/utils/content-transform.js +73 -8
- package/dist/tools/utils/content-transform.js.map +1 -1
- package/dist/tools/utils/fetch-pipeline.d.ts +1 -2
- package/dist/tools/utils/fetch-pipeline.d.ts.map +1 -1
- package/dist/tools/utils/fetch-pipeline.js +18 -22
- package/dist/tools/utils/inline-content.d.ts +0 -1
- package/dist/tools/utils/inline-content.js +0 -1
- package/dist/transformers/jsonl.transformer.d.ts +1 -2
- package/dist/transformers/jsonl.transformer.d.ts.map +1 -1
- package/dist/transformers/jsonl.transformer.js +0 -1
- package/dist/transformers/jsonl.transformer.js.map +1 -1
- package/dist/transformers/markdown.transformer.d.ts +1 -2
- package/dist/transformers/markdown.transformer.d.ts.map +1 -1
- package/dist/transformers/markdown.transformer.js +8 -1
- package/dist/utils/code-language.d.ts +0 -1
- package/dist/utils/code-language.js +0 -1
- package/dist/utils/content-cleaner.d.ts +0 -1
- package/dist/utils/content-cleaner.js +0 -1
- package/dist/utils/crypto.d.ts +0 -1
- package/dist/utils/crypto.js +0 -1
- package/dist/utils/download-url.d.ts +1 -2
- package/dist/utils/download-url.d.ts.map +1 -1
- package/dist/utils/download-url.js +0 -1
- package/dist/utils/error-utils.d.ts +0 -1
- package/dist/utils/error-utils.js +1 -3
- package/dist/utils/filename-generator.d.ts +0 -1
- package/dist/utils/filename-generator.js +0 -1
- package/dist/utils/header-normalizer.d.ts +0 -1
- package/dist/utils/header-normalizer.js +0 -1
- package/dist/utils/html-truncator.d.ts +0 -1
- package/dist/utils/html-truncator.js +0 -1
- package/dist/utils/sanitizer.d.ts +0 -1
- package/dist/utils/sanitizer.js +0 -1
- package/dist/utils/tool-error-handler.d.ts +1 -2
- package/dist/utils/tool-error-handler.d.ts.map +1 -1
- package/dist/utils/tool-error-handler.js +0 -1
- package/dist/utils/url-validator.d.ts +6 -3
- package/dist/utils/url-validator.d.ts.map +1 -1
- package/dist/utils/url-validator.js +49 -11
- package/dist/utils/url-validator.js.map +1 -1
- package/package.json +4 -4
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
3
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
3
|
+
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
4
4
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
5
5
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
6
6
|
} & {
|
|
@@ -13,25 +13,25 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
|
13
13
|
format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
|
|
14
14
|
}, "strict", z.ZodTypeAny, {
|
|
15
15
|
url: string;
|
|
16
|
+
timeout: number;
|
|
17
|
+
retries: number;
|
|
16
18
|
extractMainContent: boolean;
|
|
17
19
|
includeMetadata: boolean;
|
|
18
20
|
format: "jsonl" | "markdown";
|
|
19
|
-
timeout: number;
|
|
20
|
-
retries: number;
|
|
21
|
-
maxContentLength?: number | undefined;
|
|
22
21
|
customHeaders?: Record<string, string> | undefined;
|
|
22
|
+
maxContentLength?: number | undefined;
|
|
23
23
|
}, {
|
|
24
24
|
url: string;
|
|
25
|
+
customHeaders?: Record<string, string> | undefined;
|
|
26
|
+
timeout?: number | undefined;
|
|
27
|
+
retries?: number | undefined;
|
|
25
28
|
extractMainContent?: boolean | undefined;
|
|
26
29
|
includeMetadata?: boolean | undefined;
|
|
27
30
|
maxContentLength?: number | undefined;
|
|
28
31
|
format?: "jsonl" | "markdown" | undefined;
|
|
29
|
-
customHeaders?: Record<string, string> | undefined;
|
|
30
|
-
timeout?: number | undefined;
|
|
31
|
-
retries?: number | undefined;
|
|
32
32
|
}>;
|
|
33
33
|
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
34
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
34
|
+
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
35
35
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
36
36
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
37
37
|
} & {
|
|
@@ -42,20 +42,20 @@ export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
|
42
42
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
43
43
|
}, "strict", z.ZodTypeAny, {
|
|
44
44
|
url: string;
|
|
45
|
-
extractMainContent: boolean;
|
|
46
|
-
includeMetadata: boolean;
|
|
47
45
|
timeout: number;
|
|
48
46
|
retries: number;
|
|
49
|
-
|
|
47
|
+
extractMainContent: boolean;
|
|
48
|
+
includeMetadata: boolean;
|
|
50
49
|
customHeaders?: Record<string, string> | undefined;
|
|
50
|
+
maxContentLength?: number | undefined;
|
|
51
51
|
}, {
|
|
52
52
|
url: string;
|
|
53
|
-
extractMainContent?: boolean | undefined;
|
|
54
|
-
includeMetadata?: boolean | undefined;
|
|
55
|
-
maxContentLength?: number | undefined;
|
|
56
53
|
customHeaders?: Record<string, string> | undefined;
|
|
57
54
|
timeout?: number | undefined;
|
|
58
55
|
retries?: number | undefined;
|
|
56
|
+
extractMainContent?: boolean | undefined;
|
|
57
|
+
includeMetadata?: boolean | undefined;
|
|
58
|
+
maxContentLength?: number | undefined;
|
|
59
59
|
}>;
|
|
60
60
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
61
61
|
url: z.ZodString;
|
|
@@ -75,31 +75,31 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
|
75
75
|
}, "strict", z.ZodTypeAny, {
|
|
76
76
|
url: string;
|
|
77
77
|
format: "jsonl" | "markdown";
|
|
78
|
+
contentBlocks: number;
|
|
78
79
|
fetchedAt: string;
|
|
79
80
|
cached: boolean;
|
|
80
|
-
contentBlocks: number;
|
|
81
81
|
error?: string | undefined;
|
|
82
|
-
content?: string | undefined;
|
|
83
|
-
errorCode?: string | undefined;
|
|
84
82
|
title?: string | undefined;
|
|
83
|
+
content?: string | undefined;
|
|
85
84
|
contentSize?: number | undefined;
|
|
86
|
-
truncated?: boolean | undefined;
|
|
87
85
|
resourceUri?: string | undefined;
|
|
88
86
|
resourceMimeType?: string | undefined;
|
|
87
|
+
truncated?: boolean | undefined;
|
|
88
|
+
errorCode?: string | undefined;
|
|
89
89
|
}, {
|
|
90
90
|
url: string;
|
|
91
91
|
format: "jsonl" | "markdown";
|
|
92
|
+
contentBlocks: number;
|
|
92
93
|
fetchedAt: string;
|
|
93
94
|
cached: boolean;
|
|
94
|
-
contentBlocks: number;
|
|
95
95
|
error?: string | undefined;
|
|
96
|
-
content?: string | undefined;
|
|
97
|
-
errorCode?: string | undefined;
|
|
98
96
|
title?: string | undefined;
|
|
97
|
+
content?: string | undefined;
|
|
99
98
|
contentSize?: number | undefined;
|
|
100
|
-
truncated?: boolean | undefined;
|
|
101
99
|
resourceUri?: string | undefined;
|
|
102
100
|
resourceMimeType?: string | undefined;
|
|
101
|
+
truncated?: boolean | undefined;
|
|
102
|
+
errorCode?: string | undefined;
|
|
103
103
|
}>;
|
|
104
104
|
export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
105
105
|
url: z.ZodString;
|
|
@@ -111,12 +111,12 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
111
111
|
fileName: z.ZodString;
|
|
112
112
|
expiresAt: z.ZodString;
|
|
113
113
|
}, "strip", z.ZodTypeAny, {
|
|
114
|
-
fileName: string;
|
|
115
114
|
downloadUrl: string;
|
|
115
|
+
fileName: string;
|
|
116
116
|
expiresAt: string;
|
|
117
117
|
}, {
|
|
118
|
-
fileName: string;
|
|
119
118
|
downloadUrl: string;
|
|
119
|
+
fileName: string;
|
|
120
120
|
expiresAt: string;
|
|
121
121
|
}>>;
|
|
122
122
|
} & {
|
|
@@ -133,33 +133,32 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
133
133
|
cached: boolean;
|
|
134
134
|
error?: string | undefined;
|
|
135
135
|
markdown?: string | undefined;
|
|
136
|
-
errorCode?: string | undefined;
|
|
137
136
|
title?: string | undefined;
|
|
138
137
|
contentSize?: number | undefined;
|
|
138
|
+
resourceUri?: string | undefined;
|
|
139
|
+
resourceMimeType?: string | undefined;
|
|
139
140
|
truncated?: boolean | undefined;
|
|
141
|
+
errorCode?: string | undefined;
|
|
140
142
|
file?: {
|
|
141
|
-
fileName: string;
|
|
142
143
|
downloadUrl: string;
|
|
144
|
+
fileName: string;
|
|
143
145
|
expiresAt: string;
|
|
144
146
|
} | undefined;
|
|
145
|
-
resourceUri?: string | undefined;
|
|
146
|
-
resourceMimeType?: string | undefined;
|
|
147
147
|
}, {
|
|
148
148
|
url: string;
|
|
149
149
|
fetchedAt: string;
|
|
150
150
|
cached: boolean;
|
|
151
151
|
error?: string | undefined;
|
|
152
152
|
markdown?: string | undefined;
|
|
153
|
-
errorCode?: string | undefined;
|
|
154
153
|
title?: string | undefined;
|
|
155
154
|
contentSize?: number | undefined;
|
|
155
|
+
resourceUri?: string | undefined;
|
|
156
|
+
resourceMimeType?: string | undefined;
|
|
156
157
|
truncated?: boolean | undefined;
|
|
158
|
+
errorCode?: string | undefined;
|
|
157
159
|
file?: {
|
|
158
|
-
fileName: string;
|
|
159
160
|
downloadUrl: string;
|
|
161
|
+
fileName: string;
|
|
160
162
|
expiresAt: string;
|
|
161
163
|
} | undefined;
|
|
162
|
-
resourceUri?: string | undefined;
|
|
163
|
-
resourceMimeType?: string | undefined;
|
|
164
164
|
}>;
|
|
165
|
-
//# sourceMappingURL=schemas.d.ts.map
|
package/dist/tools/schemas.js
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
import { config } from '../config/index.js';
|
|
3
|
+
const MAX_HEADER_NAME_LENGTH = 128;
|
|
4
|
+
const MAX_HEADER_VALUE_LENGTH = 2048;
|
|
5
|
+
const MAX_HEADER_COUNT = 50;
|
|
6
|
+
const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
|
|
7
|
+
const customHeadersSchema = z
|
|
8
|
+
.record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
|
|
9
|
+
.refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
|
|
10
|
+
message: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
|
|
11
|
+
});
|
|
3
12
|
const requestOptionsSchema = z.object({
|
|
4
|
-
customHeaders:
|
|
5
|
-
.record(z.string())
|
|
13
|
+
customHeaders: customHeadersSchema
|
|
6
14
|
.optional()
|
|
7
15
|
.describe('Custom HTTP headers for the request'),
|
|
8
16
|
timeout: z
|
|
@@ -30,6 +38,7 @@ const extractionOptionsSchema = z.object({
|
|
|
30
38
|
maxContentLength: z
|
|
31
39
|
.number()
|
|
32
40
|
.positive()
|
|
41
|
+
.max(MAX_CONTENT_LENGTH)
|
|
33
42
|
.optional()
|
|
34
43
|
.describe('Maximum content length in characters'),
|
|
35
44
|
});
|
|
@@ -64,14 +73,22 @@ const fileDownloadSchema = z.object({
|
|
|
64
73
|
});
|
|
65
74
|
export const fetchUrlInputSchema = requestOptionsSchema
|
|
66
75
|
.extend({
|
|
67
|
-
url: z
|
|
76
|
+
url: z
|
|
77
|
+
.string()
|
|
78
|
+
.min(1)
|
|
79
|
+
.max(config.constants.maxUrlLength)
|
|
80
|
+
.describe('The URL to fetch'),
|
|
68
81
|
})
|
|
69
82
|
.merge(extractionOptionsSchema)
|
|
70
83
|
.merge(formatOptionsSchema)
|
|
71
84
|
.strict();
|
|
72
85
|
export const fetchMarkdownInputSchema = requestOptionsSchema
|
|
73
86
|
.extend({
|
|
74
|
-
url: z
|
|
87
|
+
url: z
|
|
88
|
+
.string()
|
|
89
|
+
.min(1)
|
|
90
|
+
.max(config.constants.maxUrlLength)
|
|
91
|
+
.describe('The URL to fetch'),
|
|
75
92
|
})
|
|
76
93
|
.merge(extractionOptionsSchema)
|
|
77
94
|
.strict();
|
|
@@ -110,4 +127,3 @@ export const fetchMarkdownOutputSchema = z
|
|
|
110
127
|
})
|
|
111
128
|
.merge(resourceFieldsSchema)
|
|
112
129
|
.strict();
|
|
113
|
-
//# sourceMappingURL=schemas.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cache-vary.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/cache-vary.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"cache-vary.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/cache-vary.ts"],"names":[],"mappings":"AAIA,wBAAgB,gBAAgB,CAC9B,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,GAAG,SAAS,EACvD,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACrC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,GAAG,SAAS,CAY9C"}
|
|
@@ -1,30 +1,12 @@
|
|
|
1
1
|
import { config } from '../../config/index.js';
|
|
2
2
|
import { normalizeHeaderRecord } from '../../utils/header-normalizer.js';
|
|
3
|
-
function normalizeHeadersForCache(headers) {
|
|
4
|
-
return normalizeHeaderRecord(headers, config.security.blockedHeaders, {
|
|
5
|
-
trimValues: true,
|
|
6
|
-
});
|
|
7
|
-
}
|
|
8
3
|
export function appendHeaderVary(cacheVary, customHeaders) {
|
|
9
|
-
const
|
|
10
|
-
|
|
11
|
-
}
|
|
12
|
-
function mergeCacheVary(cacheVary, headerVary) {
|
|
13
|
-
if (!cacheVary && !headerVary)
|
|
14
|
-
return undefined;
|
|
15
|
-
if (typeof cacheVary === 'string') {
|
|
16
|
-
return buildStringVary(cacheVary, headerVary);
|
|
17
|
-
}
|
|
18
|
-
return mergeObjectVary(cacheVary, headerVary);
|
|
19
|
-
}
|
|
20
|
-
function mergeObjectVary(cacheVary, headerVary) {
|
|
21
|
-
if (!headerVary)
|
|
4
|
+
const headers = normalizeHeaderRecord(customHeaders, config.security.blockedHeaders, { trimValues: true });
|
|
5
|
+
if (!headers)
|
|
22
6
|
return cacheVary;
|
|
23
|
-
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
return { key, headers: headerVary };
|
|
7
|
+
if (!cacheVary)
|
|
8
|
+
return { headers };
|
|
9
|
+
return typeof cacheVary === 'string'
|
|
10
|
+
? { key: cacheVary, headers }
|
|
11
|
+
: { ...cacheVary, headers };
|
|
29
12
|
}
|
|
30
|
-
//# sourceMappingURL=cache-vary.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cache-vary.js","sourceRoot":"","sources":["../../../src/tools/utils/cache-vary.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAE/C,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,
|
|
1
|
+
{"version":3,"file":"cache-vary.js","sourceRoot":"","sources":["../../../src/tools/utils/cache-vary.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAE/C,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,MAAM,UAAU,gBAAgB,CAC9B,SAAuD,EACvD,aAAsC;IAEtC,MAAM,OAAO,GAAG,qBAAqB,CACnC,aAAa,EACb,MAAM,CAAC,QAAQ,CAAC,cAAc,EAC9B,EAAE,UAAU,EAAE,IAAI,EAAE,CACrB,CAAC;IAEF,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAC;IAC/B,IAAI,CAAC,SAAS;QAAE,OAAO,EAAE,OAAO,EAAE,CAAC;IACnC,OAAO,OAAO,SAAS,KAAK,QAAQ;QAClC,CAAC,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,OAAO,EAAE;QAC7B,CAAC,CAAC,EAAE,GAAG,SAAS,EAAE,OAAO,EAAE,CAAC;AAChC,CAAC"}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
import type { ExtractedArticle, ExtractedMetadata, MetadataBlock
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, MetadataBlock } from '../../config/types/content.js';
|
|
2
|
+
import type { TruncationResult } from '../../config/types/runtime.js';
|
|
2
3
|
export declare function determineContentExtractionSource(extractMainContent: boolean, article: ExtractedArticle | null): article is ExtractedArticle;
|
|
3
4
|
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
4
|
-
export declare function enforceContentLengthLimit(content: string, maxLength?: number): TruncationResult;
|
|
5
5
|
export declare function truncateContent(content: string, maxLength?: number, suffix?: string): TruncationResult;
|
|
6
|
-
//# sourceMappingURL=common.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/common.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,aAAa,
|
|
1
|
+
{"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/common.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,aAAa,EACd,MAAM,+BAA+B,CAAC;AACvC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAEtE,wBAAgB,gCAAgC,CAC9C,kBAAkB,EAAE,OAAO,EAC3B,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAmB3B;AAED,wBAAgB,eAAe,CAC7B,OAAO,EAAE,MAAM,EACf,SAAS,CAAC,EAAE,MAAM,EAClB,MAAM,SAAoB,GACzB,gBAAgB,CAYlB"}
|
|
@@ -23,17 +23,16 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
23
23
|
fetchedAt: now,
|
|
24
24
|
};
|
|
25
25
|
}
|
|
26
|
-
export function enforceContentLengthLimit(content, maxLength) {
|
|
27
|
-
return truncateContent(content, maxLength);
|
|
28
|
-
}
|
|
29
26
|
export function truncateContent(content, maxLength, suffix = TRUNCATION_MARKER) {
|
|
30
|
-
|
|
31
|
-
|
|
27
|
+
if (maxLength === undefined ||
|
|
28
|
+
maxLength <= 0 ||
|
|
29
|
+
content.length <= maxLength) {
|
|
32
30
|
return { content, truncated: false };
|
|
33
31
|
}
|
|
32
|
+
const safeMax = Math.max(0, maxLength - suffix.length);
|
|
33
|
+
const marker = suffix.length > maxLength ? suffix.substring(0, maxLength) : suffix;
|
|
34
34
|
return {
|
|
35
|
-
content: `${content.substring(0,
|
|
35
|
+
content: `${content.substring(0, safeMax)}${marker}`,
|
|
36
36
|
truncated: true,
|
|
37
37
|
};
|
|
38
38
|
}
|
|
39
|
-
//# sourceMappingURL=common.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"common.js","sourceRoot":"","sources":["../../../src/tools/utils/common.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAQ/D,MAAM,UAAU,gCAAgC,CAC9C,kBAA2B,EAC3B,OAAgC;IAEhC,OAAO,kBAAkB,IAAI,CAAC,CAAC,OAAO,CAAC;AACzC,CAAC;AAED,MAAM,UAAU,0BAA0B,CACxC,GAAW,EACX,OAAgC,EAChC,aAAgC,EAChC,wBAAiC,EACjC,eAAwB;IAExB,IAAI,CAAC,eAAe;QAAE,OAAO,SAAS,CAAC;IACvC,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACrC,OAAO,wBAAwB,IAAI,OAAO;QACxC,CAAC,CAAC;YACE,IAAI,EAAE,UAAU;YAChB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,GAAG;YACH,SAAS,EAAE,GAAG;SACf;QACH,CAAC,CAAC;YACE,IAAI,EAAE,UAAU;YAChB,KAAK,EAAE,aAAa,CAAC,KAAK;YAC1B,WAAW,EAAE,aAAa,CAAC,WAAW;YACtC,MAAM,EAAE,aAAa,CAAC,MAAM;YAC5B,GAAG;YACH,SAAS,EAAE,GAAG;SACf,CAAC;AACR,CAAC;AAED,MAAM,UAAU,
|
|
1
|
+
{"version":3,"file":"common.js","sourceRoot":"","sources":["../../../src/tools/utils/common.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAQ/D,MAAM,UAAU,gCAAgC,CAC9C,kBAA2B,EAC3B,OAAgC;IAEhC,OAAO,kBAAkB,IAAI,CAAC,CAAC,OAAO,CAAC;AACzC,CAAC;AAED,MAAM,UAAU,0BAA0B,CACxC,GAAW,EACX,OAAgC,EAChC,aAAgC,EAChC,wBAAiC,EACjC,eAAwB;IAExB,IAAI,CAAC,eAAe;QAAE,OAAO,SAAS,CAAC;IACvC,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACrC,OAAO,wBAAwB,IAAI,OAAO;QACxC,CAAC,CAAC;YACE,IAAI,EAAE,UAAU;YAChB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,GAAG;YACH,SAAS,EAAE,GAAG;SACf;QACH,CAAC,CAAC;YACE,IAAI,EAAE,UAAU;YAChB,KAAK,EAAE,aAAa,CAAC,KAAK;YAC1B,WAAW,EAAE,aAAa,CAAC,WAAW;YACtC,MAAM,EAAE,aAAa,CAAC,MAAM;YAC5B,GAAG;YACH,SAAS,EAAE,GAAG;SACf,CAAC;AACR,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,OAAe,EACf,SAAkB,EAClB,MAAM,GAAG,iBAAiB;IAE1B,MAAM,cAAc,GAClB,SAAS,KAAK,SAAS,IAAI,SAAS,GAAG,CAAC,IAAI,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC;IAEzE,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;IACvC,CAAC;IAED,OAAO;QACL,OAAO,EAAE,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,MAAM,EAAE;QACtD,SAAS,EAAE,IAAI;KAChB,CAAC;AACJ,CAAC"}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { JsonlTransformResult, MarkdownTransformResult } from '../../config/types.js';
|
|
1
|
+
import type { JsonlTransformResult, MarkdownTransformResult } from '../../config/types/content.js';
|
|
2
2
|
interface ExtractionOptions {
|
|
3
3
|
readonly extractMainContent: boolean;
|
|
4
4
|
readonly includeMetadata: boolean;
|
|
@@ -7,10 +7,8 @@ interface ContentLengthOptions {
|
|
|
7
7
|
readonly maxContentLength?: number;
|
|
8
8
|
}
|
|
9
9
|
interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
|
|
10
|
-
readonly generateToc?: boolean;
|
|
11
10
|
}
|
|
12
11
|
export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
13
12
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
|
|
14
13
|
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
15
14
|
export {};
|
|
16
|
-
//# sourceMappingURL=content-transform.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content-transform.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/content-transform.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,oBAAoB,EACpB,uBAAuB,EACxB,MAAM,
|
|
1
|
+
{"version":3,"file":"content-transform.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/content-transform.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,oBAAoB,EACpB,uBAAuB,EACxB,MAAM,+BAA+B,CAAC;AAcvC,UAAU,iBAAiB;IACzB,QAAQ,CAAC,kBAAkB,EAAE,OAAO,CAAC;IACrC,QAAQ,CAAC,eAAe,EAAE,OAAO,CAAC;CACnC;AAQD,UAAU,oBAAoB;IAC5B,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;CACpC;AAED,UAAU,eAAgB,SAAQ,iBAAiB,EAAE,oBAAoB;CAAG;AA4D5E,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,iBAAiB,GAAG,oBAAoB,GAChD,oBAAoB,CAatB;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,eAAe,GACvB,uBAAuB,CAYzB;AAED,wBAAgB,iCAAiC,CAC/C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,iBAAiB,GAAG,oBAAoB,GAChD,oBAAoB,CActB"}
|
|
@@ -1,10 +1,19 @@
|
|
|
1
1
|
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
2
2
|
import { extractContent } from '../../services/extractor.js';
|
|
3
|
-
import { parseHtml } from '../../services/parser.js';
|
|
3
|
+
import { parseHtml, parseHtmlWithMetadata } from '../../services/parser.js';
|
|
4
|
+
import { sanitizeText } from '../../utils/sanitizer.js';
|
|
4
5
|
import { toJsonl } from '../../transformers/jsonl.transformer.js';
|
|
5
6
|
import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
|
|
6
7
|
import { createContentMetadataBlock, determineContentExtractionSource, truncateContent, } from './common.js';
|
|
8
|
+
const TITLE_PATTERN = /<title[^>]*>([\s\S]*?)<\/title>/i;
|
|
7
9
|
function resolveContentSource(html, url, options) {
|
|
10
|
+
if (!options.extractMainContent && !options.includeMetadata) {
|
|
11
|
+
return {
|
|
12
|
+
sourceHtml: html,
|
|
13
|
+
title: extractTitleFromHtml(html),
|
|
14
|
+
metadata: undefined,
|
|
15
|
+
};
|
|
16
|
+
}
|
|
8
17
|
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
9
18
|
extractArticle: options.extractMainContent,
|
|
10
19
|
});
|
|
@@ -14,12 +23,43 @@ function resolveContentSource(html, url, options) {
|
|
|
14
23
|
const title = shouldExtractFromArticle ? article.title : extractedMeta.title;
|
|
15
24
|
return { sourceHtml, title, metadata };
|
|
16
25
|
}
|
|
17
|
-
function
|
|
18
|
-
|
|
26
|
+
function extractTitleFromHtml(html) {
|
|
27
|
+
const match = TITLE_PATTERN.exec(html);
|
|
28
|
+
if (!match?.[1])
|
|
29
|
+
return undefined;
|
|
30
|
+
const decoded = decodeHtmlEntities(match[1]);
|
|
31
|
+
const text = sanitizeText(decoded);
|
|
32
|
+
return text || undefined;
|
|
33
|
+
}
|
|
34
|
+
function decodeHtmlEntities(value) {
|
|
35
|
+
if (!value.includes('&'))
|
|
36
|
+
return value;
|
|
37
|
+
const basicDecoded = value
|
|
38
|
+
.replace(/&amp;/g, '&')
|
|
39
|
+
.replace(/&lt;/g, '<')
|
|
40
|
+
.replace(/&gt;/g, '>')
|
|
41
|
+
.replace(/&quot;/g, '"')
|
|
42
|
+
.replace(/&#39;/g, "'");
|
|
43
|
+
return basicDecoded
|
|
44
|
+
.replace(/&#(\d+);/g, (match, code) => {
|
|
45
|
+
const parsed = Number.parseInt(code, 10);
|
|
46
|
+
return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
|
|
47
|
+
? String.fromCodePoint(parsed)
|
|
48
|
+
: match;
|
|
49
|
+
})
|
|
50
|
+
.replace(/&#x([0-9a-fA-F]+);/g, (match, code) => {
|
|
51
|
+
const parsed = Number.parseInt(code, 16);
|
|
52
|
+
return Number.isFinite(parsed) && parsed >= 0 && parsed <= 0x10ffff
|
|
53
|
+
? String.fromCodePoint(parsed)
|
|
54
|
+
: match;
|
|
55
|
+
});
|
|
19
56
|
}
|
|
20
57
|
function buildJsonlPayload(context, maxContentLength) {
|
|
21
58
|
const contentBlocks = parseHtml(context.sourceHtml);
|
|
22
|
-
|
|
59
|
+
return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
|
|
60
|
+
}
|
|
61
|
+
function buildJsonlPayloadFromBlocks(contentBlocks, metadata, maxContentLength) {
|
|
62
|
+
const { content, truncated } = truncateContent(toJsonl(contentBlocks, metadata), maxContentLength);
|
|
23
63
|
return {
|
|
24
64
|
content,
|
|
25
65
|
contentBlocks: contentBlocks.length,
|
|
@@ -32,7 +72,18 @@ function buildMarkdownPayload(context, maxContentLength) {
|
|
|
32
72
|
return { content, truncated };
|
|
33
73
|
}
|
|
34
74
|
export function transformHtmlToJsonl(html, url, options) {
|
|
35
|
-
|
|
75
|
+
if (!options.extractMainContent && options.includeMetadata) {
|
|
76
|
+
const parsed = parseHtmlWithMetadata(html);
|
|
77
|
+
const metadataBlock = createContentMetadataBlock(url, null, parsed.metadata, false, true);
|
|
78
|
+
const { content, contentBlocks, truncated } = buildJsonlPayloadFromBlocks(parsed.blocks, metadataBlock, options.maxContentLength);
|
|
79
|
+
return {
|
|
80
|
+
content,
|
|
81
|
+
contentBlocks,
|
|
82
|
+
title: parsed.metadata.title,
|
|
83
|
+
...(truncated && { truncated }),
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
const context = resolveContentSource(html, url, options);
|
|
36
87
|
const { content, contentBlocks, truncated } = buildJsonlPayload(context, options.maxContentLength);
|
|
37
88
|
return {
|
|
38
89
|
content,
|
|
@@ -42,7 +93,7 @@ export function transformHtmlToJsonl(html, url, options) {
|
|
|
42
93
|
};
|
|
43
94
|
}
|
|
44
95
|
export function transformHtmlToMarkdown(html, url, options) {
|
|
45
|
-
const context =
|
|
96
|
+
const context = resolveContentSource(html, url, options);
|
|
46
97
|
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
47
98
|
return {
|
|
48
99
|
markdown: content,
|
|
@@ -51,7 +102,22 @@ export function transformHtmlToMarkdown(html, url, options) {
|
|
|
51
102
|
};
|
|
52
103
|
}
|
|
53
104
|
export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
54
|
-
|
|
105
|
+
if (!options.extractMainContent && options.includeMetadata) {
|
|
106
|
+
const parsed = parseHtmlWithMetadata(html);
|
|
107
|
+
const context = {
|
|
108
|
+
sourceHtml: html,
|
|
109
|
+
title: parsed.metadata.title,
|
|
110
|
+
metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
|
|
111
|
+
};
|
|
112
|
+
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
113
|
+
return {
|
|
114
|
+
content,
|
|
115
|
+
contentBlocks: parsed.blocks.length,
|
|
116
|
+
title: context.title,
|
|
117
|
+
...(truncated && { truncated }),
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
const context = resolveContentSource(html, url, options);
|
|
55
121
|
const contentBlocks = parseHtml(context.sourceHtml);
|
|
56
122
|
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
57
123
|
return {
|
|
@@ -61,4 +127,3 @@ export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
|
61
127
|
...(truncated && { truncated }),
|
|
62
128
|
};
|
|
63
129
|
}
|
|
64
|
-
//# sourceMappingURL=content-transform.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content-transform.js","sourceRoot":"","sources":["../../../src/tools/utils/content-transform.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAM/D,OAAO,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7D,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AAErD,OAAO,EAAE,OAAO,EAAE,MAAM,yCAAyC,CAAC;AAClE,OAAO,EAAE,cAAc,EAAE,MAAM,4CAA4C,CAAC;AAE5E,OAAO,EACL,0BAA0B,EAC1B,gCAAgC,EAChC,eAAe,GAChB,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"content-transform.js","sourceRoot":"","sources":["../../../src/tools/utils/content-transform.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAM/D,OAAO,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7D,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AAErD,OAAO,EAAE,OAAO,EAAE,MAAM,yCAAyC,CAAC;AAClE,OAAO,EAAE,cAAc,EAAE,MAAM,4CAA4C,CAAC;AAE5E,OAAO,EACL,0BAA0B,EAC1B,gCAAgC,EAChC,eAAe,GAChB,MAAM,aAAa,CAAC;AAmBrB,SAAS,oBAAoB,CAC3B,IAAY,EACZ,GAAW,EACX,OAA0B;IAE1B,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,GAAG,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE;QACrE,cAAc,EAAE,OAAO,CAAC,kBAAkB;KAC3C,CAAC,CAAC;IAEH,MAAM,wBAAwB,GAAG,gCAAgC,CAC/D,OAAO,CAAC,kBAAkB,EAC1B,OAAO,CACR,CAAC;IAEF,MAAM,UAAU,GAAG,wBAAwB,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;IACrE,MAAM,QAAQ,GAAG,0BAA0B,CACzC,GAAG,EACH,OAAO,EACP,aAAa,EACb,wBAAwB,EACxB,OAAO,CAAC,eAAe,CACxB,CAAC;IACF,MAAM,KAAK,GAAG,wBAAwB,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC;IAE7E,OAAO,EAAE,UAAU,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;AACzC,CAAC;AAED,SAAS,iBAAiB,CACxB,OAAsB,EACtB,gBAAyB;IAEzB,MAAM,aAAa,GAAG,SAAS,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IACpD,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,eAAe,CAC5C,OAAO,CAAC,aAAa,EAAE,OAAO,CAAC,QAAQ,CAAC,EACxC,gBAAgB,CACjB,CAAC;IAEF,OAAO;QACL,OAAO;QACP,aAAa,EAAE,aAAa,CAAC,MAAM;QACnC,SAAS;KACV,CAAC;AACJ,CAAC;AAED,SAAS,oBAAoB,CAC3B,OAAsB,EACtB,gBAAyB;IAEzB,MAAM,QAAQ,GAAG,cAAc,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IACtE,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,eAAe,CAC5C,QAAQ,EACR,gBAAgB,EAChB,iBAAiB,CAClB,CAAC;IAEF,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;AAChC,CAAC;AAED,MAAM,UAAU,oBAAoB,CAClC,IAAY,EACZ,GAAW,EACX,OAAiD;IAEjD,MAAM,OAAO,GAAG,oBAAoB,CAAC,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;IACzD,MAAM,EAAE,OAAO,EAAE,aAAa,EAAE,SAAS,EAAE,GAAG,iBAAiB,CAC7D,OAAO,EACP,OAAO,CAAC,gBAAgB,CACzB,CAAC;IAEF,OAAO;QACL,OAAO;QACP,aAAa;QACb,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,GAAG,CAAC,SAAS,IAAI,EAAE,SAAS,EAAE,CAAC;KAChC,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,uBAAuB,CACrC,IAAY,EACZ,GAAW,EACX,OAAwB;IAExB,MAAM,OAAO,GAA
G,oBAAoB,CAAC,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;IACzD,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,oBAAoB,CACjD,OAAO,EACP,OAAO,CAAC,gBAAgB,CACzB,CAAC;IAEF,OAAO;QACL,QAAQ,EAAE,OAAO;QACjB,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,SAAS;KACV,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,iCAAiC,CAC/C,IAAY,EACZ,GAAW,EACX,OAAiD;IAEjD,MAAM,OAAO,GAAG,oBAAoB,CAAC,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;IACzD,MAAM,aAAa,GAAG,SAAS,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IACpD,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,oBAAoB,CACjD,OAAO,EACP,OAAO,CAAC,gBAAgB,CACzB,CAAC;IAEF,OAAO;QACL,OAAO;QACP,aAAa,EAAE,aAAa,CAAC,MAAM;QACnC,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,GAAG,CAAC,SAAS,IAAI,EAAE,SAAS,EAAE,CAAC;KAChC,CAAC;AACJ,CAAC"}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { FetchPipelineOptions, PipelineResult } from '../../config/types.js';
|
|
1
|
+
import type { FetchPipelineOptions, PipelineResult } from '../../config/types/runtime.js';
|
|
2
2
|
/**
|
|
3
3
|
* Unified fetch pipeline that handles caching, fetching, and transformation.
|
|
4
4
|
* Implements cache-first strategy with automatic serialization.
|
|
@@ -8,4 +8,3 @@ import type { FetchPipelineOptions, PipelineResult } from '../../config/types.js
|
|
|
8
8
|
* @returns Promise resolving to the pipeline result
|
|
9
9
|
*/
|
|
10
10
|
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|
|
11
|
-
//# sourceMappingURL=fetch-pipeline.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch-pipeline.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/fetch-pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAEV,oBAAoB,EACpB,cAAc,EACf,MAAM,
|
|
1
|
+
{"version":3,"file":"fetch-pipeline.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/fetch-pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAEV,oBAAoB,EACpB,cAAc,EACf,MAAM,+BAA+B,CAAC;AAuDvC;;;;;;;GAOG;AACH,wBAAsB,oBAAoB,CAAC,CAAC,EAC1C,OAAO,EAAE,oBAAoB,CAAC,CAAC,CAAC,GAC/B,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAwB5B"}
|
|
@@ -1,29 +1,22 @@
|
|
|
1
1
|
import * as cache from '../../services/cache.js';
|
|
2
|
-
import {
|
|
3
|
-
import { logDebug
|
|
4
|
-
import {
|
|
2
|
+
import { fetchNormalizedUrlWithRetry } from '../../services/fetcher.js';
|
|
3
|
+
import { logDebug } from '../../services/logger.js';
|
|
4
|
+
import { assertResolvedAddressesAllowed, normalizeUrl, } from '../../utils/url-validator.js';
|
|
5
5
|
import { appendHeaderVary } from './cache-vary.js';
|
|
6
|
-
function safeJsonParse(cached, cacheKey) {
|
|
7
|
-
try {
|
|
8
|
-
return JSON.parse(cached);
|
|
9
|
-
}
|
|
10
|
-
catch {
|
|
11
|
-
logWarn('Cache deserialize failed, treating as miss', {
|
|
12
|
-
key: cacheKey.substring(0, 100),
|
|
13
|
-
});
|
|
14
|
-
return undefined;
|
|
15
|
-
}
|
|
16
|
-
}
|
|
17
6
|
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
18
7
|
if (!cacheKey)
|
|
19
8
|
return null;
|
|
20
9
|
const cached = cache.get(cacheKey);
|
|
21
10
|
if (!cached)
|
|
22
11
|
return null;
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
12
|
+
if (!deserialize) {
|
|
13
|
+
logDebug('Cache miss due to missing deserializer', {
|
|
14
|
+
namespace: cacheNamespace,
|
|
15
|
+
url: normalizedUrl,
|
|
16
|
+
});
|
|
17
|
+
return null;
|
|
18
|
+
}
|
|
19
|
+
const data = deserialize(cached.content);
|
|
27
20
|
if (data === undefined) {
|
|
28
21
|
logDebug('Cache miss due to deserialize failure', {
|
|
29
22
|
namespace: cacheNamespace,
|
|
@@ -31,6 +24,7 @@ function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalized
|
|
|
31
24
|
});
|
|
32
25
|
return null;
|
|
33
26
|
}
|
|
27
|
+
logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
|
|
34
28
|
return {
|
|
35
29
|
data,
|
|
36
30
|
fromCache: true,
|
|
@@ -48,16 +42,19 @@ function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalized
|
|
|
48
42
|
* @returns Promise resolving to the pipeline result
|
|
49
43
|
*/
|
|
50
44
|
export async function executeFetchPipeline(options) {
|
|
51
|
-
const normalizedUrl =
|
|
45
|
+
const { normalizedUrl, hostname } = normalizeUrl(options.url);
|
|
52
46
|
const cacheKey = resolveCacheKey(options, normalizedUrl);
|
|
53
47
|
const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, normalizedUrl);
|
|
54
48
|
if (cachedResult)
|
|
55
49
|
return cachedResult;
|
|
50
|
+
await assertResolvedAddressesAllowed(hostname);
|
|
56
51
|
const fetchOptions = buildFetchOptions(options);
|
|
57
52
|
logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
|
|
58
|
-
const html = await
|
|
53
|
+
const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
|
|
59
54
|
const data = options.transform(html, normalizedUrl);
|
|
60
|
-
|
|
55
|
+
if (cache.isEnabled()) {
|
|
56
|
+
persistCache(cacheKey, data, options.serialize, normalizedUrl);
|
|
57
|
+
}
|
|
61
58
|
return buildPipelineResult(normalizedUrl, data, cacheKey);
|
|
62
59
|
}
|
|
63
60
|
function resolveCacheKey(options, normalizedUrl) {
|
|
@@ -97,4 +94,3 @@ function buildPipelineResult(url, data, cacheKey) {
|
|
|
97
94
|
cacheKey,
|
|
98
95
|
};
|
|
99
96
|
}
|
|
100
|
-
//# sourceMappingURL=fetch-pipeline.js.map
|
|
@@ -1,3 +1,2 @@
|
|
|
1
|
-
import type { ContentBlockUnion, MetadataBlock } from '../config/types.js';
|
|
1
|
+
import type { ContentBlockUnion, MetadataBlock } from '../config/types/content.js';
|
|
2
2
|
export declare function toJsonl(blocks: readonly ContentBlockUnion[], metadata?: MetadataBlock): string;
|
|
3
|
-
//# sourceMappingURL=jsonl.transformer.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jsonl.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,
|
|
1
|
+
{"version":3,"file":"jsonl.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,iBAAiB,EACjB,aAAa,EACd,MAAM,4BAA4B,CAAC;AAkEpC,wBAAgB,OAAO,CACrB,MAAM,EAAE,SAAS,iBAAiB,EAAE,EACpC,QAAQ,CAAC,EAAE,aAAa,GACvB,MAAM,CAGR"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jsonl.transformer.js","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,yBAAyB,CAAC;AACpD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"jsonl.transformer.js","sourceRoot":"","sources":["../../src/transformers/jsonl.transformer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,yBAAyB,CAAC;AACpD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAM5C,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AAErD,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAAC;IAC/B,WAAW;IACX,SAAS;IACT,MAAM;IACN,YAAY;CACb,CAAC,CAAC;AAEH,SAAS,WAAW,CAClB,KAAwB;IAExB,OAAO,MAAM,IAAI,KAAK,CAAC;AACzB,CAAC;AAED,SAAS,WAAW,CAClB,KAAwB;IAExB,OAAO,KAAK,CAAC,IAAI,KAAK,MAAM,CAAC;AAC/B,CAAC;AAED,SAAS,iBAAiB,CACxB,KAAmD,EACnD,SAAiB;IAEjB,MAAM,SAAS,GAAG,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IACtD,OAAO,SAAS,KAAK,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;AAC1E,CAAC;AAED,SAAS,iBAAiB,CACxB,KAAsD,EACtD,SAAiB;IAEjB,MAAM,cAAc,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC9C,YAAY,CAAC,IAAI,EAAE,SAAS,CAAC,CAC9B,CAAC;IACF,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,CACpC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAC7C,CAAC;IACF,OAAO,UAAU,CAAC,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,KAAK,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;AAClE,CAAC;AAED,SAAS,aAAa,CAAC,KAAwB;IAC7C,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,CAAC,cAAc,CAAC;IAEnD,IAAI,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3D,OAAO,iBAAiB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAC7C,CAAC;IAED,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,iBAAiB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAC7C,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,cAAc,CAAC,KAAwB;IAC9C,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,CAAC;IAC9C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,UAAU,OAAO,CACrB,MAAoC,EACpC,QAAwB;IAExB,MAAM,KAAK,GAAG,iBAAiB,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;IAClD,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,iBAAiB,CACxB,MAAoC,EACpC,QAAwB;IAExB,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAC3C,IAAI,MAAM,EAAE,CAAC;QACX,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACrB,CA
AC;IAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,UAAU,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC;QACzC,IAAI,UAAU,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAwB;IACjD,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAC3B,IAAI,CAAC;QACH,MAAM,eAAe,GAAG;YACtB,IAAI,EAAE,QAAQ,CAAC,IAAI;YACnB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,GAAG,EAAE,QAAQ,CAAC,GAAG;SAClB,CAAC;QACF,OAAO,IAAI,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;IACzC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markdown.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/markdown.transformer.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"markdown.transformer.d.ts","sourceRoot":"","sources":["../../src/transformers/markdown.transformer.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AA+HhE,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,aAAa,GAAG,MAAM,CAa7E"}
|