@j0hanz/superfetch 2.2.2 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +358 -363
- package/dist/assets/logo.svg +24835 -0
- package/dist/cache.d.ts +0 -1
- package/dist/cache.js +71 -29
- package/dist/config.d.ts +2 -1
- package/dist/config.js +11 -7
- package/dist/crypto.d.ts +0 -1
- package/dist/crypto.js +0 -1
- package/dist/dom-noise-removal.d.ts +0 -1
- package/dist/dom-noise-removal.js +50 -45
- package/dist/errors.d.ts +0 -1
- package/dist/errors.js +0 -1
- package/dist/fetch.d.ts +0 -1
- package/dist/fetch.js +61 -54
- package/dist/host-normalization.d.ts +1 -0
- package/dist/host-normalization.js +47 -0
- package/dist/http-native.d.ts +0 -1
- package/dist/http-native.js +92 -28
- package/dist/index.d.ts +0 -1
- package/dist/index.js +0 -1
- package/dist/instructions.md +41 -41
- package/dist/json.d.ts +0 -1
- package/dist/json.js +0 -1
- package/dist/language-detection.d.ts +0 -1
- package/dist/language-detection.js +10 -2
- package/dist/markdown-cleanup.d.ts +6 -13
- package/dist/markdown-cleanup.js +252 -34
- package/dist/mcp-validator.d.ts +14 -0
- package/dist/mcp-validator.js +22 -0
- package/dist/mcp.d.ts +0 -1
- package/dist/mcp.js +20 -10
- package/dist/observability.d.ts +2 -1
- package/dist/observability.js +30 -3
- package/dist/server-tuning.d.ts +9 -0
- package/dist/server-tuning.js +30 -0
- package/dist/{http-utils.d.ts → session.d.ts} +0 -25
- package/dist/{http-utils.js → session.js} +11 -104
- package/dist/tools.d.ts +5 -4
- package/dist/tools.js +46 -41
- package/dist/transform-types.d.ts +38 -1
- package/dist/transform-types.js +0 -1
- package/dist/transform.d.ts +12 -7
- package/dist/transform.js +205 -344
- package/dist/type-guards.d.ts +0 -1
- package/dist/type-guards.js +0 -1
- package/dist/workers/transform-worker.d.ts +0 -1
- package/dist/workers/transform-worker.js +29 -19
- package/package.json +84 -85
- package/dist/cache.d.ts.map +0 -1
- package/dist/cache.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/crypto.d.ts.map +0 -1
- package/dist/crypto.js.map +0 -1
- package/dist/dom-noise-removal.d.ts.map +0 -1
- package/dist/dom-noise-removal.js.map +0 -1
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js.map +0 -1
- package/dist/fetch.d.ts.map +0 -1
- package/dist/fetch.js.map +0 -1
- package/dist/http-native.d.ts.map +0 -1
- package/dist/http-native.js.map +0 -1
- package/dist/http-utils.d.ts.map +0 -1
- package/dist/http-utils.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/json.d.ts.map +0 -1
- package/dist/json.js.map +0 -1
- package/dist/language-detection.d.ts.map +0 -1
- package/dist/language-detection.js.map +0 -1
- package/dist/markdown-cleanup.d.ts.map +0 -1
- package/dist/markdown-cleanup.js.map +0 -1
- package/dist/mcp.d.ts.map +0 -1
- package/dist/mcp.js.map +0 -1
- package/dist/observability.d.ts.map +0 -1
- package/dist/observability.js.map +0 -1
- package/dist/tools.d.ts.map +0 -1
- package/dist/tools.js.map +0 -1
- package/dist/transform-types.d.ts.map +0 -1
- package/dist/transform-types.js.map +0 -1
- package/dist/transform.d.ts.map +0 -1
- package/dist/transform.js.map +0 -1
- package/dist/type-guards.d.ts.map +0 -1
- package/dist/type-guards.js.map +0 -1
- package/dist/workers/transform-worker.d.ts.map +0 -1
- package/dist/workers/transform-worker.js.map +0 -1
package/dist/http-native.js
CHANGED
|
@@ -5,13 +5,51 @@ import { URL, URLSearchParams } from 'node:url';
|
|
|
5
5
|
import { InvalidTokenError, ServerError, } from '@modelcontextprotocol/sdk/server/auth/errors.js';
|
|
6
6
|
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
|
|
7
7
|
import { isInitializeRequest } from '@modelcontextprotocol/sdk/types.js';
|
|
8
|
-
import { handleDownload } from './cache.js';
|
|
9
|
-
import { config, enableHttpMode } from './config.js';
|
|
8
|
+
import { keys as cacheKeys, handleDownload } from './cache.js';
|
|
9
|
+
import { config, enableHttpMode, serverVersion } from './config.js';
|
|
10
10
|
import { timingSafeEqualUtf8 } from './crypto.js';
|
|
11
|
-
import {
|
|
11
|
+
import { normalizeHost } from './host-normalization.js';
|
|
12
|
+
import { acceptsEventStream, isJsonRpcBatchRequest, isMcpRequestBody, } from './mcp-validator.js';
|
|
12
13
|
import { createMcpServer } from './mcp.js';
|
|
13
14
|
import { logError, logInfo, logWarn } from './observability.js';
|
|
15
|
+
import { applyHttpServerTuning, drainConnectionsOnShutdown, } from './server-tuning.js';
|
|
16
|
+
import { composeCloseHandlers, createSessionStore, createSlotTracker, ensureSessionCapacity, reserveSessionSlot, startSessionCleanupLoop, } from './session.js';
|
|
17
|
+
import { getTransformPoolStats } from './transform.js';
|
|
14
18
|
import { isObject } from './type-guards.js';
|
|
19
|
+
/**
 * Wraps a transport implementation in a plain object exposing `start`,
 * `send`, `close` and the `onclose` / `onerror` / `onmessage` handler slots.
 *
 * Assigning a handler stores it on the adapter and forwards it to the wrapped
 * transport. Reading a handler returns the value last assigned through the
 * adapter, or a no-op function if none was assigned yet.
 *
 * @param transportImpl Transport that every call is delegated to.
 * @returns The adapter object.
 */
function createTransportAdapter(transportImpl) {
    // Handlers most recently assigned through the adapter; each starts as its own no-op.
    const handlers = {
        close: () => { },
        error: () => { },
        message: () => { },
    };
    return {
        start() {
            return transportImpl.start();
        },
        send(message, options) {
            return transportImpl.send(message, options);
        },
        close() {
            return transportImpl.close();
        },
        get onclose() {
            return handlers.close;
        },
        set onclose(handler) {
            handlers.close = handler;
            transportImpl.onclose = handler;
        },
        get onerror() {
            return handlers.error;
        },
        set onerror(handler) {
            handlers.error = handler;
            transportImpl.onerror = handler;
        },
        get onmessage() {
            return handlers.message;
        },
        set onmessage(handler) {
            handlers.message = handler;
            transportImpl.onmessage = handler;
        },
    };
}
|
|
15
53
|
function shimResponse(res) {
|
|
16
54
|
const shim = res;
|
|
17
55
|
shim.status = function (code) {
|
|
@@ -144,26 +182,26 @@ function resolveOriginHost(origin) {
|
|
|
144
182
|
return null;
|
|
145
183
|
}
|
|
146
184
|
}
|
|
185
|
+
/**
 * Sends a JSON error response and reports the request as rejected.
 *
 * @param res Response object exposing chainable `status()` and `json()`.
 * @param status HTTP status code to send.
 * @param message Text placed in the `error` field of the JSON body.
 * @returns Always `false`, so callers can `return rejectHostRequest(...)`.
 */
function rejectHostRequest(res, status, message) {
    const body = { error: message };
    res.status(status).json(body);
    return false;
}
|
|
147
189
|
function validateHostAndOrigin(req, res) {
|
|
148
190
|
const host = resolveHostHeader(req);
|
|
149
191
|
if (!host) {
|
|
150
|
-
res
|
|
151
|
-
return false;
|
|
192
|
+
return rejectHostRequest(res, 400, 'Missing or invalid Host header');
|
|
152
193
|
}
|
|
153
194
|
if (!ALLOWED_HOSTS.has(host)) {
|
|
154
|
-
res
|
|
155
|
-
return false;
|
|
195
|
+
return rejectHostRequest(res, 403, 'Host not allowed');
|
|
156
196
|
}
|
|
157
197
|
const originHeader = getHeaderValue(req, 'origin');
|
|
158
198
|
if (originHeader) {
|
|
159
199
|
const originHost = resolveOriginHost(originHeader);
|
|
160
200
|
if (!originHost) {
|
|
161
|
-
res
|
|
162
|
-
return false;
|
|
201
|
+
return rejectHostRequest(res, 403, 'Invalid Origin header');
|
|
163
202
|
}
|
|
164
203
|
if (!ALLOWED_HOSTS.has(originHost)) {
|
|
165
|
-
res
|
|
166
|
-
return false;
|
|
204
|
+
return rejectHostRequest(res, 403, 'Origin not allowed');
|
|
167
205
|
}
|
|
168
206
|
}
|
|
169
207
|
return true;
|
|
@@ -318,24 +356,35 @@ async function verifyWithIntrospection(token) {
|
|
|
318
356
|
throw new InvalidTokenError('Token is inactive');
|
|
319
357
|
return buildIntrospectionAuthInfo(token, payload);
|
|
320
358
|
}
|
|
359
|
+
/**
 * Extracts the bearer token from an `Authorization` header value.
 *
 * The scheme name is matched case-insensitively and may be separated from the
 * token by one or more spaces, as HTTP allows. A header with a different
 * scheme, no token, or extra space-separated parts after the token is
 * rejected rather than partially accepted.
 *
 * @param authHeader Raw `Authorization` header value.
 * @returns The token that follows the `Bearer` scheme.
 * @throws InvalidTokenError when the header is not of the form `Bearer <token>`.
 */
function resolveBearerToken(authHeader) {
    // Collapse runs of spaces so "Bearer   abc" parses the same as "Bearer abc".
    const parts = authHeader.trim().split(/ +/);
    const [scheme, token] = parts;
    const isBearer = parts.length === 2 && scheme.toLowerCase() === 'bearer';
    if (!isBearer || !token) {
        throw new InvalidTokenError('Invalid Authorization header format');
    }
    return token;
}
|
|
366
|
+
/**
 * Verifies a bearer token using the configured auth mode.
 *
 * In `oauth` mode the token goes to `verifyWithIntrospection`; in any other
 * mode it is checked with `verifyStaticToken` and the result is wrapped in a
 * resolved promise.
 *
 * @param token Bearer token taken from the request.
 * @returns A promise for the verification result.
 */
function authenticateWithToken(token) {
    if (config.auth.mode === 'oauth') {
        return verifyWithIntrospection(token);
    }
    return Promise.resolve(verifyStaticToken(token));
}
|
|
371
|
+
/**
 * Authenticates a request that carries no `Authorization` header by falling
 * back to the `x-api-key` header.
 *
 * @param req Incoming request whose headers are read via `getHeaderValue`.
 * @returns The result of `verifyStaticToken` when an API key is present and
 *   the auth mode is `static`.
 * @throws InvalidTokenError when no API key is present, when the auth mode is
 *   `oauth` (API keys are not accepted there), or for any other mode.
 */
function authenticateWithApiKey(req) {
    const apiKey = getHeaderValue(req, 'x-api-key');
    if (!apiKey) {
        throw new InvalidTokenError('Missing Authorization header');
    }
    switch (config.auth.mode) {
        case 'static':
            return verifyStaticToken(apiKey);
        case 'oauth':
            throw new InvalidTokenError('X-API-Key not supported for OAuth');
        default:
            throw new InvalidTokenError('Missing Authorization header');
    }
}
|
|
321
381
|
async function authenticate(req) {
|
|
322
382
|
const authHeader = req.headers.authorization;
|
|
323
383
|
if (!authHeader) {
|
|
324
|
-
|
|
325
|
-
if (apiKey && config.auth.mode === 'static') {
|
|
326
|
-
return verifyStaticToken(apiKey);
|
|
327
|
-
}
|
|
328
|
-
if (apiKey && config.auth.mode === 'oauth') {
|
|
329
|
-
throw new InvalidTokenError('X-API-Key not supported for OAuth');
|
|
330
|
-
}
|
|
331
|
-
throw new InvalidTokenError('Missing Authorization header');
|
|
384
|
+
return authenticateWithApiKey(req);
|
|
332
385
|
}
|
|
333
|
-
const
|
|
334
|
-
|
|
335
|
-
throw new InvalidTokenError('Invalid Authorization header format');
|
|
336
|
-
if (config.auth.mode === 'oauth')
|
|
337
|
-
return verifyWithIntrospection(token);
|
|
338
|
-
return verifyStaticToken(token);
|
|
386
|
+
const token = resolveBearerToken(authHeader);
|
|
387
|
+
return authenticateWithToken(token);
|
|
339
388
|
}
|
|
340
389
|
// --- MCP Routes ---
|
|
341
390
|
function sendError(res, code, message, status = 400, id = null) {
|
|
@@ -394,7 +443,8 @@ async function createNewSession(store, mcpServer, res, requestId) {
|
|
|
394
443
|
tracker.releaseSlot();
|
|
395
444
|
};
|
|
396
445
|
try {
|
|
397
|
-
|
|
446
|
+
const transport = createTransportAdapter(transportImpl);
|
|
447
|
+
await mcpServer.connect(transport);
|
|
398
448
|
}
|
|
399
449
|
catch (err) {
|
|
400
450
|
clearTimeout(initTimeout);
|
|
@@ -531,7 +581,22 @@ async function dispatchRequest(req, res, url, ctx) {
|
|
|
531
581
|
const { method } = req;
|
|
532
582
|
try {
|
|
533
583
|
if (method === 'GET' && path === '/health') {
|
|
534
|
-
|
|
584
|
+
const poolStats = getTransformPoolStats();
|
|
585
|
+
res.status(200).json({
|
|
586
|
+
status: 'ok',
|
|
587
|
+
version: serverVersion,
|
|
588
|
+
uptime: Math.floor(process.uptime()),
|
|
589
|
+
timestamp: new Date().toISOString(),
|
|
590
|
+
stats: {
|
|
591
|
+
activeSessions: ctx.store.size(),
|
|
592
|
+
cacheKeys: cacheKeys().length,
|
|
593
|
+
workerPool: poolStats ?? {
|
|
594
|
+
queueDepth: 0,
|
|
595
|
+
activeWorkers: 0,
|
|
596
|
+
capacity: 0,
|
|
597
|
+
},
|
|
598
|
+
},
|
|
599
|
+
});
|
|
535
600
|
return;
|
|
536
601
|
}
|
|
537
602
|
if (!(await authenticateRequest(req, res))) {
|
|
@@ -642,4 +707,3 @@ async function handleRequest(rawReq, rawRes, rateLimiter, ctx) {
|
|
|
642
707
|
// 5. Routing
|
|
643
708
|
await dispatchRequest(req, res, url, ctx);
|
|
644
709
|
}
|
|
645
|
-
//# sourceMappingURL=http-native.js.map
|
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
package/dist/instructions.md
CHANGED
|
@@ -1,41 +1,41 @@
|
|
|
1
|
-
# superFetch Instructions
|
|
2
|
-
|
|
3
|
-
> Guidance for the Agent: These instructions are available as a resource (`internal://instructions`) or prompt (`get-help`). Load them when you are unsure about tool usage.
|
|
4
|
-
|
|
5
|
-
## 1. Core Capability
|
|
6
|
-
|
|
7
|
-
- **Domain:** Fetch public http(s) URLs, extract readable content, and return clean Markdown.
|
|
8
|
-
- **Primary Resources:** `fetch-url` output (`markdown`, `title`, `url`) and cache resources (`superfetch://cache/markdown/{urlHash}`).
|
|
9
|
-
|
|
10
|
-
## 2. The "Golden Path" Workflows (Critical)
|
|
11
|
-
|
|
12
|
-
_Describe the standard order of operations using ONLY tools that exist._
|
|
13
|
-
|
|
14
|
-
### Workflow A: Fetch and Read
|
|
15
|
-
|
|
16
|
-
1. Call `fetch-url` with `url`.
|
|
17
|
-
2. Read `structuredContent.markdown` and `structuredContent.title` from the result.
|
|
18
|
-
3. If content is truncated (look for `...[truncated]`), follow the returned `resource_link` URI.
|
|
19
|
-
> Constraint: Never guess resource URIs. Use the returned `resource_link` or list resources first.
|
|
20
|
-
|
|
21
|
-
### Workflow B: Retrieve Cached Content
|
|
22
|
-
|
|
23
|
-
1. List resources to find available cached pages (`superfetch://cache/...`).
|
|
24
|
-
2. Read the specific `superfetch://cache/markdown/{urlHash}` URI.
|
|
25
|
-
|
|
26
|
-
## 3. Tool Nuances & Gotchas
|
|
27
|
-
|
|
28
|
-
_Do NOT repeat JSON schema. Focus on behavior and pitfalls._
|
|
29
|
-
|
|
30
|
-
- **`fetch-url`**
|
|
31
|
-
- **Purpose:** Fetches a webpage and converts it to clean Markdown format.
|
|
32
|
-
- **Inputs:** `url` (Must be public http/https. Private patterns like localhost/127.0.0.1 are blocked).
|
|
33
|
-
- **Side effects:** Open world network request; writes to internal LRU cache.
|
|
34
|
-
- **Latency/limits:** Network-bound. Large content exceeds inline limits and returns a `resource_link`.
|
|
35
|
-
- **Common failure modes:** `VALIDATION_ERROR` (private/blocked URL), `FETCH_ERROR` (network timeout/404).
|
|
36
|
-
|
|
37
|
-
## 4. Error Handling Strategy
|
|
38
|
-
|
|
39
|
-
- **`VALIDATION_ERROR`**: Ensure the URL is valid and publicly accessible.
|
|
40
|
-
- **`FETCH_ERROR`**: Retry once. If persistent, the site may be blocking automated requests.
|
|
41
|
-
- **Truncation**: If `isError` is false but content ends in `...[truncated]`, you MUST read the provided `resource_link` URI to get the full markdown.
|
|
1
|
+
# superFetch Instructions
|
|
2
|
+
|
|
3
|
+
> Guidance for the Agent: These instructions are available as a resource (`internal://instructions`) or prompt (`get-help`). Load them when you are unsure about tool usage.
|
|
4
|
+
|
|
5
|
+
## 1. Core Capability
|
|
6
|
+
|
|
7
|
+
- **Domain:** Fetch public http(s) URLs, extract readable content, and return clean Markdown.
|
|
8
|
+
- **Primary Resources:** `fetch-url` output (`markdown`, `title`, `url`) and cache resources (`superfetch://cache/markdown/{urlHash}`).
|
|
9
|
+
|
|
10
|
+
## 2. The "Golden Path" Workflows (Critical)
|
|
11
|
+
|
|
12
|
+
_Describe the standard order of operations using ONLY tools that exist._
|
|
13
|
+
|
|
14
|
+
### Workflow A: Fetch and Read
|
|
15
|
+
|
|
16
|
+
1. Call `fetch-url` with `url`.
|
|
17
|
+
2. Read `structuredContent.markdown` and `structuredContent.title` from the result.
|
|
18
|
+
3. If content is truncated (look for `...[truncated]`), follow the returned `resource_link` URI.
|
|
19
|
+
> Constraint: Never guess resource URIs. Use the returned `resource_link` or list resources first.
|
|
20
|
+
|
|
21
|
+
### Workflow B: Retrieve Cached Content
|
|
22
|
+
|
|
23
|
+
1. List resources to find available cached pages (`superfetch://cache/...`).
|
|
24
|
+
2. Read the specific `superfetch://cache/markdown/{urlHash}` URI.
|
|
25
|
+
|
|
26
|
+
## 3. Tool Nuances & Gotchas
|
|
27
|
+
|
|
28
|
+
_Do NOT repeat JSON schema. Focus on behavior and pitfalls._
|
|
29
|
+
|
|
30
|
+
- **`fetch-url`**
|
|
31
|
+
- **Purpose:** Fetches a webpage and converts it to clean Markdown format.
|
|
32
|
+
- **Inputs:** `url` (Must be public http/https. Private patterns like localhost/127.0.0.1 are blocked).
|
|
33
|
+
- **Side effects:** Open world network request; writes to internal LRU cache.
|
|
34
|
+
- **Latency/limits:** Network-bound. Large content exceeds inline limits and returns a `resource_link`.
|
|
35
|
+
- **Common failure modes:** `VALIDATION_ERROR` (private/blocked URL), `FETCH_ERROR` (network timeout/404).
|
|
36
|
+
|
|
37
|
+
## 4. Error Handling Strategy
|
|
38
|
+
|
|
39
|
+
- **`VALIDATION_ERROR`**: Ensure the URL is valid and publicly accessible.
|
|
40
|
+
- **`FETCH_ERROR`**: Retry once. If persistent, the site may be blocking automated requests.
|
|
41
|
+
- **Truncation**: If `isError` is false but content ends in `...[truncated]`, you MUST read the provided `resource_link` URI to get the full markdown.
|
package/dist/json.d.ts
CHANGED
package/dist/json.js
CHANGED
|
@@ -10,4 +10,3 @@ export declare function detectLanguageFromCode(code: string): string | undefined
|
|
|
10
10
|
* Resolve language from HTML attributes (class name and data-language).
|
|
11
11
|
*/
|
|
12
12
|
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
13
|
-
//# sourceMappingURL=language-detection.d.ts.map
|
|
@@ -6,7 +6,16 @@
|
|
|
6
6
|
* Check if source contains the given word as a standalone word (not part of another word).
|
|
7
7
|
*/
|
|
8
8
|
function containsWord(source, word) {
|
|
9
|
-
return
|
|
9
|
+
return getWordRegex(word).test(source);
|
|
10
|
+
}
|
|
11
|
+
/** Compiled whole-word matchers, keyed by the word they match. */
const WORD_REGEX_CACHE = new Map();
/**
 * Returns a regular expression that matches `word` between word boundaries,
 * compiling it on first use and reusing the cached instance afterwards.
 *
 * The word is matched literally: regex metacharacters in it are escaped, so a
 * word such as `a.c` does not match `abc` and a word such as `c++` does not
 * produce an invalid pattern.
 *
 * @param word Word to look for.
 * @returns A non-global RegExp of the form `\b<word>\b`.
 */
function getWordRegex(word) {
    const cached = WORD_REGEX_CACHE.get(word);
    if (cached)
        return cached;
    // Escape every character that has special meaning inside a pattern.
    const escaped = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const compiled = new RegExp(`\\b${escaped}\\b`);
    WORD_REGEX_CACHE.set(word, compiled);
    return compiled;
}
|
|
11
20
|
/**
|
|
12
21
|
* Extract language from class name (e.g., "language-typescript", "lang-js", "hljs javascript").
|
|
@@ -280,4 +289,3 @@ export function resolveLanguageFromAttributes(className, dataLang) {
|
|
|
280
289
|
const classMatch = extractLanguageFromClassName(className);
|
|
281
290
|
return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
|
|
282
291
|
}
|
|
283
|
-
//# sourceMappingURL=language-detection.js.map
|
|
@@ -1,19 +1,12 @@
|
|
|
1
|
-
|
|
2
|
-
* Markdown cleanup utilities for post-processing converted content.
|
|
3
|
-
*
|
|
4
|
-
* Goals:
|
|
5
|
-
* - Never mutate fenced code blocks (``` / ~~~) content.
|
|
6
|
-
* - Keep rules localized and readable.
|
|
7
|
-
* - Avoid multi-pass regexes that accidentally hit code blocks.
|
|
8
|
-
*/
|
|
9
|
-
/**
|
|
10
|
-
* Clean up common markdown artifacts and formatting issues.
|
|
11
|
-
* IMPORTANT: All rules are applied ONLY outside fenced code blocks.
|
|
12
|
-
*/
|
|
1
|
+
import type { MetadataBlock } from './transform-types.js';
|
|
13
2
|
export declare function cleanupMarkdownArtifacts(content: string): string;
|
|
3
|
+
export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
|
|
4
|
+
export declare function addSourceToMarkdown(content: string, url: string): string;
|
|
5
|
+
export declare function isRawTextContent(content: string): boolean;
|
|
6
|
+
export declare function isLikelyHtmlContent(content: string): boolean;
|
|
7
|
+
export declare function buildMetadataFooter(metadata?: MetadataBlock, fallbackUrl?: string): string;
|
|
14
8
|
/**
|
|
15
9
|
* Promote standalone lines that look like headings to proper markdown headings.
|
|
16
10
|
* Fence-aware: never modifies content inside fenced code blocks.
|
|
17
11
|
*/
|
|
18
12
|
export declare function promoteOrphanHeadings(markdown: string): string;
|
|
19
|
-
//# sourceMappingURL=markdown-cleanup.d.ts.map
|
package/dist/markdown-cleanup.js
CHANGED
|
@@ -1,11 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
* Markdown cleanup utilities for post-processing converted content.
|
|
3
|
-
*
|
|
4
|
-
* Goals:
|
|
5
|
-
* - Never mutate fenced code blocks (``` / ~~~) content.
|
|
6
|
-
* - Keep rules localized and readable.
|
|
7
|
-
* - Avoid multi-pass regexes that accidentally hit code blocks.
|
|
8
|
-
*/
|
|
1
|
+
import { config } from './config.js';
|
|
9
2
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
10
3
|
// Fence state helpers
|
|
11
4
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -76,15 +69,6 @@ function splitByFences(content) {
|
|
|
76
69
|
}
|
|
77
70
|
return segments;
|
|
78
71
|
}
|
|
79
|
-
/**
|
|
80
|
-
* Apply a transformation function only to non-fenced content.
|
|
81
|
-
*/
|
|
82
|
-
function mapOutsideFences(content, transform) {
|
|
83
|
-
const segments = splitByFences(content);
|
|
84
|
-
return segments
|
|
85
|
-
.map((seg) => (seg.inFence ? seg.content : transform(seg.content)))
|
|
86
|
-
.join('\n');
|
|
87
|
-
}
|
|
88
72
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
89
73
|
// Cleanup rules (OUTSIDE fences only)
|
|
90
74
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -165,28 +149,263 @@ function normalizeListsAndSpacing(text) {
|
|
|
165
149
|
// Collapse excessive blank lines
|
|
166
150
|
return text.replace(/\n{3,}/g, '\n\n');
|
|
167
151
|
}
|
|
152
|
+
const CLEANUP_STEPS = [
|
|
153
|
+
fixOrphanHeadings,
|
|
154
|
+
removeEmptyHeadings,
|
|
155
|
+
removeSkipLinksAndEmptyAnchors,
|
|
156
|
+
ensureBlankLineAfterHeadings,
|
|
157
|
+
removeTocBlocks,
|
|
158
|
+
tidyLinksAndEscapes,
|
|
159
|
+
normalizeListsAndSpacing,
|
|
160
|
+
];
|
|
168
161
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
169
162
|
// Public API
|
|
170
163
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
164
|
+
/**
 * Returns the text after the final `\n` in `text`, or all of `text` when it
 * contains no newline.
 */
function getLastLine(text) {
    // lastIndexOf yields -1 when there is no newline, so the slice starts at 0.
    return text.slice(text.lastIndexOf('\n') + 1);
}
|
|
175
168
|
export function cleanupMarkdownArtifacts(content) {
|
|
176
169
|
if (!content)
|
|
177
170
|
return '';
|
|
178
|
-
const
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
171
|
+
const segments = splitByFences(content);
|
|
172
|
+
return segments
|
|
173
|
+
.map((seg, index) => {
|
|
174
|
+
if (seg.inFence)
|
|
175
|
+
return seg.content;
|
|
176
|
+
const prevSeg = segments[index - 1];
|
|
177
|
+
const prevLineContext = prevSeg ? getLastLine(prevSeg.content) : '';
|
|
178
|
+
const lines = seg.content.split('\n');
|
|
179
|
+
const promotedLines = [];
|
|
180
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
181
|
+
const line = lines[i] ?? '';
|
|
182
|
+
const prevLine = i > 0 ? (lines[i - 1] ?? '') : prevLineContext;
|
|
183
|
+
promotedLines.push(processNonFencedLine(line, prevLine));
|
|
184
|
+
}
|
|
185
|
+
const promoted = promotedLines.join('\n');
|
|
186
|
+
return CLEANUP_STEPS.reduce((text, step) => step(text), promoted);
|
|
187
|
+
})
|
|
188
|
+
.join('\n')
|
|
189
|
+
.trim();
|
|
190
|
+
}
|
|
191
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
192
|
+
// Raw markdown handling + metadata footer
|
|
193
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
194
|
+
/** A line starting with one to six `#` characters followed by whitespace. */
const HEADING_PATTERN = /^#{1,6}\s/m;
/** A line starting with `-`, `*` or `+` followed by whitespace. */
const LIST_PATTERN = /^(?:[-*+])\s/m;
/** Text starting with `<!doctype` or `<html`, in any letter case. */
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
/** Reports whether any line of `content` matches HEADING_PATTERN. */
function containsMarkdownHeading(content) {
    return HEADING_PATTERN.exec(content) !== null;
}
/** Reports whether any line of `content` matches LIST_PATTERN. */
function containsMarkdownList(content) {
    return LIST_PATTERN.exec(content) !== null;
}
|
|
203
|
+
/**
 * Reports whether `content` contains two non-overlapping triple-backtick
 * markers, i.e. something that looks like an opened and closed code fence.
 */
function containsFencedCodeBlock(content) {
    const fence = '```';
    const opening = content.indexOf(fence);
    // The second marker must begin after the first one ends.
    return opening !== -1 && content.indexOf(fence, opening + fence.length) !== -1;
}
|
|
209
|
+
/**
 * Reports whether `content` contains a Markdown heading, a Markdown list item
 * or a fenced code block, checked in that order.
 */
function looksLikeMarkdown(content) {
    const detectors = [
        containsMarkdownHeading,
        containsMarkdownList,
        containsFencedCodeBlock,
    ];
    return detectors.some((detect) => detect(content));
}
|
|
214
|
+
/**
 * Picks the line terminator for `content`: `\r\n` if that sequence occurs
 * anywhere in it, otherwise `\n`.
 */
function detectLineEnding(content) {
    return content.indexOf('\r\n') === -1 ? '\n' : '\r\n';
}
|
|
217
|
+
const FRONTMATTER_DELIMITER = '---';
|
|
218
|
+
/**
 * Locates a front matter block at the very start of `content`.
 *
 * The content is split on its detected line ending. A block is recognised
 * only when the first line is exactly the delimiter and a later line is
 * exactly the delimiter as well.
 *
 * @returns `{ lineEnding, lines, endIndex }`, where `endIndex` is the index of
 *   the closing delimiter line, or `null` when no block is found.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    const [opening] = lines;
    const endIndex = opening === FRONTMATTER_DELIMITER
        ? lines.indexOf(FRONTMATTER_DELIMITER, 1)
        : -1;
    return endIndex === -1 ? null : { lineEnding, lines, endIndex };
}
|
|
228
|
+
/**
 * Trims `value` and, when the result is wrapped in a matching pair of double
 * or single quotes, removes the quotes and trims again.
 *
 * @param value Raw text, possibly quoted.
 * @returns The trimmed, unquoted text.
 */
function stripOptionalQuotes(value) {
    const text = value.trim();
    const quote = text[0];
    // A lone quote character is not a pair, hence the length check.
    const isQuoted = text.length >= 2 &&
        (quote === '"' || quote === "'") &&
        text.endsWith(quote);
    return isQuoted ? text.slice(1, -1).trim() : text;
}
|
|
239
|
+
/**
 * Splits one front matter line at its first colon.
 *
 * @param line A single line taken from a front matter block.
 * @returns `{ key, value }` with the key trimmed and lower-cased and the value
 *   left exactly as it follows the colon, or `null` when the trimmed line is
 *   empty, has no colon, or starts with one.
 */
function parseFrontmatterEntry(line) {
    const text = line.trim();
    const colonAt = text.indexOf(':');
    // Covers the empty line (-1), the missing colon (-1) and the empty key (0).
    if (colonAt <= 0) {
        return null;
    }
    return {
        key: text.slice(0, colonAt).trim().toLowerCase(),
        value: text.slice(colonAt + 1),
    };
}
|
|
250
|
+
/** Reports whether a front matter key is `title` or `name`. */
function isTitleKey(key) {
    return ['title', 'name'].includes(key);
}
|
|
253
|
+
/**
 * Reads a title from the first non-blank line of `content`.
 *
 * That line must start with one to six `#` characters followed by a space or
 * tab. Any other first line yields `undefined`; later lines are never
 * inspected.
 *
 * @returns The heading text with surrounding whitespace removed, or `undefined`.
 */
function extractTitleFromHeading(content) {
    const firstContentLine = content
        .split(detectLineEnding(content))
        .map((line) => line.trim())
        .find((line) => line.length > 0);
    if (firstContentLine === undefined) {
        return undefined;
    }
    const marker = /^#{1,6}[ \t]/.exec(firstContentLine);
    if (marker === null) {
        return undefined;
    }
    const heading = firstContentLine.slice(marker[0].length).trim();
    return heading.length > 0 ? heading : undefined;
}
|
|
274
|
+
/**
 * Derives a title from raw Markdown.
 *
 * Without a front matter block, the title comes from the leading heading.
 * With one, the first entry whose key is a title key supplies the value, with
 * optional quotes stripped; the heading is not consulted in that case.
 *
 * @param content Raw Markdown text.
 * @returns The title, or `undefined` when none is found or it is empty.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        return extractTitleFromHeading(content);
    }
    const entryLines = frontmatter.lines.slice(1, frontmatter.endIndex);
    for (const line of entryLines) {
        const entry = parseFrontmatterEntry(line);
        if (entry !== null && isTitleKey(entry.key)) {
            return stripOptionalQuotes(entry.value) || undefined;
        }
    }
    return undefined;
}
|
|
289
|
+
/**
 * Reports whether any of the first 50 lines of `content` begins, after
 * leading whitespace and ignoring letter case, with `source:`.
 */
function hasMarkdownSourceLine(content) {
    return content
        .split(detectLineEnding(content))
        .slice(0, 50)
        .some((line) => line.trimStart().toLowerCase().startsWith('source:'));
}
|
|
303
|
+
/**
 * Adds a plain `Source: <url>` line to Markdown that has none yet.
 *
 * When the first non-blank line is a heading, the source line goes directly
 * below it, padded by blank lines. Otherwise it is placed at the very top,
 * followed by a blank line. The content's own line ending is used throughout.
 *
 * @param content Markdown text.
 * @param url URL to record as the source.
 * @returns The content unchanged if it already has a source line, else the
 *   content with the line inserted.
 */
function addSourceToMarkdownMarkdownFormat(content, url) {
    if (hasMarkdownSourceLine(content)) {
        return content;
    }
    const sourceLine = `Source: ${url}`;
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
    const startsWithHeading = firstContentIndex !== -1 &&
        /^#{1,6}\s+/.test(lines[firstContentIndex].trim());
    if (!startsWithHeading) {
        return [sourceLine, '', content].join(lineEnding);
    }
    const updated = lines.slice();
    updated.splice(firstContentIndex + 1, 0, '', sourceLine, '');
    return updated.join(lineEnding);
}
|
|
325
|
+
/**
 * Records the source URL in a Markdown document.
 *
 * - No front matter and `config.transform.metadataFormat` is `'markdown'`:
 *   a plain `Source:` line is added via `addSourceToMarkdownMarkdownFormat`.
 * - No front matter otherwise: a new front matter block containing `source`
 *   is prepended.
 * - Existing front matter: a `source` entry is appended just before the
 *   closing delimiter, unless one is already present.
 *
 * Every line added uses the content's own line ending. A freshly prepended
 * block therefore stays parseable by `findFrontmatterLines` for CRLF documents
 * too, and calling the function again does not add a second block.
 *
 * @param content Markdown text.
 * @param url URL to record as the source.
 * @returns The Markdown with the source recorded.
 */
export function addSourceToMarkdown(content, url) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        if (config.transform.metadataFormat === 'markdown') {
            return addSourceToMarkdownMarkdownFormat(content, url);
        }
        // Join with the detected ending so CRLF content does not end up mixed.
        const eol = detectLineEnding(content);
        return ['---', `source: "${url}"`, '---', '', content].join(eol);
    }
    const { lineEnding, lines, endIndex } = frontmatter;
    const bodyLines = lines.slice(1, endIndex);
    const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        lines[0],
        ...bodyLines,
        `source: "${url}"`,
        // From the closing delimiter through the end of the document.
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}
|
|
346
|
+
/**
 * Reports whether already-trimmed text opens with a `---` line, terminated by
 * either `\n` or `\r\n`.
 */
function hasFrontmatter(trimmed) {
    return /^---\r?\n/.test(trimmed);
}
|
|
349
|
+
/** Reports whether already-trimmed text matches HTML_DOCUMENT_PATTERN. */
function looksLikeHtmlDocument(trimmed) {
    return HTML_DOCUMENT_PATTERN.exec(trimmed) !== null;
}
|
|
352
|
+
/**
 * Counts opening tags of a fixed set of HTML elements (html, head, body, div,
 * span, script, style, meta, link) in `content`, ignoring letter case.
 */
function countCommonHtmlTags(content) {
    const found = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi);
    return found === null ? 0 : found.length;
}
|
|
357
|
+
/**
 * Decides whether `content` should be treated as raw text or Markdown rather
 * than HTML.
 *
 * Anything that looks like an HTML document is never raw text. Otherwise the
 * content is raw text when it starts with front matter, or when it contains
 * at most two common HTML tags and looks like Markdown.
 */
export function isRawTextContent(content) {
    const trimmed = content.trim();
    if (looksLikeHtmlDocument(trimmed)) {
        return false;
    }
    if (hasFrontmatter(trimmed)) {
        return true;
    }
    return countCommonHtmlTags(content) <= 2 && looksLikeMarkdown(content);
}
|
|
366
|
+
/**
 * Reports whether `content` is probably HTML: it is not blank, and it either
 * looks like an HTML document or contains more than two common HTML tags.
 */
export function isLikelyHtmlContent(content) {
    const trimmed = content.trim();
    return trimmed.length > 0 &&
        (looksLikeHtmlDocument(trimmed) || countCommonHtmlTags(content) > 2);
}
|
|
374
|
+
/**
 * Formats a timestamp string as `DD-MM-YYYY` using the process's local time
 * zone.
 *
 * @param isoString Timestamp text accepted by the `Date` constructor.
 * @returns The formatted date, or `isoString` unchanged when it cannot be
 *   parsed as a date.
 */
function formatFetchedDate(isoString) {
    const date = new Date(isoString);
    // The Date constructor does not throw on unparseable text; it returns an
    // Invalid Date whose time value is NaN, so that is what must be checked.
    if (Number.isNaN(date.getTime())) {
        return isoString;
    }
    const day = String(date.getDate()).padStart(2, '0');
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const year = date.getFullYear();
    return `${day}-${month}-${year}`;
}
|
|
386
|
+
/**
 * Renders a Markdown footer from page metadata.
 *
 * The footer starts with a `---` rule and a blank line. It is followed by one
 * line joining the available title, author, source link and fetch date with
 * ` | `, and then by the description wrapped in `<sub>` when present.
 *
 * @param metadata Metadata block; when absent the footer is empty.
 * @param fallbackUrl URL used for the source link when `metadata.url` is falsy.
 * @returns The footer text, or `''` when `metadata` is not provided.
 */
export function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const sourceUrl = metadata.url || fallbackUrl;
    // One slot per field in display order; an empty slot means "field missing".
    const segments = [
        metadata.title ? `_${metadata.title}_` : '',
        metadata.author ? `_${metadata.author}_` : '',
        sourceUrl ? `[_Original Source_](${sourceUrl})` : '',
        metadata.fetchedAt ? `_${formatFetchedDate(metadata.fetchedAt)}_` : '',
    ].filter((segment) => segment !== '');
    const footer = ['---', ''];
    if (segments.length > 0) {
        footer.push(` ${segments.join(' | ')}`);
    }
    if (metadata.description) {
        footer.push(` <sub>${metadata.description}</sub>`);
    }
    return footer.join('\n');
}
|
|
191
410
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
192
411
|
// Heading Promotion (fence-aware)
|
|
@@ -280,4 +499,3 @@ export function promoteOrphanHeadings(markdown) {
|
|
|
280
499
|
}
|
|
281
500
|
return result.join('\n');
|
|
282
501
|
}
|
|
283
|
-
//# sourceMappingURL=markdown-cleanup.js.map
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export type JsonRpcId = string | number | null;
|
|
2
|
+
export interface McpRequestParams {
|
|
3
|
+
_meta?: Record<string, unknown>;
|
|
4
|
+
[key: string]: unknown;
|
|
5
|
+
}
|
|
6
|
+
export interface McpRequestBody {
|
|
7
|
+
jsonrpc: '2.0';
|
|
8
|
+
method: string;
|
|
9
|
+
id?: JsonRpcId;
|
|
10
|
+
params?: McpRequestParams;
|
|
11
|
+
}
|
|
12
|
+
export declare function isJsonRpcBatchRequest(body: unknown): boolean;
|
|
13
|
+
export declare function isMcpRequestBody(body: unknown): body is McpRequestBody;
|
|
14
|
+
export declare function acceptsEventStream(header: string | null | undefined): boolean;
|