@j0hanz/superfetch 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/README.md +116 -152
  2. package/dist/config/auth-config.d.ts +16 -0
  3. package/dist/config/auth-config.js +53 -0
  4. package/dist/config/constants.d.ts +11 -13
  5. package/dist/config/constants.js +1 -3
  6. package/dist/config/env-parsers.d.ts +7 -0
  7. package/dist/config/env-parsers.js +84 -0
  8. package/dist/config/formatting.d.ts +2 -2
  9. package/dist/config/index.d.ts +47 -53
  10. package/dist/config/index.js +25 -59
  11. package/dist/config/types/content.d.ts +1 -49
  12. package/dist/config/types/runtime.d.ts +8 -16
  13. package/dist/config/types/tools.d.ts +2 -28
  14. package/dist/http/accept-policy.d.ts +3 -0
  15. package/dist/http/accept-policy.js +45 -0
  16. package/dist/http/async-handler.d.ts +2 -0
  17. package/dist/http/async-handler.js +5 -0
  18. package/dist/http/auth-introspection.d.ts +2 -0
  19. package/dist/http/auth-introspection.js +141 -0
  20. package/dist/http/auth-static.d.ts +2 -0
  21. package/dist/http/auth-static.js +23 -0
  22. package/dist/http/auth.d.ts +3 -2
  23. package/dist/http/auth.js +98 -26
  24. package/dist/http/cors.d.ts +6 -6
  25. package/dist/http/cors.js +7 -42
  26. package/dist/http/download-routes.d.ts +0 -12
  27. package/dist/http/download-routes.js +21 -58
  28. package/dist/http/jsonrpc-http.d.ts +2 -0
  29. package/dist/http/jsonrpc-http.js +10 -0
  30. package/dist/http/mcp-routes.d.ts +0 -1
  31. package/dist/http/mcp-routes.js +43 -30
  32. package/dist/http/mcp-session-helpers.d.ts +0 -1
  33. package/dist/http/mcp-session-helpers.js +1 -1
  34. package/dist/http/mcp-session-transport.d.ts +7 -0
  35. package/dist/http/mcp-session-transport.js +57 -0
  36. package/dist/http/mcp-session.js +60 -73
  37. package/dist/http/mcp-validation.d.ts +1 -0
  38. package/dist/http/mcp-validation.js +11 -10
  39. package/dist/http/protocol-policy.d.ts +2 -0
  40. package/dist/http/protocol-policy.js +31 -0
  41. package/dist/http/rate-limit.js +5 -2
  42. package/dist/http/server-config.d.ts +1 -0
  43. package/dist/http/server-config.js +40 -0
  44. package/dist/http/server-middleware.d.ts +2 -9
  45. package/dist/http/server-middleware.js +96 -43
  46. package/dist/http/server-shutdown.d.ts +4 -0
  47. package/dist/http/server-shutdown.js +43 -0
  48. package/dist/http/server.js +52 -64
  49. package/dist/http/session-cleanup.js +1 -1
  50. package/dist/middleware/error-handler.js +1 -3
  51. package/dist/resources/cached-content.js +50 -108
  52. package/dist/resources/index.js +0 -82
  53. package/dist/server.js +51 -30
  54. package/dist/services/cache-keys.d.ts +7 -0
  55. package/dist/services/cache-keys.js +57 -0
  56. package/dist/services/cache.d.ts +1 -7
  57. package/dist/services/cache.js +53 -119
  58. package/dist/services/context.d.ts +0 -1
  59. package/dist/services/context.js +0 -7
  60. package/dist/services/extractor.js +10 -82
  61. package/dist/services/fetcher/agents.d.ts +2 -2
  62. package/dist/services/fetcher/agents.js +34 -95
  63. package/dist/services/fetcher/dns-selection.d.ts +2 -0
  64. package/dist/services/fetcher/dns-selection.js +72 -0
  65. package/dist/services/fetcher/interceptors.d.ts +0 -22
  66. package/dist/services/fetcher/interceptors.js +30 -13
  67. package/dist/services/fetcher/redirects.js +4 -3
  68. package/dist/services/fetcher/response.js +66 -31
  69. package/dist/services/fetcher.d.ts +1 -3
  70. package/dist/services/fetcher.js +14 -33
  71. package/dist/services/fifo-queue.d.ts +8 -0
  72. package/dist/services/fifo-queue.js +25 -0
  73. package/dist/services/logger.js +2 -2
  74. package/dist/services/metadata-collector.d.ts +1 -9
  75. package/dist/services/metadata-collector.js +71 -2
  76. package/dist/services/transform-worker-pool.d.ts +4 -14
  77. package/dist/services/transform-worker-pool.js +177 -129
  78. package/dist/services/transform-worker-types.d.ts +32 -0
  79. package/dist/services/transform-worker-types.js +14 -0
  80. package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
  81. package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
  82. package/dist/tools/handlers/fetch-single.shared.d.ts +1 -20
  83. package/dist/tools/handlers/fetch-single.shared.js +44 -87
  84. package/dist/tools/handlers/fetch-url.tool.d.ts +1 -1
  85. package/dist/tools/handlers/fetch-url.tool.js +46 -123
  86. package/dist/tools/index.js +21 -40
  87. package/dist/tools/schemas.d.ts +1 -51
  88. package/dist/tools/schemas.js +1 -107
  89. package/dist/tools/utils/cached-markdown.d.ts +5 -0
  90. package/dist/tools/utils/cached-markdown.js +46 -0
  91. package/dist/tools/utils/content-shaping.d.ts +4 -0
  92. package/dist/tools/utils/content-shaping.js +52 -0
  93. package/dist/tools/utils/content-transform.d.ts +2 -17
  94. package/dist/tools/utils/content-transform.js +120 -114
  95. package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
  96. package/dist/tools/utils/fetch-pipeline.js +65 -62
  97. package/dist/tools/utils/inline-content.d.ts +1 -2
  98. package/dist/tools/utils/inline-content.js +4 -7
  99. package/dist/transformers/markdown.transformer.js +109 -34
  100. package/dist/utils/cached-payload.d.ts +7 -0
  101. package/dist/utils/cached-payload.js +36 -0
  102. package/dist/utils/error-utils.js +1 -1
  103. package/dist/utils/filename-generator.js +21 -10
  104. package/dist/utils/guards.d.ts +1 -0
  105. package/dist/utils/guards.js +3 -0
  106. package/dist/utils/header-normalizer.d.ts +0 -3
  107. package/dist/utils/header-normalizer.js +3 -3
  108. package/dist/utils/tool-error-handler.d.ts +2 -2
  109. package/dist/utils/tool-error-handler.js +11 -38
  110. package/dist/utils/url-transformer.d.ts +7 -0
  111. package/dist/utils/url-transformer.js +147 -0
  112. package/dist/utils/url-validator.d.ts +1 -2
  113. package/dist/utils/url-validator.js +20 -93
  114. package/dist/workers/content-transform.worker.d.ts +1 -0
  115. package/dist/workers/content-transform.worker.js +40 -0
  116. package/package.json +13 -16
package/README.md CHANGED
@@ -10,9 +10,9 @@
10
10
 
11
11
  [![Install in Cursor](https://cursor.com/deeplink/mcp-install-dark.svg)](https://cursor.com/install-mcp?name=superfetch&config=eyJjb21tYW5kIjoibnB4IiwiYXJncyI6WyIteSIsIkBqMGhhbnovc3VwZXJmZXRjaEBsYXRlc3QiLCItLXN0ZGlvIl19)
12
12
 
13
- A [Model Context Protocol](https://modelcontextprotocol.io/) (MCP) server that fetches web pages, extracts readable content with Mozilla Readability, and returns AI-friendly JSONL or Markdown.
13
+ A [Model Context Protocol](https://modelcontextprotocol.io/) (MCP) server that fetches web pages, extracts readable content with Mozilla Readability, and returns AI-friendly Markdown.
14
14
 
15
- [Quick Start](#quick-start) | [How to Choose a Tool](#how-to-choose-a-tool) | [Tools](#available-tools) | [Resources](#resources) | [Configuration](#configuration) | [Security](#security) | [Development](#development)
15
+ [Quick Start](#quick-start) | [Tool](#available-tools) | [Resources](#resources) | [Configuration](#configuration) | [Security](#security) | [Development](#development)
16
16
 
17
17
  > **Published to [MCP Registry](https://registry.modelcontextprotocol.io/)** - Search for `io.github.j0hanz/superfetch`
18
18
 
@@ -23,45 +23,15 @@ A [Model Context Protocol](https://modelcontextprotocol.io/) (MCP) server that f
23
23
 
24
24
  ## Features
25
25
 
26
- | Feature | Description |
27
- | ------------------ | ------------------------------------------------------------------------- |
28
- | Smart extraction | Mozilla Readability removes ads, navigation, and boilerplate when enabled |
29
- | JSONL + Markdown | JSONL semantic blocks or clean Markdown with frontmatter |
30
- | Structured blocks | Headings, paragraphs, lists, code, tables, images, blockquotes |
31
- | Built-in caching | In-memory cache with TTL, max keys, and resource subscriptions |
32
- | Resilient fetching | Redirect handling plus retry with exponential backoff + jitter |
33
- | Security first | URL validation, SSRF/DNS/IP blocklists, header sanitization |
34
- | HTTP mode | API key auth, session management, rate limiting, CORS |
35
-
36
- ---
37
-
38
- ## How to Choose a Tool
39
-
40
- Use this guide to select the right tool for your web content extraction needs.
41
-
42
- ### Decision Tree
43
-
44
- ```text
45
- Need web content for AI?
46
- - Want structured JSONL blocks -> fetch-url (format: jsonl)
47
- - Want clean Markdown -> fetch-markdown
48
- - Want Markdown but also need contentBlocks count -> fetch-url (format: markdown)
49
- ```
50
-
51
- ### Quick Reference Table
52
-
53
- | Tool | Best For | Output Format | Use When |
54
- | ---------------- | ---------------------------------- | -------------------------------- | ----------------------------------------- |
55
- | `fetch-url` | Single page with structured blocks | JSONL (or Markdown via `format`) | RAG pipelines, content parsing, analytics |
56
- | `fetch-markdown` | Single page in readable format | Markdown + frontmatter | Documentation, summaries, human review |
57
-
58
- ### Common Use Cases
59
-
60
- | Task | Recommended Tool | Why |
61
- | ------------------------ | ---------------------------------------- | ---------------------------------------------------- |
62
- | Parse a blog post for AI | `fetch-url` | Returns semantic blocks (headings, paragraphs, code) |
63
- | Generate documentation | `fetch-markdown` | Clean markdown with frontmatter |
64
- | Extract article for RAG | `fetch-url` + `extractMainContent: true` | Removes ads/nav, keeps main content |
26
+ | Feature | Description |
27
+ | -------------------- | ------------------------------------------------------------------------------------- |
28
+ | Smart extraction | Mozilla Readability with quality gates to strip boilerplate when it improves results |
29
+ | Clean Markdown | Markdown output with optional YAML frontmatter (title + source) |
30
+ | Raw content handling | Preserves raw markdown/text and rewrites GitHub/GitLab/Bitbucket blob URLs to raw |
31
+ | Built-in caching | In-memory cache with TTL, max keys, and resource subscriptions |
32
+ | Resilient fetching | Redirect handling with validation, timeouts, and response size limits |
33
+ | Security first | URL validation plus SSRF/DNS/IP blocklists |
34
+ | HTTP mode | Static token or OAuth auth, session management, rate limiting, host/origin validation |
65
35
 
66
36
  ---
67
37
 
@@ -230,7 +200,7 @@ npm install -g @j0hanz/superfetch
230
200
  # Run in stdio mode
231
201
  superfetch --stdio
232
202
 
233
- # Run HTTP server (requires API_KEY)
203
+ # Run HTTP server (requires auth token)
234
204
  superfetch
235
205
  ```
236
206
 
@@ -257,7 +227,7 @@ node dist/index.js --stdio
257
227
  <details>
258
228
  <summary><strong>HTTP Mode</strong> (default)</summary>
259
229
 
260
- HTTP mode requires `API_KEY` and only binds to loopback addresses unless `ALLOW_REMOTE=true`.
230
+ HTTP mode requires authentication. By default it binds to `127.0.0.1`. To listen on all interfaces, set `HOST=0.0.0.0` or `HOST=::` and configure OAuth (remote bindings require OAuth). Other non-loopback `HOST` values are rejected.
261
231
 
262
232
  ```bash
263
233
  API_KEY=supersecret npx -y @j0hanz/superfetch@latest
@@ -271,7 +241,9 @@ $env:API_KEY = "supersecret"
271
241
  npx -y @j0hanz/superfetch@latest
272
242
  ```
273
243
 
274
- Endpoints (all require `Authorization: Bearer <API_KEY>` or `X-API-Key: <API_KEY>`):
244
+ For multiple static tokens, set `ACCESS_TOKENS` (comma/space separated).
245
+
246
+ Endpoints (auth required via `Authorization: Bearer <token>`; in static token mode, `X-API-Key` is also accepted):
275
247
 
276
248
  - `GET /health`
277
249
  - `POST /mcp`
@@ -289,111 +261,61 @@ Sessions are managed via the `mcp-session-id` header (see [HTTP Mode Details](#h
289
261
 
290
262
  ### Tool Response Notes
291
263
 
292
- Both tools return:
264
+ The tool returns `structuredContent` with `url`, optional `title`, and `markdown` when inline content is available. On errors, `error` is included instead of content.
293
265
 
294
- - `structuredContent` for machine-readable fields (includes `contentSize`, `cached`, and optional `resourceUri`/`resourceMimeType`/`truncated`; Markdown responses may also include `file`)
295
- - `content` blocks that include:
296
- - a `text` block containing JSON of `structuredContent`
297
- - in stdio mode, a `resource` block with a `file:///...` URI embedding the full content
298
- - in HTTP mode, a `resource` block when inline content is available
299
- - when content exceeds `MAX_INLINE_CONTENT_CHARS` and cache is enabled, a `resource_link` block points to `superfetch://cache/...` and `structuredContent.resourceUri` is set
266
+ The response includes:
300
267
 
301
- If content exceeds `MAX_INLINE_CONTENT_CHARS` and cache is disabled, the server truncates output, appends `...[truncated]`, and sets `truncated: true`.
268
+ - a `text` block containing JSON of `structuredContent`
269
+ - a `resource` block embedding markdown when inline content is available (always in stdio mode)
270
+ - when content exceeds the inline limit and cache is enabled, a `resource_link` block pointing to `superfetch://cache/...` (inline markdown may be omitted)
302
271
 
303
272
  ---
304
273
 
305
274
  ### `fetch-url`
306
275
 
307
- Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks. You can also request Markdown with `format: "markdown"`.
308
-
309
- | Parameter | Type | Default | Description |
310
- | ---------------------- | --------------------- | ---------------------------------- | ------------------------------------------------------ |
311
- | `url` | string | required | URL to fetch |
312
- | `format` | "jsonl" \| "markdown" | `"jsonl"` | Output format |
313
- | `includeContentBlocks` | boolean | `true` (jsonl), `false` (markdown) | Include content block counts when `format: "markdown"` |
314
- | `extractMainContent` | boolean | `true` | Use Readability to extract main content |
315
- | `includeMetadata` | boolean | `true` | Include page metadata |
316
- | `maxContentLength` | number | - | Maximum content length in characters (max 5,242,880) |
317
- | `customHeaders` | object | - | Custom HTTP headers (sanitized) |
318
- | `timeout` | number | `30000` | Request timeout in milliseconds (1000-120000) |
319
- | `retries` | number | `3` | Number of retry attempts (1-10) |
276
+ Fetches a webpage and converts it to clean Markdown format with optional frontmatter.
320
277
 
321
- When `format: "markdown"` and `includeContentBlocks` is `false`, `contentBlocks` will be `0`.
278
+ | Parameter | Type | Default | Description |
279
+ | --------- | ------ | -------- | ------------ |
280
+ | `url` | string | required | URL to fetch |
322
281
 
323
282
  **Example `structuredContent`:**
324
283
 
325
284
  ```json
326
285
  {
327
- "url": "https://example.com/article",
328
- "title": "Example Article",
329
- "contentBlocks": 42,
330
- "fetchedAt": "2025-12-11T10:30:00.000Z",
331
- "format": "jsonl",
332
- "contentSize": 12345,
333
- "cached": false,
334
- "content": "{\"type\":\"metadata\",\"title\":\"Example Article\",\"url\":\"https://example.com/article\"}\n{\"type\":\"heading\",\"level\":1,\"text\":\"Introduction\"}"
286
+ "url": "https://example.com/docs",
287
+ "title": "Documentation",
288
+ "markdown": "---\ntitle: Documentation\n---\n\n# Getting Started\n\nWelcome..."
335
289
  }
336
290
  ```
337
291
 
338
- ---
339
-
340
- ### `fetch-markdown`
341
-
342
- Fetches a webpage and converts it to clean Markdown with optional frontmatter.
343
-
344
- | Parameter | Type | Default | Description |
345
- | -------------------- | ------- | -------- | ---------------------------------------------------- |
346
- | `url` | string | required | URL to fetch |
347
- | `extractMainContent` | boolean | `true` | Extract main content only |
348
- | `includeMetadata` | boolean | `true` | Include YAML frontmatter |
349
- | `maxContentLength` | number | - | Maximum content length in characters (max 5,242,880) |
350
- | `customHeaders` | object | - | Custom HTTP headers (sanitized) |
351
- | `timeout` | number | `30000` | Request timeout in milliseconds (1000-120000) |
352
- | `retries` | number | `3` | Number of retry attempts (1-10) |
353
-
354
- **Example `structuredContent`:**
292
+ **Error response:**
355
293
 
356
294
  ```json
357
295
  {
358
- "url": "https://example.com/docs",
359
- "title": "Documentation",
360
- "fetchedAt": "2025-12-11T10:30:00.000Z",
361
- "markdown": "---\ntitle: Documentation\nsource: \"https://example.com/docs\"\n---\n\n# Getting Started\n\nWelcome...",
362
- "contentSize": 9876,
363
- "cached": false,
364
- "truncated": false,
365
- "file": {
366
- "downloadUrl": "/mcp/downloads/markdown/abc123def456",
367
- "fileName": "documentation.md",
368
- "expiresAt": "2025-12-11T11:30:00.000Z"
369
- }
296
+ "url": "https://example.com/broken",
297
+ "error": "Failed to fetch: 404 Not Found"
370
298
  }
371
299
  ```
372
300
 
373
- `file` is included only in HTTP mode when content is cached and too large to inline.
374
-
375
301
  ---
376
302
 
377
303
  ### Large Content Handling
378
304
 
379
- - Inline limit is configurable via `MAX_INLINE_CONTENT_CHARS` (see `CONFIGURATION.md`).
380
- - If content exceeds the limit and cache is enabled, responses include `resourceUri`/`resourceMimeType` and a `resource_link` block.
381
- - If cache is disabled, content is truncated with `...[truncated]` and `truncated: true`.
382
- - Use `maxContentLength` per request to enforce a lower limit (hard cap: 5,242,880 characters).
305
+ - Inline markdown is capped at 20,000 characters (`maxInlineContentChars`).
306
+ - **Stdio mode:** full markdown is embedded as a `resource` block.
307
+ - **HTTP mode:** if content exceeds the inline limit and cache is enabled, the response includes a `resource_link` to `superfetch://cache/...` (no embedded markdown). If cache is disabled, the inline markdown is truncated with `...[truncated]`.
383
308
  - Upstream fetch size is capped at 10 MB of HTML; larger responses fail.
384
309
 
385
310
  ---
386
311
 
387
312
  ## Resources
388
313
 
389
- | URI | Description |
390
- | ------------------------------------------ | -------------------------------------------------------------------------- |
391
- | `superfetch://health` | Real-time server health and memory checks |
392
- | `superfetch://stats` | Server stats and cache metrics |
393
- | `superfetch://cache/list` | List cached entries and their resource URIs |
394
- | `superfetch://cache/{namespace}/{urlHash}` | Cached content entry (`namespace`: `url`, `markdown`; `links` is reserved) |
314
+ | URI | Description |
315
+ | ------------------------------------------ | ---------------------------------------------- |
316
+ | `superfetch://cache/{namespace}/{urlHash}` | Cached content entry (`namespace`: `markdown`) |
395
317
 
396
- Resource subscriptions notify clients when cache entries update.
318
+ Resource listings enumerate cached entries, and subscriptions notify clients when cache entries update.
397
319
 
398
320
  ---
399
321
 
@@ -407,21 +329,21 @@ When running in HTTP mode, cached content can be downloaded directly. Downloads
407
329
  GET /mcp/downloads/:namespace/:hash
408
330
  ```
409
331
 
410
- - `namespace`: `markdown` or `url`
411
- - Auth required (`Authorization: Bearer <API_KEY>` or `X-API-Key: <API_KEY>`)
332
+ - `namespace`: `markdown`
333
+ - Auth required (`Authorization: Bearer <token>`; in static token mode, `X-API-Key` is accepted)
412
334
 
413
335
  ### Response Headers
414
336
 
415
- | Header | Value |
416
- | --------------------- | ----------------------------------------------------------------------- |
417
- | `Content-Type` | `text/markdown; charset=utf-8` or `application/x-ndjson; charset=utf-8` |
418
- | `Content-Disposition` | `attachment; filename="<name>"` |
419
- | `Cache-Control` | `private, max-age=<CACHE_TTL>` |
337
+ | Header | Value |
338
+ | --------------------- | ------------------------------- |
339
+ | `Content-Type` | `text/markdown; charset=utf-8` |
340
+ | `Content-Disposition` | `attachment; filename="<name>"` |
341
+ | `Cache-Control` | `private, max-age=<CACHE_TTL>` |
420
342
 
421
343
  ### Example Usage
422
344
 
423
345
  ```bash
424
- curl -H "Authorization: Bearer $API_KEY" \
346
+ curl -H "Authorization: Bearer $TOKEN" \
425
347
  http://localhost:3000/mcp/downloads/markdown/abc123.def456 \
426
348
  -o article.md
427
349
  ```
@@ -438,7 +360,65 @@ curl -H "Authorization: Bearer $API_KEY" \
438
360
 
439
361
  ## Configuration
440
362
 
441
- Configuration details live in `CONFIGURATION.md`, including all environment variables, defaults, ranges, presets, and dev-only flags.
363
+ Set environment variables in your MCP client `env` or in the shell before starting the server.
364
+
365
+ ### Core Server Settings
366
+
367
+ | Variable | Default | Description |
368
+ | --------------- | -------------------- | ------------------------------------------------------------- |
369
+ | `HOST` | `127.0.0.1` | HTTP bind address |
370
+ | `PORT` | `3000` | HTTP server port (1024-65535) |
371
+ | `USER_AGENT` | `superFetch-MCP/2.0` | User-Agent header for outgoing requests |
372
+ | `CACHE_ENABLED` | `true` | Enable response caching |
373
+ | `CACHE_TTL` | `3600` | Cache TTL in seconds (60-86400) |
374
+ | `LOG_LEVEL` | `info` | `debug`, `info`, `warn`, `error` |
375
+ | `ALLOWED_HOSTS` | (empty) | Additional allowed Host/Origin values (comma/space separated) |
376
+
377
+ ### Auth (HTTP Mode)
378
+
379
+ | Variable | Default | Description |
380
+ | --------------- | ------- | ------------------------------------------------------------ |
381
+ | `AUTH_MODE` | auto | `static` or `oauth`. Auto-selects OAuth if any OAUTH URL set |
382
+ | `ACCESS_TOKENS` | (empty) | Comma/space-separated static bearer tokens |
383
+ | `API_KEY` | (empty) | Adds a static bearer token and enables `X-API-Key` header |
384
+
385
+ Static mode requires at least one token (`ACCESS_TOKENS` or `API_KEY`).
386
+
387
+ ### OAuth (HTTP Mode)
388
+
389
+ Required when `AUTH_MODE=oauth` (or auto-selected by presence of OAuth URLs):
390
+
391
+ | Variable | Default | Description |
392
+ | ------------------------- | ------- | ---------------------- |
393
+ | `OAUTH_ISSUER_URL` | - | OAuth issuer |
394
+ | `OAUTH_AUTHORIZATION_URL` | - | Authorization endpoint |
395
+ | `OAUTH_TOKEN_URL` | - | Token endpoint |
396
+ | `OAUTH_INTROSPECTION_URL` | - | Introspection endpoint |
397
+
398
+ Optional:
399
+
400
+ | Variable | Default | Description |
401
+ | -------------------------------- | -------------------------- | --------------------------------------- |
402
+ | `OAUTH_REVOCATION_URL` | - | Revocation endpoint |
403
+ | `OAUTH_REGISTRATION_URL` | - | Dynamic client registration endpoint |
404
+ | `OAUTH_RESOURCE_URL` | `http://<host>:<port>/mcp` | Protected resource URL |
405
+ | `OAUTH_REQUIRED_SCOPES` | (empty) | Required scopes (comma/space separated) |
406
+ | `OAUTH_CLIENT_ID` | - | Client ID for introspection |
407
+ | `OAUTH_CLIENT_SECRET` | - | Client secret for introspection |
408
+ | `OAUTH_INTROSPECTION_TIMEOUT_MS` | `5000` | Introspection timeout (1000-30000) |
409
+
410
+ ### Fixed Limits (Not Configurable via env)
411
+
412
+ - Fetch timeout: 15 seconds
413
+ - Max redirects: 5
414
+ - Max HTML response size: 10 MB
415
+ - Inline markdown limit: 20,000 characters
416
+ - Cache max entries: 100
417
+ - Session TTL: 30 minutes
418
+ - Max sessions: 200
419
+ - Rate limit: 100 req/min per IP (60s window)
420
+
421
+ See `CONFIGURATION.md` for preset examples and quick-start snippets.
442
422
 
443
423
  ---
444
424
 
@@ -450,28 +430,13 @@ HTTP mode uses the MCP Streamable HTTP transport. The workflow is:
450
430
  2. The server returns `mcp-session-id` in the response headers.
451
431
  3. Use that header for subsequent `POST /mcp`, `GET /mcp`, and `DELETE /mcp` requests.
452
432
 
453
- `GET /mcp` and `DELETE /mcp` require `mcp-session-id`. `POST /mcp` without an `initialize` request will return 400.
433
+ If the `mcp-protocol-version` header is missing, the server defaults it to `2025-03-26`. Supported versions are `2025-03-26` and `2025-11-25`.
454
434
 
455
- If `MAX_SESSIONS` is reached, the server evicts the oldest session when possible, otherwise returns a 503.
456
-
457
- Host header validation is always enforced in HTTP mode. When binding to `0.0.0.0` or `::`, set `ALLOWED_HOSTS` to the hostnames clients will send. If an `Origin` header is present, it must be allowed by `ALLOWED_ORIGINS` or `CORS_ALLOW_ALL`.
458
-
459
- ---
460
-
461
- ## Content Block Types
435
+ `GET /mcp` and `DELETE /mcp` require `mcp-session-id`. `POST /mcp` without an `initialize` request will return 400.
462
436
 
463
- JSONL output includes semantic content blocks:
437
+ If the server reaches its session cap (200), it evicts the oldest session when possible; otherwise it returns a 503.
464
438
 
465
- | Type | Description |
466
- | ------------ | ---------------------------------------- |
467
- | `metadata` | Minimal page metadata (type, title, url) |
468
- | `heading` | Headings (h1-h6) with level indicator |
469
- | `paragraph` | Text paragraphs |
470
- | `list` | Ordered/unordered lists |
471
- | `code` | Code blocks with optional language |
472
- | `table` | Tables with headers and rows |
473
- | `image` | Images with src and alt text |
474
- | `blockquote` | Block quote text |
439
+ Host and Origin headers are always validated. Allowed values include loopback hosts, the configured `HOST` (if not a wildcard), and any entries in `ALLOWED_HOSTS`. When binding to `0.0.0.0` or `::`, set `ALLOWED_HOSTS` to the hostnames clients will send.
475
440
 
476
441
  ---
477
442
 
@@ -498,13 +463,14 @@ DNS resolution is performed and blocked if any resolved IP matches a blocked ran
498
463
  - Max URL length: 2048 characters
499
464
  - Hostnames ending in `.local` or `.internal` are rejected
500
465
 
501
- ### Header Sanitization
466
+ ### Host/Origin Validation (HTTP Mode)
502
467
 
503
- Blocked headers: `host`, `authorization`, `cookie`, `x-forwarded-for`, `x-real-ip`, `proxy-authorization`
468
+ - Host header must match loopback, configured `HOST` (if not a wildcard), or `ALLOWED_HOSTS`
469
+ - Origin header (when present) is validated against the same allow-list
504
470
 
505
471
  ### Rate Limiting
506
472
 
507
- Rate limiting applies to `/mcp` and `/mcp/downloads` and is configurable via `RATE_LIMIT_ENABLED`, `RATE_LIMIT_MAX`, `RATE_LIMIT_WINDOW_MS`, and `RATE_LIMIT_CLEANUP_MS` (see `CONFIGURATION.md`).
473
+ Rate limiting applies to `/mcp` and `/mcp/downloads` (100 req/min per IP, 60s window). OPTIONS requests are not rate-limited.
508
474
 
509
475
  ---
510
476
 
@@ -522,8 +488,6 @@ Rate limiting applies to `/mcp` and `/mcp/downloads` and is configurable via `RA
522
488
  | `npm run format` | Format with Prettier |
523
489
  | `npm test` | Run Node test runner (builds dist) |
524
490
  | `npm run test:coverage` | Run tests with experimental coverage |
525
- | `npm run bench` | Run minimal performance benchmark |
526
- | `npm run release` | Create new release |
527
491
  | `npm run knip` | Find unused exports/dependencies |
528
492
  | `npm run knip:fix` | Auto-fix unused code |
529
493
 
@@ -537,10 +501,10 @@ Rate limiting applies to `/mcp` and `/mcp/downloads` and is configurable via `RA
537
501
  | Language | TypeScript 5.9 |
538
502
  | MCP SDK | @modelcontextprotocol/sdk ^1.25.1 |
539
503
  | Content Extraction | @mozilla/readability ^0.6.0 |
540
- | HTML Parsing | Cheerio ^1.1.2, LinkeDOM ^0.18.12 |
504
+ | HTML Parsing | LinkeDOM ^0.18.12 |
541
505
  | Markdown | Turndown ^7.2.2 |
542
- | HTTP | Express ^5.2.1, undici ^6.22.0 |
543
- | Validation | Zod ^4.3.4 |
506
+ | HTTP | Express ^5.2.1, undici ^6.23.0 |
507
+ | Validation | Zod ^4.3.5 |
544
508
 
545
509
  ---
546
510
 
@@ -0,0 +1,16 @@
1
// Resolved HTTP-mode authentication settings, as returned by buildAuthConfig().
// The OAuth endpoint URLs are undefined when not configured; resourceUrl is
// always set, and requiredScopes / staticTokens are always arrays.
+ export interface AuthConfig {
2
+ mode: 'oauth' | 'static';
3
+ issuerUrl: URL | undefined;
4
+ authorizationUrl: URL | undefined;
5
+ tokenUrl: URL | undefined;
6
+ revocationUrl: URL | undefined;
7
+ registrationUrl: URL | undefined;
8
+ introspectionUrl: URL | undefined;
9
+ resourceUrl: URL;
10
+ requiredScopes: string[];
11
+ clientId: string | undefined;
12
+ clientSecret: string | undefined;
13
+ introspectionTimeoutMs: number;
14
+ staticTokens: string[];
15
+ }
16
// Builds the AuthConfig; baseUrl is the server base URL used to derive the default resource URL.
+ export declare function buildAuthConfig(baseUrl: URL): AuthConfig;
@@ -0,0 +1,53 @@
1
+ import { parseInteger, parseList, parseUrlEnv } from './env-parsers.js';
2
// Reads the issuer, authorization and token endpoint URLs from the environment.
// Each value is passed through parseUrlEnv together with its variable name.
+ function readCoreOAuthUrls() {
3
+ return {
4
+ issuerUrl: parseUrlEnv(process.env.OAUTH_ISSUER_URL, 'OAUTH_ISSUER_URL'),
5
+ authorizationUrl: parseUrlEnv(process.env.OAUTH_AUTHORIZATION_URL, 'OAUTH_AUTHORIZATION_URL'),
6
+ tokenUrl: parseUrlEnv(process.env.OAUTH_TOKEN_URL, 'OAUTH_TOKEN_URL'),
7
+ };
8
+ }
9
// Reads the revocation, registration, introspection and resource URLs from the
// environment. resourceUrl is the only field with a fallback: when
// OAUTH_RESOURCE_URL yields null/undefined it becomes '/mcp' resolved against baseUrl.
+ function readOptionalOAuthUrls(baseUrl) {
10
+ return {
11
+ revocationUrl: parseUrlEnv(process.env.OAUTH_REVOCATION_URL, 'OAUTH_REVOCATION_URL'),
12
+ registrationUrl: parseUrlEnv(process.env.OAUTH_REGISTRATION_URL, 'OAUTH_REGISTRATION_URL'),
13
+ introspectionUrl: parseUrlEnv(process.env.OAUTH_INTROSPECTION_URL, 'OAUTH_INTROSPECTION_URL'),
14
+ resourceUrl: parseUrlEnv(process.env.OAUTH_RESOURCE_URL, 'OAUTH_RESOURCE_URL') ??
15
+ new URL('/mcp', baseUrl),
16
+ };
17
+ }
18
// Merges the core and optional OAuth URL groups into one flat object.
+ function readOAuthUrls(baseUrl) {
19
+ return { ...readCoreOAuthUrls(), ...readOptionalOAuthUrls(baseUrl) };
20
+ }
21
// Chooses the auth mode. An explicit 'oauth' or 'static' wins; any other value
// (including undefined) falls through to auto-detection, which selects 'oauth'
// when at least one of the issuer, authorization, token or introspection URLs
// is defined and 'static' otherwise.
+ function resolveAuthMode(authModeEnv, urls) {
22
+ if (authModeEnv === 'oauth')
23
+ return 'oauth';
24
+ if (authModeEnv === 'static')
25
+ return 'static';
26
+ const oauthConfigured = [
27
+ urls.issuerUrl,
28
+ urls.authorizationUrl,
29
+ urls.tokenUrl,
30
+ urls.introspectionUrl,
31
+ ].some((value) => value !== undefined);
32
+ return oauthConfigured ? 'oauth' : 'static';
33
+ }
34
// Collects the static bearer tokens: the entries of ACCESS_TOKENS plus API_KEY
// when it is non-empty. A Set is used so a token listed twice appears once.
+ function collectStaticTokens() {
35
+ const staticTokens = new Set(parseList(process.env.ACCESS_TOKENS));
36
+ if (process.env.API_KEY) {
37
+ staticTokens.add(process.env.API_KEY);
38
+ }
39
+ return Array.from(staticTokens);
40
+ }
41
// Assembles the auth configuration from environment variables.
// AUTH_MODE is lower-cased before mode resolution. The introspection timeout
// is parsed with a default of 5000 and bounds of 1000 and 30000.
+ export function buildAuthConfig(baseUrl) {
42
+ const urls = readOAuthUrls(baseUrl);
43
+ const mode = resolveAuthMode(process.env.AUTH_MODE?.toLowerCase(), urls);
44
+ return {
45
+ mode,
46
+ ...urls,
47
+ requiredScopes: parseList(process.env.OAUTH_REQUIRED_SCOPES),
48
+ clientId: process.env.OAUTH_CLIENT_ID,
49
+ clientSecret: process.env.OAUTH_CLIENT_SECRET,
50
+ introspectionTimeoutMs: parseInteger(process.env.OAUTH_INTROSPECTION_TIMEOUT_MS, 5000, 1000, 30000),
51
+ staticTokens: collectStaticTokens(),
52
+ };
53
+ }
@@ -1,19 +1,17 @@
1
1
  export declare const SIZE_LIMITS: {
2
- readonly ONE_MB: number;
3
- readonly FIVE_MB: number;
4
- readonly TEN_MB: number;
5
- readonly FIFTY_MB: number;
6
- readonly HUNDRED_MB: number;
2
+ ONE_MB: number;
3
+ FIVE_MB: number;
4
+ TEN_MB: number;
5
+ FIFTY_MB: number;
6
+ HUNDRED_MB: number;
7
7
  };
8
8
  export declare const CACHE_HASH: {
9
- readonly URL_HASH_LENGTH: 16;
10
- readonly VARY_HASH_LENGTH: 12;
9
+ URL_HASH_LENGTH: number;
10
+ VARY_HASH_LENGTH: number;
11
11
  };
12
12
  export declare const TIMEOUT: {
13
- readonly MIN_FETCH_TIMEOUT_MS: 5000;
14
- readonly DEFAULT_FETCH_TIMEOUT_MS: 30000;
15
- readonly MAX_FETCH_TIMEOUT_MS: 120000;
16
- readonly MIN_SESSION_TTL_MS: number;
17
- readonly DEFAULT_SESSION_TTL_MS: number;
18
- readonly MAX_SESSION_TTL_MS: number;
13
+ DEFAULT_FETCH_TIMEOUT_MS: number;
14
+ MIN_SESSION_TTL_MS: number;
15
+ DEFAULT_SESSION_TTL_MS: number;
16
+ MAX_SESSION_TTL_MS: number;
19
17
  };
@@ -15,9 +15,7 @@ export const CACHE_HASH = {
15
15
  VARY_HASH_LENGTH: 12,
16
16
  };
17
17
  export const TIMEOUT = {
18
- MIN_FETCH_TIMEOUT_MS: 5000,
19
- DEFAULT_FETCH_TIMEOUT_MS: 30000,
20
- MAX_FETCH_TIMEOUT_MS: 120000,
18
+ DEFAULT_FETCH_TIMEOUT_MS: 15000,
21
19
  MIN_SESSION_TTL_MS: 60 * 1000,
22
20
  DEFAULT_SESSION_TTL_MS: 30 * 60 * 1000,
23
21
  MAX_SESSION_TTL_MS: 24 * 60 * 60 * 1000,
@@ -0,0 +1,7 @@
1
+ import type { LogLevel } from './types/runtime.js';
2
// Returns defaultValue when envValue is unset/empty, is not a number, or lies outside the optional min/max bounds.
+ export declare function parseInteger(envValue: string | undefined, defaultValue: number, min?: number, max?: number): number;
3
// Returns defaultValue when envValue is unset/empty.
+ export declare function parseBoolean(envValue: string | undefined, defaultValue: boolean): boolean;
4
// Splits a comma/whitespace separated value into its non-empty entries; unset/empty gives [].
+ export declare function parseList(envValue: string | undefined): string[];
5
// Returns undefined when value is unset/empty; throws an Error mentioning `name` when it is not a parseable URL.
+ export declare function parseUrlEnv(value: string | undefined, name: string): URL | undefined;
6
// Parses a comma/whitespace separated host list into a set of normalized (lower-cased) host values.
+ export declare function parseAllowedHosts(envValue: string | undefined): Set<string>;
7
// Returns the lower-cased level when it is one of debug/info/warn/error, otherwise 'info'.
+ export declare function parseLogLevel(envValue: string | undefined): LogLevel;
@@ -0,0 +1,84 @@
1
/**
 * Normalizes one host entry to a bare, lower-cased hostname or IP literal.
 *
 * Accepted forms:
 *  - `host` / `host:port`        -> `host`
 *  - `[ipv6]` / `[ipv6]:port`    -> `ipv6`
 *  - unbracketed IPv6 (`::1`)    -> returned whole, since a port cannot be
 *    expressed without brackets and cutting at the first colon would yield ''.
 *
 * @param value Raw entry, e.g. one item of a comma/space separated host list.
 * @returns The normalized host, or `null` when the entry is blank, has an
 *          unterminated `[`, or contains no host part (e.g. `:8080`, `[]`).
 */
function normalizeHostValue(value) {
    const trimmed = value.trim().toLowerCase();
    if (!trimmed)
        return null;
    if (trimmed.startsWith('[')) {
        const end = trimmed.indexOf(']');
        if (end === -1)
            return null;
        // Anything after the closing bracket (":port") is discarded.
        return trimmed.slice(1, end) || null;
    }
    const firstColon = trimmed.indexOf(':');
    if (firstColon === -1)
        return trimmed;
    // Two or more colons cannot be host:port; treat as a bare IPv6 literal.
    if (trimmed.indexOf(':', firstColon + 1) !== -1)
        return trimmed;
    return trimmed.slice(0, firstColon) || null;
}
17
// The recognised log level names.
+ const ALLOWED_LOG_LEVELS = new Set([
18
+ 'debug',
19
+ 'info',
20
+ 'warn',
21
+ 'error',
22
+ ]);
23
// True when value is exactly one of the names above (the check is case-sensitive).
+ function isLogLevel(value) {
24
+ return ALLOWED_LOG_LEVELS.has(value);
25
+ }
26
// True when a lower bound is given and value is strictly below it; an undefined min means "no lower bound".
+ function isBelowMin(value, min) {
27
+ if (min === undefined)
28
+ return false;
29
+ return value < min;
30
+ }
31
// True when an upper bound is given and value is strictly above it; an undefined max means "no upper bound".
+ function isAboveMax(value, max) {
32
+ if (max === undefined)
33
+ return false;
34
+ return value > max;
35
+ }
36
/**
 * Parses an integer from an environment value.
 *
 * The whole (trimmed) value must be an optionally signed run of decimal
 * digits. `parseInt` alone accepts any numeric prefix ("15abc" -> 15,
 * "3.9" -> 3), which would let a malformed setting through silently.
 *
 * @param envValue Raw environment value, possibly undefined.
 * @param defaultValue Value returned whenever the input is not usable.
 * @param min Optional inclusive lower bound.
 * @param max Optional inclusive upper bound.
 * @returns The parsed integer, or `defaultValue` when the input is unset,
 *          empty, not an integer, or outside the given bounds.
 */
export function parseInteger(envValue, defaultValue, min, max) {
    if (!envValue)
        return defaultValue;
    const trimmed = envValue.trim();
    if (!/^[+-]?\d+$/.test(trimmed))
        return defaultValue;
    const parsed = Number.parseInt(trimmed, 10);
    // Digit strings too long to represent exactly are treated as invalid.
    if (!Number.isSafeInteger(parsed))
        return defaultValue;
    if (min !== undefined && parsed < min)
        return defaultValue;
    if (max !== undefined && parsed > max)
        return defaultValue;
    return parsed;
}
48
/**
 * Parses a boolean from an environment value.
 *
 * The value is trimmed and compared case-insensitively, so `FALSE`,
 * ` false `, `0`, `no` and `off` all disable the setting. Any other
 * non-blank value enables it.
 *
 * @param envValue Raw environment value, possibly undefined.
 * @param defaultValue Value returned when the input is unset or blank.
 * @returns The parsed boolean, or `defaultValue` when nothing was provided.
 */
export function parseBoolean(envValue, defaultValue) {
    if (!envValue)
        return defaultValue;
    const normalized = envValue.trim().toLowerCase();
    if (!normalized)
        return defaultValue;
    return !['false', '0', 'no', 'off'].includes(normalized);
}
53
// Splits a list-style environment value on runs of whitespace and/or commas.
// Unset or empty input gives []. The filter drops the empty entry that split()
// produces when the value starts or ends with a separator.
+ export function parseList(envValue) {
54
+ if (!envValue)
55
+ return [];
56
+ return envValue
57
+ .split(/[\s,]+/)
58
+ .map((entry) => entry.trim())
59
+ .filter((entry) => entry.length > 0);
60
+ }
61
// Parses a URL-valued environment variable.
// Returns undefined when the value is unset/empty. Throws an Error naming the
// variable (and echoing the raw value) when URL.canParse rejects it.
+ export function parseUrlEnv(value, name) {
62
+ if (!value)
63
+ return undefined;
64
+ if (!URL.canParse(value)) {
65
+ throw new Error(`Invalid ${name} value: ${value}`);
66
+ }
67
+ return new URL(value);
68
+ }
69
// Builds the set of allowed hosts from a comma/whitespace separated value.
// Each entry is normalized first; entries that normalize to a falsy value
// (null or '') are skipped.
+ export function parseAllowedHosts(envValue) {
70
+ const hosts = new Set();
71
+ for (const entry of parseList(envValue)) {
72
+ const normalized = normalizeHostValue(entry);
73
+ if (normalized) {
74
+ hosts.add(normalized);
75
+ }
76
+ }
77
+ return hosts;
78
+ }
79
// Resolves the log level from an environment value, case-insensitively.
// Unset, empty or unrecognised values fall back to 'info'.
+ export function parseLogLevel(envValue) {
80
+ const level = envValue?.toLowerCase();
81
+ if (!level)
82
+ return 'info';
83
+ return isLogLevel(level) ? level : 'info';
84
+ }
@@ -1,7 +1,7 @@
1
1
  export declare const TRUNCATION_MARKER = "...[truncated]";
2
2
  export declare const CODE_BLOCK: {
3
- readonly fence: "```";
4
- readonly format: (code: string, language?: string) => string;
3
+ fence: string;
4
+ format: (code: string, language?: string) => string;
5
5
  };
6
6
  export declare const FRONTMATTER_DELIMITER = "---";
7
7
  export declare const joinLines: (lines: readonly string[]) => string;