@j0hanz/superfetch 1.2.5 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/README.md +131 -156
  2. package/dist/config/auth-config.d.ts +16 -0
  3. package/dist/config/auth-config.js +53 -0
  4. package/dist/config/constants.d.ts +11 -13
  5. package/dist/config/constants.js +1 -3
  6. package/dist/config/env-parsers.d.ts +7 -0
  7. package/dist/config/env-parsers.js +84 -0
  8. package/dist/config/formatting.d.ts +2 -2
  9. package/dist/config/index.d.ts +47 -53
  10. package/dist/config/index.js +35 -64
  11. package/dist/config/types/content.d.ts +1 -49
  12. package/dist/config/types/runtime.d.ts +8 -16
  13. package/dist/config/types/tools.d.ts +2 -28
  14. package/dist/http/accept-policy.d.ts +3 -0
  15. package/dist/http/accept-policy.js +45 -0
  16. package/dist/http/async-handler.d.ts +2 -0
  17. package/dist/http/async-handler.js +5 -0
  18. package/dist/http/auth-introspection.d.ts +2 -0
  19. package/dist/http/auth-introspection.js +141 -0
  20. package/dist/http/auth-static.d.ts +2 -0
  21. package/dist/http/auth-static.js +23 -0
  22. package/dist/http/auth.d.ts +3 -2
  23. package/dist/http/auth.js +254 -23
  24. package/dist/http/cors.d.ts +6 -6
  25. package/dist/http/cors.js +7 -42
  26. package/dist/http/download-routes.d.ts +0 -12
  27. package/dist/http/download-routes.js +21 -58
  28. package/dist/http/host-allowlist.d.ts +3 -0
  29. package/dist/http/host-allowlist.js +117 -0
  30. package/dist/http/jsonrpc-http.d.ts +2 -0
  31. package/dist/http/jsonrpc-http.js +10 -0
  32. package/dist/http/mcp-routes.d.ts +8 -3
  33. package/dist/http/mcp-routes.js +137 -31
  34. package/dist/http/mcp-session-eviction.d.ts +3 -0
  35. package/dist/http/mcp-session-eviction.js +24 -0
  36. package/dist/http/mcp-session-helpers.d.ts +0 -1
  37. package/dist/http/mcp-session-helpers.js +1 -1
  38. package/dist/http/mcp-session-init.d.ts +7 -0
  39. package/dist/http/mcp-session-init.js +94 -0
  40. package/dist/http/mcp-session-slots.d.ts +17 -0
  41. package/dist/http/mcp-session-slots.js +55 -0
  42. package/dist/http/mcp-session-transport-init.d.ts +7 -0
  43. package/dist/http/mcp-session-transport-init.js +41 -0
  44. package/dist/http/mcp-session-transport.d.ts +7 -0
  45. package/dist/http/mcp-session-transport.js +57 -0
  46. package/dist/http/mcp-session-types.d.ts +5 -0
  47. package/dist/http/mcp-session-types.js +1 -0
  48. package/dist/http/mcp-session.d.ts +9 -9
  49. package/dist/http/mcp-session.js +15 -137
  50. package/dist/http/mcp-sessions.d.ts +43 -0
  51. package/dist/http/mcp-sessions.js +392 -0
  52. package/dist/http/mcp-validation.d.ts +1 -0
  53. package/dist/http/mcp-validation.js +11 -10
  54. package/dist/http/protocol-policy.d.ts +2 -0
  55. package/dist/http/protocol-policy.js +31 -0
  56. package/dist/http/rate-limit.js +7 -4
  57. package/dist/http/server-config.d.ts +1 -0
  58. package/dist/http/server-config.js +40 -0
  59. package/dist/http/server-middleware.d.ts +7 -9
  60. package/dist/http/server-middleware.js +9 -70
  61. package/dist/http/server-shutdown.d.ts +4 -0
  62. package/dist/http/server-shutdown.js +43 -0
  63. package/dist/http/server.d.ts +10 -0
  64. package/dist/http/server.js +546 -61
  65. package/dist/http/session-cleanup.js +8 -5
  66. package/dist/middleware/error-handler.d.ts +1 -1
  67. package/dist/middleware/error-handler.js +32 -33
  68. package/dist/resources/cached-content-params.d.ts +5 -0
  69. package/dist/resources/cached-content-params.js +36 -0
  70. package/dist/resources/cached-content.js +67 -125
  71. package/dist/resources/index.js +0 -82
  72. package/dist/server.js +50 -29
  73. package/dist/services/cache-events.d.ts +8 -0
  74. package/dist/services/cache-events.js +19 -0
  75. package/dist/services/cache-keys.d.ts +7 -0
  76. package/dist/services/cache-keys.js +57 -0
  77. package/dist/services/cache.d.ts +4 -9
  78. package/dist/services/cache.js +77 -139
  79. package/dist/services/context.d.ts +0 -1
  80. package/dist/services/context.js +0 -7
  81. package/dist/services/extractor.js +55 -116
  82. package/dist/services/fetcher/agents.d.ts +2 -2
  83. package/dist/services/fetcher/agents.js +35 -96
  84. package/dist/services/fetcher/dns-selection.d.ts +2 -0
  85. package/dist/services/fetcher/dns-selection.js +72 -0
  86. package/dist/services/fetcher/interceptors.d.ts +0 -22
  87. package/dist/services/fetcher/interceptors.js +18 -32
  88. package/dist/services/fetcher/redirects.js +16 -7
  89. package/dist/services/fetcher/response.js +79 -34
  90. package/dist/services/fetcher.d.ts +22 -3
  91. package/dist/services/fetcher.js +544 -44
  92. package/dist/services/fifo-queue.d.ts +8 -0
  93. package/dist/services/fifo-queue.js +25 -0
  94. package/dist/services/logger.js +2 -2
  95. package/dist/services/metadata-collector.d.ts +1 -9
  96. package/dist/services/metadata-collector.js +71 -2
  97. package/dist/services/transform-worker-pool.d.ts +4 -14
  98. package/dist/services/transform-worker-pool.js +177 -129
  99. package/dist/services/transform-worker-types.d.ts +32 -0
  100. package/dist/services/transform-worker-types.js +14 -0
  101. package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
  102. package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
  103. package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
  104. package/dist/tools/handlers/fetch-single.shared.js +175 -89
  105. package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
  106. package/dist/tools/handlers/fetch-url.tool.js +84 -119
  107. package/dist/tools/index.js +21 -40
  108. package/dist/tools/schemas.d.ts +1 -51
  109. package/dist/tools/schemas.js +1 -107
  110. package/dist/tools/utils/cached-markdown.d.ts +5 -0
  111. package/dist/tools/utils/cached-markdown.js +46 -0
  112. package/dist/tools/utils/content-shaping.d.ts +4 -0
  113. package/dist/tools/utils/content-shaping.js +67 -0
  114. package/dist/tools/utils/content-transform.d.ts +5 -17
  115. package/dist/tools/utils/content-transform.js +134 -114
  116. package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
  117. package/dist/tools/utils/fetch-pipeline.js +57 -63
  118. package/dist/tools/utils/frontmatter.d.ts +3 -0
  119. package/dist/tools/utils/frontmatter.js +73 -0
  120. package/dist/tools/utils/inline-content.d.ts +1 -2
  121. package/dist/tools/utils/inline-content.js +4 -7
  122. package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
  123. package/dist/tools/utils/markdown-heuristics.js +19 -0
  124. package/dist/tools/utils/markdown-signals.d.ts +1 -0
  125. package/dist/tools/utils/markdown-signals.js +19 -0
  126. package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
  127. package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
  128. package/dist/tools/utils/raw-markdown.d.ts +6 -0
  129. package/dist/tools/utils/raw-markdown.js +135 -0
  130. package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
  131. package/dist/transformers/markdown/fenced-code-rule.js +38 -0
  132. package/dist/transformers/markdown/frontmatter.d.ts +2 -0
  133. package/dist/transformers/markdown/frontmatter.js +45 -0
  134. package/dist/transformers/markdown/noise-rule.d.ts +2 -0
  135. package/dist/transformers/markdown/noise-rule.js +80 -0
  136. package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
  137. package/dist/transformers/markdown/turndown-instance.js +19 -0
  138. package/dist/transformers/markdown.d.ts +2 -0
  139. package/dist/transformers/markdown.js +185 -0
  140. package/dist/transformers/markdown.transformer.js +5 -117
  141. package/dist/utils/cached-payload.d.ts +7 -0
  142. package/dist/utils/cached-payload.js +36 -0
  143. package/dist/utils/code-language-bash.d.ts +1 -0
  144. package/dist/utils/code-language-bash.js +48 -0
  145. package/dist/utils/code-language-core.d.ts +2 -0
  146. package/dist/utils/code-language-core.js +13 -0
  147. package/dist/utils/code-language-detectors.d.ts +5 -0
  148. package/dist/utils/code-language-detectors.js +142 -0
  149. package/dist/utils/code-language-helpers.d.ts +5 -0
  150. package/dist/utils/code-language-helpers.js +62 -0
  151. package/dist/utils/code-language-parsing.d.ts +5 -0
  152. package/dist/utils/code-language-parsing.js +62 -0
  153. package/dist/utils/code-language.d.ts +9 -0
  154. package/dist/utils/code-language.js +250 -46
  155. package/dist/utils/error-details.d.ts +3 -0
  156. package/dist/utils/error-details.js +12 -0
  157. package/dist/utils/error-utils.js +1 -1
  158. package/dist/utils/filename-generator.js +34 -12
  159. package/dist/utils/guards.d.ts +1 -0
  160. package/dist/utils/guards.js +3 -0
  161. package/dist/utils/header-normalizer.d.ts +0 -3
  162. package/dist/utils/header-normalizer.js +3 -3
  163. package/dist/utils/ip-address.d.ts +4 -0
  164. package/dist/utils/ip-address.js +6 -0
  165. package/dist/utils/tool-error-handler.d.ts +2 -2
  166. package/dist/utils/tool-error-handler.js +14 -46
  167. package/dist/utils/url-transformer.d.ts +7 -0
  168. package/dist/utils/url-transformer.js +147 -0
  169. package/dist/utils/url-validator.d.ts +1 -2
  170. package/dist/utils/url-validator.js +53 -114
  171. package/dist/workers/content-transform.worker.d.ts +1 -0
  172. package/dist/workers/content-transform.worker.js +40 -0
  173. package/package.json +17 -18
@@ -1,27 +1,23 @@
1
1
  import * as cache from '../../services/cache.js';
2
- import { fetchNormalizedUrlWithRetry } from '../../services/fetcher.js';
2
+ import { createCacheKey } from '../../services/cache-keys.js';
3
+ import { fetchNormalizedUrl } from '../../services/fetcher.js';
3
4
  import { logDebug } from '../../services/logger.js';
4
- import { assertResolvedAddressesAllowed, normalizeUrl, } from '../../utils/url-validator.js';
5
- import { appendHeaderVary } from './cache-vary.js';
6
- function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
5
+ import { isRecord } from '../../utils/guards.js';
6
+ import { transformToRawUrl } from '../../utils/url-transformer.js';
7
+ import { normalizeUrl } from '../../utils/url-validator.js';
8
+ function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
7
9
  if (!cacheKey)
8
10
  return null;
9
11
  const cached = cache.get(cacheKey);
10
12
  if (!cached)
11
13
  return null;
12
14
  if (!deserialize) {
13
- logDebug('Cache miss due to missing deserializer', {
14
- namespace: cacheNamespace,
15
- url: normalizedUrl,
16
- });
15
+ logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
17
16
  return null;
18
17
  }
19
18
  const data = deserialize(cached.content);
20
19
  if (data === undefined) {
21
- logDebug('Cache miss due to deserialize failure', {
22
- namespace: cacheNamespace,
23
- url: normalizedUrl,
24
- });
20
+ logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
25
21
  return null;
26
22
  }
27
23
  logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
@@ -33,72 +29,70 @@ function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalized
33
29
  cacheKey,
34
30
  };
35
31
  }
36
- /**
37
- * Unified fetch pipeline that handles caching, fetching, and transformation.
38
- * Implements cache-first strategy with automatic serialization.
39
- *
40
- * @template T - Type of the transformed result
41
- * @param options - Pipeline configuration options
42
- * @returns Promise resolving to the pipeline result
43
- */
32
+ function resolveNormalizedUrl(url) {
33
+ const { normalizedUrl: validatedUrl } = normalizeUrl(url);
34
+ const { url: normalizedUrl, transformed } = transformToRawUrl(validatedUrl);
35
+ return { normalizedUrl, originalUrl: validatedUrl, transformed };
36
+ }
44
37
  export async function executeFetchPipeline(options) {
45
- const { normalizedUrl, hostname } = normalizeUrl(options.url);
46
- const cacheKey = resolveCacheKey(options, normalizedUrl);
47
- const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, normalizedUrl);
38
+ const resolvedUrl = resolveNormalizedUrl(options.url);
39
+ logRawUrlTransformation(resolvedUrl);
40
+ const cacheKey = createCacheKey(options.cacheNamespace, resolvedUrl.normalizedUrl, options.cacheVary);
41
+ const cachedResult = attemptCacheRetrieval({
42
+ cacheKey,
43
+ deserialize: options.deserialize,
44
+ cacheNamespace: options.cacheNamespace,
45
+ normalizedUrl: resolvedUrl.normalizedUrl,
46
+ });
48
47
  if (cachedResult)
49
48
  return cachedResult;
50
- await assertResolvedAddressesAllowed(hostname);
51
- const fetchOptions = buildFetchOptions(options);
52
- logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
53
- const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
54
- const data = await options.transform(html, normalizedUrl);
49
+ logDebug('Fetching URL', { url: resolvedUrl.normalizedUrl });
50
+ const fetchOptions = options.signal === undefined ? {} : { signal: options.signal };
51
+ const html = await fetchNormalizedUrl(resolvedUrl.normalizedUrl, fetchOptions);
52
+ const data = await options.transform(html, resolvedUrl.normalizedUrl);
55
53
  if (cache.isEnabled()) {
56
- persistCache(cacheKey, data, options.serialize, normalizedUrl);
57
- }
58
- return buildPipelineResult(normalizedUrl, data, cacheKey);
59
- }
60
- function resolveCacheKey(options, normalizedUrl) {
61
- const cacheVary = appendHeaderVary(options.cacheVary, options.customHeaders);
62
- return cache.createCacheKey(options.cacheNamespace, normalizedUrl, cacheVary);
63
- }
64
- function buildFetchOptions(options) {
65
- const fetchOptions = {};
66
- if (options.customHeaders !== undefined) {
67
- fetchOptions.customHeaders = options.customHeaders;
68
- }
69
- if (options.signal !== undefined) {
70
- fetchOptions.signal = options.signal;
71
- }
72
- if (options.timeout !== undefined) {
73
- fetchOptions.timeout = options.timeout;
54
+ persistCache({
55
+ cacheKey,
56
+ data,
57
+ serialize: options.serialize,
58
+ normalizedUrl: resolvedUrl.normalizedUrl,
59
+ });
74
60
  }
75
- return fetchOptions;
61
+ return {
62
+ data,
63
+ fromCache: false,
64
+ url: resolvedUrl.normalizedUrl,
65
+ fetchedAt: new Date().toISOString(),
66
+ cacheKey,
67
+ };
76
68
  }
77
- function persistCache(cacheKey, data, serialize, normalizedUrl) {
69
+ function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
78
70
  if (!cacheKey)
79
71
  return;
80
72
  const serializer = serialize ?? JSON.stringify;
81
- const metadata = { url: normalizedUrl };
82
73
  const title = extractTitle(data);
83
- if (title !== undefined) {
84
- metadata.title = title;
85
- }
74
+ const metadata = {
75
+ url: normalizedUrl,
76
+ ...(title === undefined ? {} : { title }),
77
+ };
86
78
  cache.set(cacheKey, serializer(data), metadata);
87
79
  }
88
80
  function extractTitle(value) {
89
- if (!value || typeof value !== 'object')
90
- return undefined;
91
- if (!('title' in value))
81
+ if (!isRecord(value))
92
82
  return undefined;
93
83
  const { title } = value;
94
84
  return typeof title === 'string' ? title : undefined;
95
85
  }
96
- function buildPipelineResult(url, data, cacheKey) {
97
- return {
98
- data,
99
- fromCache: false,
100
- url,
101
- fetchedAt: new Date().toISOString(),
102
- cacheKey,
103
- };
86
+ function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
87
+ logDebug(`Cache miss due to ${reason}`, {
88
+ namespace: cacheNamespace,
89
+ url: normalizedUrl,
90
+ });
91
+ }
92
+ function logRawUrlTransformation(resolvedUrl) {
93
+ if (!resolvedUrl.transformed)
94
+ return;
95
+ logDebug('Using transformed raw content URL', {
96
+ original: resolvedUrl.originalUrl,
97
+ });
104
98
  }
@@ -0,0 +1,3 @@
1
+ export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
2
+ export declare function addSourceToMarkdown(content: string, url: string): string;
3
+ export declare function hasFrontmatter(trimmed: string): boolean;
@@ -0,0 +1,73 @@
1
+ function detectLineEnding(content) {
2
+ return content.includes('\r\n') ? '\r\n' : '\n';
3
+ }
4
+ function findFrontmatterLines(content) {
5
+ const lineEnding = detectLineEnding(content);
6
+ const lines = content.split(lineEnding);
7
+ if (lines[0] !== '---')
8
+ return null;
9
+ const endIndex = lines.indexOf('---', 1);
10
+ if (endIndex === -1)
11
+ return null;
12
+ return { lineEnding, lines, endIndex };
13
+ }
14
+ function stripOptionalQuotes(value) {
15
+ const trimmed = value.trim();
16
+ if (trimmed.length < 2)
17
+ return trimmed;
18
+ const first = trimmed[0];
19
+ const last = trimmed[trimmed.length - 1];
20
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
21
+ return trimmed.slice(1, -1).trim();
22
+ }
23
+ return trimmed;
24
+ }
25
+ function parseFrontmatterEntry(line) {
26
+ const trimmed = line.trim();
27
+ if (!trimmed)
28
+ return null;
29
+ const separatorIndex = trimmed.indexOf(':');
30
+ if (separatorIndex <= 0)
31
+ return null;
32
+ const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
33
+ const value = trimmed.slice(separatorIndex + 1);
34
+ return { key, value };
35
+ }
36
+ function isTitleKey(key) {
37
+ return key === 'title' || key === 'name';
38
+ }
39
+ export function extractTitleFromRawMarkdown(content) {
40
+ const frontmatter = findFrontmatterLines(content);
41
+ if (!frontmatter)
42
+ return undefined;
43
+ const { lines, endIndex } = frontmatter;
44
+ const entry = lines
45
+ .slice(1, endIndex)
46
+ .map((line) => parseFrontmatterEntry(line))
47
+ .find((parsed) => parsed !== null && isTitleKey(parsed.key));
48
+ if (!entry)
49
+ return undefined;
50
+ const value = stripOptionalQuotes(entry.value);
51
+ return value || undefined;
52
+ }
53
+ export function addSourceToMarkdown(content, url) {
54
+ const frontmatter = findFrontmatterLines(content);
55
+ if (!frontmatter) {
56
+ return `---\nsource: "${url}"\n---\n\n${content}`;
57
+ }
58
+ const { lineEnding, lines, endIndex } = frontmatter;
59
+ const bodyLines = lines.slice(1, endIndex);
60
+ const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
61
+ if (hasSource)
62
+ return content;
63
+ const updatedLines = [
64
+ lines[0],
65
+ ...bodyLines,
66
+ `source: "${url}"`,
67
+ ...lines.slice(endIndex),
68
+ ];
69
+ return updatedLines.join(lineEnding);
70
+ }
71
+ export function hasFrontmatter(trimmed) {
72
+ return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
73
+ }
@@ -1,4 +1,3 @@
1
- type InlineContentFormat = 'jsonl' | 'markdown';
2
1
  interface InlineContentResult {
3
2
  content?: string;
4
3
  contentSize: number;
@@ -7,5 +6,5 @@ interface InlineContentResult {
7
6
  error?: string;
8
7
  truncated?: boolean;
9
8
  }
10
- export declare function applyInlineContentLimit(content: string, cacheKey: string | null, format: InlineContentFormat): InlineContentResult;
9
+ export declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
11
10
  export {};
@@ -1,7 +1,7 @@
1
1
  import { TRUNCATION_MARKER } from '../../config/formatting.js';
2
2
  import { config } from '../../config/index.js';
3
- import * as cache from '../../services/cache.js';
4
- export function applyInlineContentLimit(content, cacheKey, format) {
3
+ import { toResourceUri } from '../../services/cache-keys.js';
4
+ export function applyInlineContentLimit(content, cacheKey) {
5
5
  const contentSize = content.length;
6
6
  const inlineLimit = config.constants.maxInlineContentChars;
7
7
  if (contentSize <= inlineLimit) {
@@ -14,16 +14,13 @@ export function applyInlineContentLimit(content, cacheKey, format) {
14
14
  return {
15
15
  contentSize,
16
16
  resourceUri,
17
- resourceMimeType: resolveResourceMimeType(format),
17
+ resourceMimeType: 'text/markdown',
18
18
  };
19
19
  }
20
20
  function resolveResourceUri(cacheKey) {
21
21
  if (!config.cache.enabled || !cacheKey)
22
22
  return null;
23
- return cache.toResourceUri(cacheKey);
24
- }
25
- function resolveResourceMimeType(format) {
26
- return format === 'markdown' ? 'text/markdown' : 'application/jsonl';
23
+ return toResourceUri(cacheKey);
27
24
  }
28
25
  function buildTruncatedFallback(content, contentSize, inlineLimit) {
29
26
  const maxContentLength = Math.max(0, inlineLimit - TRUNCATION_MARKER.length);
@@ -0,0 +1 @@
1
+ export declare function looksLikeMarkdown(content: string): boolean;
@@ -0,0 +1,19 @@
1
+ const HEADING_PATTERN = /^#{1,6}\s/m;
2
+ const LIST_PATTERN = /^(?:[-*+])\s/m;
3
+ export function looksLikeMarkdown(content) {
4
+ return (containsMarkdownHeading(content) ||
5
+ containsMarkdownList(content) ||
6
+ containsFencedCodeBlock(content));
7
+ }
8
+ function containsMarkdownHeading(content) {
9
+ return HEADING_PATTERN.test(content);
10
+ }
11
+ function containsMarkdownList(content) {
12
+ return LIST_PATTERN.test(content);
13
+ }
14
+ function containsFencedCodeBlock(content) {
15
+ const first = content.indexOf('```');
16
+ if (first === -1)
17
+ return false;
18
+ return content.includes('```', first + 3);
19
+ }
@@ -0,0 +1 @@
1
+ export declare function looksLikeMarkdown(content: string): boolean;
@@ -0,0 +1,19 @@
1
+ const HEADING_PATTERN = /^#{1,6}\s/m;
2
+ const LIST_PATTERN = /^(?:[-*+])\s/m;
3
+ export function looksLikeMarkdown(content) {
4
+ return (containsMarkdownHeading(content) ||
5
+ containsMarkdownList(content) ||
6
+ containsFencedCodeBlock(content));
7
+ }
8
+ function containsMarkdownHeading(content) {
9
+ return HEADING_PATTERN.test(content);
10
+ }
11
+ function containsMarkdownList(content) {
12
+ return LIST_PATTERN.test(content);
13
+ }
14
+ function containsFencedCodeBlock(content) {
15
+ const first = content.indexOf('```');
16
+ if (first === -1)
17
+ return false;
18
+ return content.includes('```', first + 3);
19
+ }
@@ -0,0 +1,3 @@
1
+ export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
2
+ export declare function addSourceToMarkdown(content: string, url: string): string;
3
+ export declare function hasFrontmatter(trimmed: string): boolean;
@@ -0,0 +1,73 @@
1
+ function detectLineEnding(content) {
2
+ return content.includes('\r\n') ? '\r\n' : '\n';
3
+ }
4
+ function findFrontmatterLines(content) {
5
+ const lineEnding = detectLineEnding(content);
6
+ const lines = content.split(lineEnding);
7
+ if (lines[0] !== '---')
8
+ return null;
9
+ const endIndex = lines.indexOf('---', 1);
10
+ if (endIndex === -1)
11
+ return null;
12
+ return { lineEnding, lines, endIndex };
13
+ }
14
+ function stripOptionalQuotes(value) {
15
+ const trimmed = value.trim();
16
+ if (trimmed.length < 2)
17
+ return trimmed;
18
+ const first = trimmed[0];
19
+ const last = trimmed[trimmed.length - 1];
20
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
21
+ return trimmed.slice(1, -1).trim();
22
+ }
23
+ return trimmed;
24
+ }
25
+ function parseFrontmatterEntry(line) {
26
+ const trimmed = line.trim();
27
+ if (!trimmed)
28
+ return null;
29
+ const separatorIndex = trimmed.indexOf(':');
30
+ if (separatorIndex <= 0)
31
+ return null;
32
+ const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
33
+ const value = trimmed.slice(separatorIndex + 1);
34
+ return { key, value };
35
+ }
36
+ function isTitleKey(key) {
37
+ return key === 'title' || key === 'name';
38
+ }
39
+ export function extractTitleFromRawMarkdown(content) {
40
+ const frontmatter = findFrontmatterLines(content);
41
+ if (!frontmatter)
42
+ return undefined;
43
+ const { lines, endIndex } = frontmatter;
44
+ const entry = lines
45
+ .slice(1, endIndex)
46
+ .map((line) => parseFrontmatterEntry(line))
47
+ .find((parsed) => parsed !== null && isTitleKey(parsed.key));
48
+ if (!entry)
49
+ return undefined;
50
+ const value = stripOptionalQuotes(entry.value);
51
+ return value || undefined;
52
+ }
53
+ export function addSourceToMarkdown(content, url) {
54
+ const frontmatter = findFrontmatterLines(content);
55
+ if (!frontmatter) {
56
+ return `---\nsource: "${url}"\n---\n\n${content}`;
57
+ }
58
+ const { lineEnding, lines, endIndex } = frontmatter;
59
+ const bodyLines = lines.slice(1, endIndex);
60
+ const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
61
+ if (hasSource)
62
+ return content;
63
+ const updatedLines = [
64
+ lines[0],
65
+ ...bodyLines,
66
+ `source: "${url}"`,
67
+ ...lines.slice(endIndex),
68
+ ];
69
+ return updatedLines.join(lineEnding);
70
+ }
71
+ export function hasFrontmatter(trimmed) {
72
+ return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
73
+ }
@@ -0,0 +1,6 @@
1
+ import type { MarkdownTransformResult } from '../../config/types/content.js';
2
+ export declare function tryTransformRawContent({ html, url, includeMetadata, }: {
3
+ html: string;
4
+ url: string;
5
+ includeMetadata: boolean;
6
+ }): MarkdownTransformResult | null;
@@ -0,0 +1,135 @@
1
+ import { logDebug } from '../../services/logger.js';
2
+ import { isRawTextContentUrl } from '../../utils/url-transformer.js';
3
+ const HEADING_PATTERN = /^#{1,6}\s/m;
4
+ const LIST_PATTERN = /^(?:[-*+])\s/m;
5
+ const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
6
+ function containsMarkdownHeading(content) {
7
+ return HEADING_PATTERN.test(content);
8
+ }
9
+ function containsMarkdownList(content) {
10
+ return LIST_PATTERN.test(content);
11
+ }
12
+ function containsFencedCodeBlock(content) {
13
+ const first = content.indexOf('```');
14
+ if (first === -1)
15
+ return false;
16
+ return content.includes('```', first + 3);
17
+ }
18
+ function looksLikeMarkdown(content) {
19
+ return (containsMarkdownHeading(content) ||
20
+ containsMarkdownList(content) ||
21
+ containsFencedCodeBlock(content));
22
+ }
23
+ function detectLineEnding(content) {
24
+ return content.includes('\r\n') ? '\r\n' : '\n';
25
+ }
26
+ function findFrontmatterLines(content) {
27
+ const lineEnding = detectLineEnding(content);
28
+ const lines = content.split(lineEnding);
29
+ if (lines[0] !== '---')
30
+ return null;
31
+ const endIndex = lines.indexOf('---', 1);
32
+ if (endIndex === -1)
33
+ return null;
34
+ return { lineEnding, lines, endIndex };
35
+ }
36
+ function stripOptionalQuotes(value) {
37
+ const trimmed = value.trim();
38
+ if (trimmed.length < 2)
39
+ return trimmed;
40
+ const first = trimmed[0];
41
+ const last = trimmed[trimmed.length - 1];
42
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
43
+ return trimmed.slice(1, -1).trim();
44
+ }
45
+ return trimmed;
46
+ }
47
+ function parseFrontmatterEntry(line) {
48
+ const trimmed = line.trim();
49
+ if (!trimmed)
50
+ return null;
51
+ const separatorIndex = trimmed.indexOf(':');
52
+ if (separatorIndex <= 0)
53
+ return null;
54
+ const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
55
+ const value = trimmed.slice(separatorIndex + 1);
56
+ return { key, value };
57
+ }
58
+ function isTitleKey(key) {
59
+ return key === 'title' || key === 'name';
60
+ }
61
+ function extractTitleFromRawMarkdown(content) {
62
+ const frontmatter = findFrontmatterLines(content);
63
+ if (!frontmatter)
64
+ return undefined;
65
+ const { lines, endIndex } = frontmatter;
66
+ const entry = lines
67
+ .slice(1, endIndex)
68
+ .map((line) => parseFrontmatterEntry(line))
69
+ .find((parsed) => parsed !== null && isTitleKey(parsed.key));
70
+ if (!entry)
71
+ return undefined;
72
+ const value = stripOptionalQuotes(entry.value);
73
+ return value || undefined;
74
+ }
75
+ function addSourceToMarkdown(content, url) {
76
+ const frontmatter = findFrontmatterLines(content);
77
+ if (!frontmatter) {
78
+ return `---\nsource: "${url}"\n---\n\n${content}`;
79
+ }
80
+ const { lineEnding, lines, endIndex } = frontmatter;
81
+ const bodyLines = lines.slice(1, endIndex);
82
+ const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
83
+ if (hasSource)
84
+ return content;
85
+ const updatedLines = [
86
+ lines[0],
87
+ ...bodyLines,
88
+ `source: "${url}"`,
89
+ ...lines.slice(endIndex),
90
+ ];
91
+ return updatedLines.join(lineEnding);
92
+ }
93
+ function hasFrontmatter(trimmed) {
94
+ return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
95
+ }
96
+ function looksLikeHtmlDocument(trimmed) {
97
+ return HTML_DOCUMENT_PATTERN.test(trimmed);
98
+ }
99
+ function countCommonHtmlTags(content) {
100
+ const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
101
+ [];
102
+ return matches.length;
103
+ }
104
+ function isRawTextContent(content) {
105
+ const trimmed = content.trim();
106
+ const isHtmlDocument = looksLikeHtmlDocument(trimmed);
107
+ const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
108
+ const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
109
+ const isMarkdown = looksLikeMarkdown(content);
110
+ return (!isHtmlDocument &&
111
+ (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
112
+ }
113
+ function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
114
+ const title = extractTitleFromRawMarkdown(rawContent);
115
+ const content = includeMetadata
116
+ ? addSourceToMarkdown(rawContent, url)
117
+ : rawContent;
118
+ return { content, title };
119
+ }
120
+ export function tryTransformRawContent({ html, url, includeMetadata, }) {
121
+ if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
122
+ return null;
123
+ }
124
+ logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
125
+ const { content, title } = buildRawMarkdownPayload({
126
+ rawContent: html,
127
+ url,
128
+ includeMetadata,
129
+ });
130
+ return {
131
+ markdown: content,
132
+ title,
133
+ truncated: false,
134
+ };
135
+ }
@@ -0,0 +1,2 @@
1
+ import type TurndownService from 'turndown';
2
+ export declare function addFencedCodeRule(instance: TurndownService): void;
@@ -0,0 +1,38 @@
1
+ import { CODE_BLOCK } from '../../config/formatting.js';
2
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../../utils/code-language.js';
3
+ import { isRecord } from '../../utils/guards.js';
4
+ function isElement(node) {
5
+ return (isRecord(node) &&
6
+ 'getAttribute' in node &&
7
+ typeof node.getAttribute === 'function');
8
+ }
9
+ function isFencedCodeBlock(node, options) {
10
+ return (options.codeBlockStyle === 'fenced' &&
11
+ node.nodeName === 'PRE' &&
12
+ node.firstChild?.nodeName === 'CODE');
13
+ }
14
+ function formatFencedCodeBlock(node) {
15
+ const codeNode = node.firstChild;
16
+ if (!isElement(codeNode))
17
+ return '';
18
+ const code = codeNode.textContent || '';
19
+ const language = resolveCodeLanguage(codeNode, code);
20
+ return CODE_BLOCK.format(code, language);
21
+ }
22
+ function resolveCodeLanguage(codeNode, code) {
23
+ const { className, dataLanguage } = readCodeAttributes(codeNode);
24
+ const attributeLanguage = resolveLanguageFromAttributes(className, dataLanguage);
25
+ return attributeLanguage ?? detectLanguageFromCode(code) ?? '';
26
+ }
27
+ function readCodeAttributes(codeNode) {
28
+ return {
29
+ className: codeNode.getAttribute('class') ?? '',
30
+ dataLanguage: codeNode.getAttribute('data-language') ?? '',
31
+ };
32
+ }
33
+ export function addFencedCodeRule(instance) {
34
+ instance.addRule('fencedCodeBlockWithLanguage', {
35
+ filter: (node, options) => isFencedCodeBlock(node, options),
36
+ replacement: (_content, node) => formatFencedCodeBlock(node),
37
+ });
38
+ }
@@ -0,0 +1,2 @@
1
+ import type { MetadataBlock } from '../../config/types/content.js';
2
+ export declare function buildFrontmatter(metadata?: MetadataBlock): string;
@@ -0,0 +1,45 @@
1
+ import { FRONTMATTER_DELIMITER, joinLines } from '../../config/formatting.js';
2
+ const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
3
+ const YAML_NUMERIC = /^[\d.]+$/;
4
+ const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
5
+ const ESCAPE_PATTERNS = {
6
+ backslash: /\\/g,
7
+ quote: /"/g,
8
+ newline: /\n/g,
9
+ tab: /\t/g,
10
+ };
11
+ const YAML_QUOTE_CHECKS = [
12
+ (input) => YAML_SPECIAL_CHARS.test(input),
13
+ (input) => input.startsWith(' ') || input.endsWith(' '),
14
+ (input) => input === '',
15
+ (input) => YAML_NUMERIC.test(input),
16
+ (input) => YAML_RESERVED_WORDS.test(input),
17
+ ];
18
+ function needsYamlQuotes(value) {
19
+ return YAML_QUOTE_CHECKS.some((check) => check(value));
20
+ }
21
+ function escapeYamlValue(value) {
22
+ if (!needsYamlQuotes(value)) {
23
+ return value;
24
+ }
25
+ const escaped = value
26
+ .replace(ESCAPE_PATTERNS.backslash, '\\\\')
27
+ .replace(ESCAPE_PATTERNS.quote, '\\"')
28
+ .replace(ESCAPE_PATTERNS.newline, '\\n')
29
+ .replace(ESCAPE_PATTERNS.tab, '\\t');
30
+ return `"${escaped}"`;
31
+ }
32
+ function appendFrontmatterField(lines, key, value) {
33
+ if (!value)
34
+ return;
35
+ lines.push(`${key}: ${escapeYamlValue(value)}`);
36
+ }
37
+ export function buildFrontmatter(metadata) {
38
+ if (!metadata)
39
+ return '';
40
+ const lines = [FRONTMATTER_DELIMITER];
41
+ appendFrontmatterField(lines, 'title', metadata.title);
42
+ appendFrontmatterField(lines, 'source', metadata.url);
43
+ lines.push(FRONTMATTER_DELIMITER);
44
+ return joinLines(lines);
45
+ }
@@ -0,0 +1,2 @@
1
+ import type TurndownService from 'turndown';
2
+ export declare function addNoiseRule(instance: TurndownService): void;