@j0hanz/superfetch 1.2.5 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/README.md +131 -156
  2. package/dist/config/auth-config.d.ts +16 -0
  3. package/dist/config/auth-config.js +53 -0
  4. package/dist/config/constants.d.ts +11 -13
  5. package/dist/config/constants.js +1 -3
  6. package/dist/config/env-parsers.d.ts +7 -0
  7. package/dist/config/env-parsers.js +84 -0
  8. package/dist/config/formatting.d.ts +2 -2
  9. package/dist/config/index.d.ts +47 -53
  10. package/dist/config/index.js +35 -64
  11. package/dist/config/types/content.d.ts +1 -49
  12. package/dist/config/types/runtime.d.ts +8 -16
  13. package/dist/config/types/tools.d.ts +2 -28
  14. package/dist/http/accept-policy.d.ts +3 -0
  15. package/dist/http/accept-policy.js +45 -0
  16. package/dist/http/async-handler.d.ts +2 -0
  17. package/dist/http/async-handler.js +5 -0
  18. package/dist/http/auth-introspection.d.ts +2 -0
  19. package/dist/http/auth-introspection.js +141 -0
  20. package/dist/http/auth-static.d.ts +2 -0
  21. package/dist/http/auth-static.js +23 -0
  22. package/dist/http/auth.d.ts +3 -2
  23. package/dist/http/auth.js +254 -23
  24. package/dist/http/cors.d.ts +6 -6
  25. package/dist/http/cors.js +7 -42
  26. package/dist/http/download-routes.d.ts +0 -12
  27. package/dist/http/download-routes.js +21 -58
  28. package/dist/http/host-allowlist.d.ts +3 -0
  29. package/dist/http/host-allowlist.js +117 -0
  30. package/dist/http/jsonrpc-http.d.ts +2 -0
  31. package/dist/http/jsonrpc-http.js +10 -0
  32. package/dist/http/mcp-routes.d.ts +8 -3
  33. package/dist/http/mcp-routes.js +137 -31
  34. package/dist/http/mcp-session-eviction.d.ts +3 -0
  35. package/dist/http/mcp-session-eviction.js +24 -0
  36. package/dist/http/mcp-session-helpers.d.ts +0 -1
  37. package/dist/http/mcp-session-helpers.js +1 -1
  38. package/dist/http/mcp-session-init.d.ts +7 -0
  39. package/dist/http/mcp-session-init.js +94 -0
  40. package/dist/http/mcp-session-slots.d.ts +17 -0
  41. package/dist/http/mcp-session-slots.js +55 -0
  42. package/dist/http/mcp-session-transport-init.d.ts +7 -0
  43. package/dist/http/mcp-session-transport-init.js +41 -0
  44. package/dist/http/mcp-session-transport.d.ts +7 -0
  45. package/dist/http/mcp-session-transport.js +57 -0
  46. package/dist/http/mcp-session-types.d.ts +5 -0
  47. package/dist/http/mcp-session-types.js +1 -0
  48. package/dist/http/mcp-session.d.ts +9 -9
  49. package/dist/http/mcp-session.js +15 -137
  50. package/dist/http/mcp-sessions.d.ts +43 -0
  51. package/dist/http/mcp-sessions.js +392 -0
  52. package/dist/http/mcp-validation.d.ts +1 -0
  53. package/dist/http/mcp-validation.js +11 -10
  54. package/dist/http/protocol-policy.d.ts +2 -0
  55. package/dist/http/protocol-policy.js +31 -0
  56. package/dist/http/rate-limit.js +7 -4
  57. package/dist/http/server-config.d.ts +1 -0
  58. package/dist/http/server-config.js +40 -0
  59. package/dist/http/server-middleware.d.ts +7 -9
  60. package/dist/http/server-middleware.js +9 -70
  61. package/dist/http/server-shutdown.d.ts +4 -0
  62. package/dist/http/server-shutdown.js +43 -0
  63. package/dist/http/server.d.ts +10 -0
  64. package/dist/http/server.js +546 -61
  65. package/dist/http/session-cleanup.js +8 -5
  66. package/dist/middleware/error-handler.d.ts +1 -1
  67. package/dist/middleware/error-handler.js +32 -33
  68. package/dist/resources/cached-content-params.d.ts +5 -0
  69. package/dist/resources/cached-content-params.js +36 -0
  70. package/dist/resources/cached-content.js +67 -125
  71. package/dist/resources/index.js +0 -82
  72. package/dist/server.js +50 -29
  73. package/dist/services/cache-events.d.ts +8 -0
  74. package/dist/services/cache-events.js +19 -0
  75. package/dist/services/cache-keys.d.ts +7 -0
  76. package/dist/services/cache-keys.js +57 -0
  77. package/dist/services/cache.d.ts +4 -9
  78. package/dist/services/cache.js +77 -139
  79. package/dist/services/context.d.ts +0 -1
  80. package/dist/services/context.js +0 -7
  81. package/dist/services/extractor.js +55 -116
  82. package/dist/services/fetcher/agents.d.ts +2 -2
  83. package/dist/services/fetcher/agents.js +35 -96
  84. package/dist/services/fetcher/dns-selection.d.ts +2 -0
  85. package/dist/services/fetcher/dns-selection.js +72 -0
  86. package/dist/services/fetcher/interceptors.d.ts +0 -22
  87. package/dist/services/fetcher/interceptors.js +18 -32
  88. package/dist/services/fetcher/redirects.js +16 -7
  89. package/dist/services/fetcher/response.js +79 -34
  90. package/dist/services/fetcher.d.ts +22 -3
  91. package/dist/services/fetcher.js +544 -44
  92. package/dist/services/fifo-queue.d.ts +8 -0
  93. package/dist/services/fifo-queue.js +25 -0
  94. package/dist/services/logger.js +2 -2
  95. package/dist/services/metadata-collector.d.ts +1 -9
  96. package/dist/services/metadata-collector.js +71 -2
  97. package/dist/services/transform-worker-pool.d.ts +4 -14
  98. package/dist/services/transform-worker-pool.js +177 -129
  99. package/dist/services/transform-worker-types.d.ts +32 -0
  100. package/dist/services/transform-worker-types.js +14 -0
  101. package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
  102. package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
  103. package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
  104. package/dist/tools/handlers/fetch-single.shared.js +175 -89
  105. package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
  106. package/dist/tools/handlers/fetch-url.tool.js +84 -119
  107. package/dist/tools/index.js +21 -40
  108. package/dist/tools/schemas.d.ts +1 -51
  109. package/dist/tools/schemas.js +1 -107
  110. package/dist/tools/utils/cached-markdown.d.ts +5 -0
  111. package/dist/tools/utils/cached-markdown.js +46 -0
  112. package/dist/tools/utils/content-shaping.d.ts +4 -0
  113. package/dist/tools/utils/content-shaping.js +67 -0
  114. package/dist/tools/utils/content-transform.d.ts +5 -17
  115. package/dist/tools/utils/content-transform.js +134 -114
  116. package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
  117. package/dist/tools/utils/fetch-pipeline.js +57 -63
  118. package/dist/tools/utils/frontmatter.d.ts +3 -0
  119. package/dist/tools/utils/frontmatter.js +73 -0
  120. package/dist/tools/utils/inline-content.d.ts +1 -2
  121. package/dist/tools/utils/inline-content.js +4 -7
  122. package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
  123. package/dist/tools/utils/markdown-heuristics.js +19 -0
  124. package/dist/tools/utils/markdown-signals.d.ts +1 -0
  125. package/dist/tools/utils/markdown-signals.js +19 -0
  126. package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
  127. package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
  128. package/dist/tools/utils/raw-markdown.d.ts +6 -0
  129. package/dist/tools/utils/raw-markdown.js +135 -0
  130. package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
  131. package/dist/transformers/markdown/fenced-code-rule.js +38 -0
  132. package/dist/transformers/markdown/frontmatter.d.ts +2 -0
  133. package/dist/transformers/markdown/frontmatter.js +45 -0
  134. package/dist/transformers/markdown/noise-rule.d.ts +2 -0
  135. package/dist/transformers/markdown/noise-rule.js +80 -0
  136. package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
  137. package/dist/transformers/markdown/turndown-instance.js +19 -0
  138. package/dist/transformers/markdown.d.ts +2 -0
  139. package/dist/transformers/markdown.js +185 -0
  140. package/dist/transformers/markdown.transformer.js +5 -117
  141. package/dist/utils/cached-payload.d.ts +7 -0
  142. package/dist/utils/cached-payload.js +36 -0
  143. package/dist/utils/code-language-bash.d.ts +1 -0
  144. package/dist/utils/code-language-bash.js +48 -0
  145. package/dist/utils/code-language-core.d.ts +2 -0
  146. package/dist/utils/code-language-core.js +13 -0
  147. package/dist/utils/code-language-detectors.d.ts +5 -0
  148. package/dist/utils/code-language-detectors.js +142 -0
  149. package/dist/utils/code-language-helpers.d.ts +5 -0
  150. package/dist/utils/code-language-helpers.js +62 -0
  151. package/dist/utils/code-language-parsing.d.ts +5 -0
  152. package/dist/utils/code-language-parsing.js +62 -0
  153. package/dist/utils/code-language.d.ts +9 -0
  154. package/dist/utils/code-language.js +250 -46
  155. package/dist/utils/error-details.d.ts +3 -0
  156. package/dist/utils/error-details.js +12 -0
  157. package/dist/utils/error-utils.js +1 -1
  158. package/dist/utils/filename-generator.js +34 -12
  159. package/dist/utils/guards.d.ts +1 -0
  160. package/dist/utils/guards.js +3 -0
  161. package/dist/utils/header-normalizer.d.ts +0 -3
  162. package/dist/utils/header-normalizer.js +3 -3
  163. package/dist/utils/ip-address.d.ts +4 -0
  164. package/dist/utils/ip-address.js +6 -0
  165. package/dist/utils/tool-error-handler.d.ts +2 -2
  166. package/dist/utils/tool-error-handler.js +14 -46
  167. package/dist/utils/url-transformer.d.ts +7 -0
  168. package/dist/utils/url-transformer.js +147 -0
  169. package/dist/utils/url-validator.d.ts +1 -2
  170. package/dist/utils/url-validator.js +53 -114
  171. package/dist/workers/content-transform.worker.d.ts +1 -0
  172. package/dist/workers/content-transform.worker.js +40 -0
  173. package/package.json +17 -18
@@ -0,0 +1,80 @@
1
+ import { isRecord } from '../../utils/guards.js';
2
/** Lower-case tag names whose elements are always removed from the output. */
const STRUCTURAL_TAGS = new Set([
  'script',
  'style',
  'noscript',
  'iframe',
  'nav',
  'footer',
  'aside',
  'header',
  'form',
  'button',
  'input',
  'select',
  'textarea',
]);

/** Values of the `role` attribute that mark an element as noise. */
const NAVIGATION_ROLES = new Set([
  'navigation',
  'banner',
  'complementary',
  'contentinfo',
  'tree',
  'menubar',
  'menu',
]);

/** Substrings looked for in the lower-cased "class id" pair of an element. */
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;

/** Matches `fixed` or `sticky` as a whole word inside a class token. */
const FIXED_PATTERN = /\b(fixed|sticky)\b/;

/**
 * Tailwind utilities that contain the word "fixed" without pinning the
 * element: `table-fixed` sets table-layout, `bg-fixed` sets
 * background-attachment. Elements carrying them are ordinary content.
 */
const NON_POSITIONAL_FIXED_CLASSES = new Set(['table-fixed', 'bg-fixed']);

/** Matches the class tokens `z-40` … `z-50`. */
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;

/** Matches the class token `isolate`. */
const ISOLATE_PATTERN = /\bisolate\b/;

/**
 * Duck-type check for an element-like node.
 * @param {unknown} node
 * @returns {boolean} true when `node` is a record exposing a `getAttribute` function.
 */
function isElement(node) {
  return (
    isRecord(node) &&
    'getAttribute' in node &&
    typeof node.getAttribute === 'function'
  );
}

/**
 * @param {string} tagName lower-case tag name
 * @returns {boolean} true for the structural tags plus `svg` and `canvas`.
 */
function isStructuralNoiseTag(tagName) {
  return STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas';
}

/**
 * @param {{ getAttribute(name: string): string | null }} element
 * @returns {boolean} true when `hidden` is present or `aria-hidden` is exactly "true".
 */
function isElementHidden(element) {
  return (
    element.getAttribute('hidden') !== null ||
    element.getAttribute('aria-hidden') === 'true'
  );
}

/**
 * @param {string | null} role raw `role` attribute value
 * @returns {boolean} true when the role is one of NAVIGATION_ROLES.
 */
function hasNoiseRole(role) {
  return role !== null && NAVIGATION_ROLES.has(role);
}

/**
 * @param {string} className
 * @param {string} id
 * @returns {boolean} true when the lower-cased class/id text contains a promo keyword.
 */
function matchesPromoIdOrClass(className, id) {
  const combined = `${className} ${id}`.toLowerCase();
  return PROMO_PATTERN.test(combined);
}

/**
 * @param {string} className
 * @returns {boolean} true when the class list has both a z-40..50 token and `isolate`.
 */
function matchesHighZIsolate(className) {
  return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
}

/**
 * Reports whether any class token signals fixed/sticky positioning.
 * Tokens are checked one by one so that `table-fixed` / `bg-fixed` (with or
 * without a variant prefix such as `md:`) can be excluded.
 * @param {string} className
 * @returns {boolean}
 */
function hasFixedPositionClass(className) {
  return className.split(/\s+/).some((token) => {
    if (!FIXED_PATTERN.test(token)) return false;
    // Strip variant prefixes ("md:", "hover:") before comparing the utility name.
    const utility = token.slice(token.lastIndexOf(':') + 1);
    return !NON_POSITIONAL_FIXED_CLASSES.has(utility);
  });
}

/**
 * @param {string} className
 * @returns {boolean} true for fixed/sticky classes or a high z-index combined with `isolate`.
 */
function matchesFixedOrHighZIsolate(className) {
  return hasFixedPositionClass(className) || matchesHighZIsolate(className);
}

/**
 * Collects the attributes the noise heuristics look at.
 * @param {{ tagName: string, getAttribute(name: string): string | null }} element
 * @returns {{ tagName: string, className: string, id: string, role: string | null, isHidden: boolean }}
 */
function readElementMetadata(element) {
  return {
    tagName: element.tagName.toLowerCase(),
    className: element.getAttribute('class') ?? '',
    id: element.getAttribute('id') ?? '',
    role: element.getAttribute('role'),
    isHidden: isElementHidden(element),
  };
}

/**
 * Applies every noise heuristic to an element.
 * @param {{ tagName: string, getAttribute(name: string): string | null }} node
 * @returns {boolean}
 */
function isNoiseElement(node) {
  const metadata = readElementMetadata(node);
  return (
    isStructuralNoiseTag(metadata.tagName) ||
    metadata.isHidden ||
    hasNoiseRole(metadata.role) ||
    matchesFixedOrHighZIsolate(metadata.className) ||
    matchesPromoIdOrClass(metadata.className, metadata.id)
  );
}

/**
 * @param {unknown} node
 * @returns {boolean} true only for element-like nodes that match a noise heuristic.
 */
function isNoiseNode(node) {
  return isElement(node) && isNoiseElement(node);
}

/**
 * Registers the `removeNoise` rule, which replaces every noise node with an
 * empty string.
 * @param {{ addRule(name: string, rule: object): unknown }} instance
 */
export function addNoiseRule(instance) {
  instance.addRule('removeNoise', {
    filter: (node) => isNoiseNode(node),
    replacement: () => '',
  });
}
@@ -0,0 +1,2 @@
1
+ import TurndownService from 'turndown';
2
/** Returns the shared Turndown converter, creating and configuring it on first use. */
+ export declare function getTurndown(): TurndownService;
@@ -0,0 +1,19 @@
1
+ import TurndownService from 'turndown';
2
+ import { addFencedCodeRule } from './fenced-code-rule.js';
3
+ import { addNoiseRule } from './noise-rule.js';
4
/** Cached converter; stays null until getTurndown() is first called. */
let turndownInstance = null;

/**
 * Builds a TurndownService with the project's Markdown options and registers
 * the noise rule followed by the fenced-code rule.
 * @returns {TurndownService}
 */
function createTurndownInstance() {
  const options = {
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    emDelimiter: '_',
    bulletListMarker: '-',
  };
  const service = new TurndownService(options);
  addNoiseRule(service);
  addFencedCodeRule(service);
  return service;
}

/**
 * Returns the cached converter, building it on the first call.
 * @returns {TurndownService}
 */
export function getTurndown() {
  if (turndownInstance == null) {
    turndownInstance = createTurndownInstance();
  }
  return turndownInstance;
}
@@ -0,0 +1,2 @@
1
+ import type { MetadataBlock } from '../config/types/content.js';
2
/**
 * Converts an HTML string to Markdown, prefixed by front matter when metadata is given.
 * Returns only the front matter (an empty string without metadata) when `html`
 * is empty or the conversion throws.
 */
+ export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock): string;
@@ -0,0 +1,185 @@
1
+ import TurndownService from 'turndown';
2
+ import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
3
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
4
+ import { isRecord } from '../utils/guards.js';
5
/** Characters that force quoting wherever they occur in a value. */
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;

/** Values made only of digits and dots, which a YAML parser may read as numbers. */
const YAML_NUMERIC = /^[\d.]+$/;

/** Words a YAML parser may read as booleans or null (case-insensitive). */
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;

/**
 * Leading characters that cannot safely start a plain scalar: `-` (sequence
 * entry / negative number), `@` and "`" (reserved indicators), `%`
 * (directive) and `~` (null).
 */
const YAML_LEADING_INDICATOR = /^[-@`%~]/;

/** Characters that must be backslash-escaped inside a double-quoted scalar. */
const ESCAPE_PATTERNS = {
  backslash: /\\/g,
  quote: /"/g,
  newline: /\n/g,
  carriageReturn: /\r/g,
  tab: /\t/g,
};

/** Each predicate returns true when the value cannot be emitted unquoted. */
const YAML_QUOTE_CHECKS = [
  (input) => YAML_SPECIAL_CHARS.test(input),
  (input) => YAML_LEADING_INDICATOR.test(input),
  (input) => input.startsWith(' ') || input.endsWith(' '),
  (input) => input === '',
  (input) => YAML_NUMERIC.test(input),
  (input) => YAML_RESERVED_WORDS.test(input),
];

/**
 * @param {string} value
 * @returns {boolean} true when any quoting check matches.
 */
function needsYamlQuotes(value) {
  return YAML_QUOTE_CHECKS.some((check) => check(value));
}

/**
 * Renders a string as a YAML scalar: unchanged when it is safe as a plain
 * scalar, otherwise as a double-quoted scalar with escapes.
 * @param {string} value
 * @returns {string}
 */
function escapeYamlValue(value) {
  if (!needsYamlQuotes(value)) {
    return value;
  }
  // Backslashes go first so the escapes added below are not doubled.
  const escaped = value
    .replace(ESCAPE_PATTERNS.backslash, '\\\\')
    .replace(ESCAPE_PATTERNS.quote, '\\"')
    .replace(ESCAPE_PATTERNS.newline, '\\n')
    // A raw CR inside double quotes is a line break that parsers fold away.
    .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
    .replace(ESCAPE_PATTERNS.tab, '\\t');
  return `"${escaped}"`;
}
35
/**
 * Pushes a `key: value` line onto `lines`; falsy values are skipped.
 * @param {string[]} lines mutated in place
 * @param {string} key
 * @param {string | undefined} value
 */
function appendFrontmatterField(lines, key, value) {
  if (value) {
    const rendered = escapeYamlValue(value);
    lines.push(`${key}: ${rendered}`);
  }
}

/** Front-matter keys paired with the metadata property each is read from. */
const FRONTMATTER_FIELDS = [
  ['title', 'title'],
  ['source', 'url'],
];

/**
 * Builds the front-matter block for a document.
 * @param {{ title?: string, url?: string } | undefined} metadata
 * @returns {string} '' without metadata; otherwise the delimited block.
 */
function buildFrontmatter(metadata) {
  if (!metadata) {
    return '';
  }
  const lines = [FRONTMATTER_DELIMITER];
  for (const [key, property] of FRONTMATTER_FIELDS) {
    appendFrontmatterField(lines, key, metadata[property]);
  }
  lines.push(FRONTMATTER_DELIMITER);
  return joinLines(lines);
}
49
/**
 * Duck-type check for an element-like node.
 * @param {unknown} node
 * @returns {boolean} true when `node` is a record exposing a `getAttribute` function.
 */
function isElement(node) {
  if (!isRecord(node)) {
    return false;
  }
  return 'getAttribute' in node && typeof node.getAttribute === 'function';
}

/**
 * @param {{ nodeName: string, firstChild?: { nodeName: string } | null }} node
 * @param {{ codeBlockStyle?: string }} options converter options
 * @returns {boolean} true for a PRE whose first child is CODE while fenced style is active.
 */
function isFencedCodeBlock(node, options) {
  if (options.codeBlockStyle !== 'fenced') {
    return false;
  }
  if (node.nodeName !== 'PRE') {
    return false;
  }
  return node.firstChild?.nodeName === 'CODE';
}

/**
 * Reads the two attributes that may carry a language hint.
 * @returns {{ className: string, dataLanguage: string }} missing attributes become ''.
 */
function readCodeAttributes(codeNode) {
  const className = codeNode.getAttribute('class') ?? '';
  const dataLanguage = codeNode.getAttribute('data-language') ?? '';
  return { className, dataLanguage };
}

/**
 * Picks the fence language: attribute hints first, then detection from the
 * code text, then ''.
 * @param {{ getAttribute(name: string): string | null }} codeNode
 * @param {string} code
 * @returns {string}
 */
function resolveCodeLanguage(codeNode, code) {
  const attributes = readCodeAttributes(codeNode);
  const fromAttributes = resolveLanguageFromAttributes(
    attributes.className,
    attributes.dataLanguage,
  );
  if (fromAttributes !== null && fromAttributes !== undefined) {
    return fromAttributes;
  }
  return detectLanguageFromCode(code) ?? '';
}

/**
 * Formats the first child of `node` as a code block via CODE_BLOCK.format.
 * @returns {string} '' when the first child is not element-like.
 */
function formatFencedCodeBlock(node) {
  const { firstChild } = node;
  if (!isElement(firstChild)) {
    return '';
  }
  const source = firstChild.textContent || '';
  return CODE_BLOCK.format(source, resolveCodeLanguage(firstChild, source));
}

/**
 * Registers the `fencedCodeBlockWithLanguage` rule on the converter.
 * @param {{ addRule(name: string, rule: object): unknown }} instance
 */
function addFencedCodeRule(instance) {
  const rule = {
    filter: (node, options) => isFencedCodeBlock(node, options),
    replacement: (_content, node) => formatFencedCodeBlock(node),
  };
  instance.addRule('fencedCodeBlockWithLanguage', rule);
}
84
/** Lower-case tag names whose elements are always removed from the output. */
const STRUCTURAL_TAGS = new Set([
  'script',
  'style',
  'noscript',
  'iframe',
  'nav',
  'footer',
  'aside',
  'header',
  'form',
  'button',
  'input',
  'select',
  'textarea',
]);

/** Values of the `role` attribute that mark an element as noise. */
const NAVIGATION_ROLES = new Set([
  'navigation',
  'banner',
  'complementary',
  'contentinfo',
  'tree',
  'menubar',
  'menu',
]);

/** Substrings looked for in the lower-cased "class id" pair of an element. */
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;

/** Matches `fixed` or `sticky` as a whole word inside a class token. */
const FIXED_PATTERN = /\b(fixed|sticky)\b/;

/**
 * Tailwind utilities that contain the word "fixed" without pinning the
 * element: `table-fixed` sets table-layout, `bg-fixed` sets
 * background-attachment. Elements carrying them are ordinary content.
 */
const NON_POSITIONAL_FIXED_CLASSES = new Set(['table-fixed', 'bg-fixed']);

/** Matches the class tokens `z-40` … `z-50`. */
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;

/** Matches the class token `isolate`. */
const ISOLATE_PATTERN = /\bisolate\b/;

/**
 * @param {string} tagName lower-case tag name
 * @returns {boolean} true for the structural tags plus `svg` and `canvas`.
 */
function isStructuralNoiseTag(tagName) {
  return STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas';
}

/**
 * @param {{ getAttribute(name: string): string | null }} element
 * @returns {boolean} true when `hidden` is present or `aria-hidden` is exactly "true".
 */
function isElementHidden(element) {
  return (
    element.getAttribute('hidden') !== null ||
    element.getAttribute('aria-hidden') === 'true'
  );
}

/**
 * @param {string | null} role raw `role` attribute value
 * @returns {boolean} true when the role is one of NAVIGATION_ROLES.
 */
function hasNoiseRole(role) {
  return role !== null && NAVIGATION_ROLES.has(role);
}

/**
 * @param {string} className
 * @param {string} id
 * @returns {boolean} true when the lower-cased class/id text contains a promo keyword.
 */
function matchesPromoIdOrClass(className, id) {
  const combined = `${className} ${id}`.toLowerCase();
  return PROMO_PATTERN.test(combined);
}

/**
 * @param {string} className
 * @returns {boolean} true when the class list has both a z-40..50 token and `isolate`.
 */
function matchesHighZIsolate(className) {
  return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
}

/**
 * Reports whether any class token signals fixed/sticky positioning.
 * Tokens are checked one by one so that `table-fixed` / `bg-fixed` (with or
 * without a variant prefix such as `md:`) can be excluded.
 * @param {string} className
 * @returns {boolean}
 */
function hasFixedPositionClass(className) {
  return className.split(/\s+/).some((token) => {
    if (!FIXED_PATTERN.test(token)) return false;
    // Strip variant prefixes ("md:", "hover:") before comparing the utility name.
    const utility = token.slice(token.lastIndexOf(':') + 1);
    return !NON_POSITIONAL_FIXED_CLASSES.has(utility);
  });
}

/**
 * @param {string} className
 * @returns {boolean} true for fixed/sticky classes or a high z-index combined with `isolate`.
 */
function matchesFixedOrHighZIsolate(className) {
  return hasFixedPositionClass(className) || matchesHighZIsolate(className);
}

/**
 * Collects the attributes the noise heuristics look at.
 * @param {{ tagName: string, getAttribute(name: string): string | null }} element
 * @returns {{ tagName: string, className: string, id: string, role: string | null, isHidden: boolean }}
 */
function readElementMetadata(element) {
  return {
    tagName: element.tagName.toLowerCase(),
    className: element.getAttribute('class') ?? '',
    id: element.getAttribute('id') ?? '',
    role: element.getAttribute('role'),
    isHidden: isElementHidden(element),
  };
}

/**
 * Applies every noise heuristic to an element.
 * @param {{ tagName: string, getAttribute(name: string): string | null }} node
 * @returns {boolean}
 */
function isNoiseElement(node) {
  const metadata = readElementMetadata(node);
  return (
    isStructuralNoiseTag(metadata.tagName) ||
    metadata.isHidden ||
    hasNoiseRole(metadata.role) ||
    matchesFixedOrHighZIsolate(metadata.className) ||
    matchesPromoIdOrClass(metadata.className, metadata.id)
  );
}

/**
 * @param {unknown} node
 * @returns {boolean} true only for element-like nodes that match a noise heuristic.
 */
function isNoiseNode(node) {
  return isElement(node) && isNoiseElement(node);
}

/**
 * Registers the `removeNoise` rule, which replaces every noise node with an
 * empty string.
 * @param {{ addRule(name: string, rule: object): unknown }} instance
 */
function addNoiseRule(instance) {
  instance.addRule('removeNoise', {
    filter: (node) => isNoiseNode(node),
    replacement: () => '',
  });
}
158
/** Cached converter; stays null until getTurndown() is first called. */
let turndownInstance = null;

/**
 * Builds a TurndownService with the project's Markdown options and registers
 * the noise rule followed by the fenced-code rule.
 * @returns {TurndownService}
 */
function createTurndownInstance() {
  const options = {
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    emDelimiter: '_',
    bulletListMarker: '-',
  };
  const service = new TurndownService(options);
  addNoiseRule(service);
  addFencedCodeRule(service);
  return service;
}

/**
 * Returns the cached converter, building it on the first call.
 * @returns {TurndownService}
 */
function getTurndown() {
  if (turndownInstance == null) {
    turndownInstance = createTurndownInstance();
  }
  return turndownInstance;
}
174
/**
 * Converts HTML to Markdown, prefixed by front matter when metadata is given.
 *
 * Conversion is best effort: an empty `html` or a failure inside the
 * converter yields just the front matter (which is '' without metadata).
 * @param {string} html
 * @param {object} [metadata] passed to buildFrontmatter
 * @returns {string}
 */
export function htmlToMarkdown(html, metadata) {
  const header = buildFrontmatter(metadata);
  if (!html) {
    return header;
  }
  let body;
  try {
    body = getTurndown().turndown(html).trim();
  } catch {
    // Deliberate fallback: the caller still receives the front matter.
    return header;
  }
  if (header) {
    return `${header}\n${body}`;
  }
  return body;
}
@@ -1,126 +1,14 @@
1
- import TurndownService from 'turndown';
2
- import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
3
- import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
4
- let turndownInstance = null;
5
- function getTurndown() {
6
- if (turndownInstance)
7
- return turndownInstance;
8
- turndownInstance = createTurndownInstance();
9
- return turndownInstance;
10
- }
11
- function createTurndownInstance() {
12
- const instance = new TurndownService({
13
- headingStyle: 'atx',
14
- codeBlockStyle: 'fenced',
15
- emDelimiter: '_',
16
- bulletListMarker: '-',
17
- });
18
- addNoiseRule(instance);
19
- addFencedCodeRule(instance);
20
- return instance;
21
- }
22
- function addNoiseRule(instance) {
23
- instance.addRule('removeNoise', {
24
- filter: ['script', 'style', 'noscript', 'nav', 'footer', 'aside', 'iframe'],
25
- replacement: () => '',
26
- });
27
- }
28
- function addFencedCodeRule(instance) {
29
- instance.addRule('fencedCodeBlockWithLanguage', {
30
- filter: (node, options) => isFencedCodeBlock(node, options),
31
- replacement: (_content, node) => formatFencedCodeBlock(node),
32
- });
33
- }
34
- function isFencedCodeBlock(node, options) {
35
- if (options.codeBlockStyle !== 'fenced')
36
- return false;
37
- if (node.nodeName !== 'PRE')
38
- return false;
39
- const { firstChild } = node;
40
- if (!firstChild)
41
- return false;
42
- return firstChild.nodeName === 'CODE';
43
- }
44
- function isElement(node) {
45
- return (node !== null &&
46
- typeof node === 'object' &&
47
- 'getAttribute' in node &&
48
- typeof node.getAttribute === 'function');
49
- }
50
- function formatFencedCodeBlock(node) {
51
- const codeNode = node.firstChild;
52
- if (!isElement(codeNode))
53
- return '';
54
- const code = codeNode.textContent || '';
55
- const language = resolveCodeLanguage(codeNode, code);
56
- return CODE_BLOCK.format(code, language);
57
- }
58
- function resolveCodeLanguage(codeNode, code) {
59
- const className = codeNode.getAttribute('class') ?? '';
60
- const dataLang = codeNode.getAttribute('data-language') ?? '';
61
- const attributeLanguage = resolveLanguageFromAttributes(className, dataLang);
62
- return attributeLanguage ?? detectLanguageFromCode(code) ?? '';
63
- }
64
- const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
65
- const YAML_NUMERIC = /^[\d.]+$/;
66
- const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
67
- const ESCAPE_PATTERNS = {
68
- backslash: /\\/g,
69
- quote: /"/g,
70
- newline: /\n/g,
71
- tab: /\t/g,
72
- };
73
- function needsYamlQuotes(value) {
74
- const checks = [
75
- (input) => YAML_SPECIAL_CHARS.test(input),
76
- (input) => input.startsWith(' ') || input.endsWith(' '),
77
- (input) => input === '',
78
- (input) => YAML_NUMERIC.test(input),
79
- (input) => YAML_RESERVED_WORDS.test(input),
80
- ];
81
- return checks.some((check) => check(value));
82
- }
83
- function escapeYamlValue(value) {
84
- if (!needsYamlQuotes(value)) {
85
- return value;
86
- }
87
- const escaped = value
88
- .replace(ESCAPE_PATTERNS.backslash, '\\\\')
89
- .replace(ESCAPE_PATTERNS.quote, '\\"')
90
- .replace(ESCAPE_PATTERNS.newline, '\\n')
91
- .replace(ESCAPE_PATTERNS.tab, '\\t');
92
- return `"${escaped}"`;
93
- }
94
- function createFrontmatter(metadata) {
95
- const lines = [FRONTMATTER_DELIMITER];
96
- if (metadata.title) {
97
- lines.push(`title: ${escapeYamlValue(metadata.title)}`);
98
- }
99
- if (metadata.url) {
100
- lines.push(`source: ${escapeYamlValue(metadata.url)}`);
101
- }
102
- lines.push(FRONTMATTER_DELIMITER);
103
- return joinLines(lines);
104
- }
105
- function convertHtmlToMarkdown(html) {
106
- return getTurndown().turndown(html).trim();
107
- }
108
- function buildFrontmatterBlock(metadata) {
109
- return metadata ? createFrontmatter(metadata) : '';
110
- }
1
+ import { buildFrontmatter } from './markdown/frontmatter.js';
2
+ import { getTurndown } from './markdown/turndown-instance.js';
111
3
  export function htmlToMarkdown(html, metadata) {
112
- const frontmatter = buildFrontmatterBlock(metadata);
113
- if (!isValidHtmlInput(html)) {
4
+ const frontmatter = buildFrontmatter(metadata);
5
+ if (!html)
114
6
  return frontmatter;
115
- }
116
7
  try {
117
- const content = convertHtmlToMarkdown(html);
8
+ const content = getTurndown().turndown(html).trim();
118
9
  return frontmatter ? `${frontmatter}\n${content}` : content;
119
10
  }
120
11
  catch {
121
12
  return frontmatter;
122
13
  }
123
14
  }
124
- function isValidHtmlInput(html) {
125
- return Boolean(html && typeof html === 'string');
126
- }
@@ -0,0 +1,7 @@
1
/** JSON shape accepted for a cached entry; each field, when present, must be a string. */
+ export interface CachedPayload {
2
+ content?: string;
3
+ markdown?: string;
4
+ title?: string;
5
+ }
6
/** Parses `raw` as JSON; returns null when parsing fails or the value is not a CachedPayload. */
+ export declare function parseCachedPayload(raw: string): CachedPayload | null;
7
/** Returns `markdown` when it is a string, otherwise `content` when it is a string, otherwise null. */
+ export declare function resolveCachedPayloadContent(payload: CachedPayload): string | null;
@@ -0,0 +1,36 @@
1
+ import { isRecord } from './guards.js';
2
/**
 * Parses a cached JSON string.
 * @param {string} raw
 * @returns {object | null} the parsed payload, or null when `raw` is not valid
 *   JSON or does not pass isCachedPayload.
 */
export function parseCachedPayload(raw) {
  try {
    const candidate = JSON.parse(raw);
    if (isCachedPayload(candidate)) {
      return candidate;
    }
  } catch {
    // Invalid JSON is reported as "no payload" below.
  }
  return null;
}
11
/**
 * Picks the text of a cached payload, preferring `markdown` over `content`.
 * @param {{ markdown?: string, content?: string }} payload
 * @returns {string | null} null when neither field is a string.
 */
export function resolveCachedPayloadContent(payload) {
  const preferredOrder = ['markdown', 'content'];
  const field = preferredOrder.find((name) => typeof payload[name] === 'string');
  return field === undefined ? null : payload[field];
}
20
/**
 * @param {object} value
 * @param {string} key
 * @returns {boolean} true when `value[key]` is undefined or a string.
 */
function hasOptionalStringProperty(value, key) {
  const kind = typeof value[key];
  return kind === 'undefined' || kind === 'string';
}
26
/**
 * @param {unknown} value
 * @returns {boolean} true when `value` is a record whose `content`,
 *   `markdown` and `title` are each absent or a string.
 */
function isCachedPayload(value) {
  if (!isRecord(value)) {
    return false;
  }
  const optionalStringKeys = ['content', 'markdown', 'title'];
  return optionalStringKeys.every((key) => hasOptionalStringProperty(value, key));
}
@@ -0,0 +1 @@
1
/**
 * Reports whether any non-blank line of `code` looks like a shell line: a
 * shebang, a `$ ` / `# ` prompt, a listed command, or a package-manager call.
 */
+ export declare function detectBash(code: string): boolean;
@@ -0,0 +1,48 @@
1
+ import { splitLines } from './code-language-parsing.js';
2
/** Package managers recognised when followed by one of BASH_VERBS. */
const BASH_PACKAGE_MANAGERS = [
  'npm',
  'yarn',
  'pnpm',
  'npx',
  'brew',
  'apt',
  'pip',
  'cargo',
  'go',
];

/** Sub-commands that turn a package-manager name into a shell indicator. */
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];

/** Commands that indicate shell code on their own. */
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];

/**
 * Reports whether any non-blank line, ignoring leading whitespace, is a shell
 * indicator.
 * @param {string} code
 * @returns {boolean}
 */
export function detectBash(code) {
  for (const rawLine of splitLines(code)) {
    const candidate = rawLine.trimStart();
    if (candidate && isBashIndicator(candidate)) {
      return true;
    }
  }
  return false;
}

/**
 * @param {string} line
 * @param {string[]} commands
 * @returns {boolean} true when `line` equals a command or starts with it plus a space.
 */
function startsWithCommand(line, commands) {
  for (const command of commands) {
    if (line === command) {
      return true;
    }
    if (line.startsWith(`${command} `)) {
      return true;
    }
  }
  return false;
}

/** Combines the four shell heuristics for one left-trimmed line. */
function isBashIndicator(line) {
  if (isShebang(line)) {
    return true;
  }
  if (isPromptLine(line)) {
    return true;
  }
  if (startsWithCommand(line, BASH_COMMANDS)) {
    return true;
  }
  return startsWithPackageManagerCommand(line);
}

/** @returns {boolean} true when the line begins with `#!`. */
function isShebang(line) {
  return line.startsWith('#!');
}

/** @returns {boolean} true when the line begins with `$ ` or `# `. */
function isPromptLine(line) {
  const prompts = ['$ ', '# '];
  return prompts.some((prompt) => line.startsWith(prompt));
}

/**
 * @param {string} line
 * @returns {boolean} true for "<manager> <verb>" optionally followed by a space and more text.
 */
function startsWithPackageManagerCommand(line) {
  for (const manager of BASH_PACKAGE_MANAGERS) {
    const prefix = `${manager} `;
    if (!line.startsWith(prefix)) {
      continue;
    }
    const remainder = line.slice(prefix.length);
    const hasVerb = BASH_VERBS.some(
      (verb) => remainder === verb || remainder.startsWith(`${verb} `),
    );
    if (hasVerb) {
      return true;
    }
  }
  return false;
}
@@ -0,0 +1,2 @@
1
/** Returns the language of the first detector in CODE_DETECTORS that accepts `code`, or undefined. */
+ export declare function detectLanguageFromCode(code: string): string | undefined;
2
/** Resolves a language from the class name first, falling back to the data attribute value. */
+ export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
@@ -0,0 +1,13 @@
1
+ import { CODE_DETECTORS } from './code-language-detectors.js';
2
+ import { extractLanguageFromClassName, resolveLanguageFromDataAttribute, } from './code-language-parsing.js';
3
/**
 * Runs the detectors in order and reports the first one that accepts `code`.
 * @param {string} code
 * @returns {string | undefined} the matching detector's language, if any.
 */
export function detectLanguageFromCode(code) {
  const match = CODE_DETECTORS.find(({ detect }) => detect(code));
  return match?.language;
}
10
/**
 * Resolves a language hint from element attributes.
 * @param {string} className value of the `class` attribute
 * @param {string} dataLang value of the data attribute
 * @returns {string | undefined} the class-derived language when it is not
 *   null/undefined, otherwise whatever the data-attribute resolver returns.
 */
export function resolveLanguageFromAttributes(className, dataLang) {
  const fromClass = extractLanguageFromClassName(className);
  if (fromClass !== null && fromClass !== undefined) {
    return fromClass;
  }
  return resolveLanguageFromDataAttribute(dataLang);
}
@@ -0,0 +1,5 @@
1
/** Pairs a language name with the predicate that recognises it. */
+ export interface CodeDetector {
2
+ language: string;
3
+ detect: (code: string) => boolean;
4
+ }
5
/** Detector table; consumers take the first entry whose `detect` accepts the code, so order matters. */
+ export declare const CODE_DETECTORS: readonly CodeDetector[];
@@ -0,0 +1,142 @@
1
+ import { detectBash } from './code-language-bash.js';
2
+ import { containsJsxTag, containsWord, splitLines, } from './code-language-parsing.js';
3
/** Type names looked for after a colon by the TypeScript detector. */
const TYPE_HINTS = ['string', 'number', 'boolean', 'void', 'any', 'unknown', 'never'];

/** Lower-case tag prefixes looked for by the HTML detector. */
const HTML_TAGS = [
  '<!doctype', '<html', '<head', '<body', '<div',
  '<span', '<p', '<a', '<script', '<style',
];

/** Lower-case statement keywords looked for by the SQL detector. */
const SQL_KEYWORDS = ['select', 'insert', 'update', 'delete', 'create', 'alter', 'drop'];

const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;

/**
 * Detector table, listed in the order the detectors are meant to be tried.
 * The detect functions are declarations, so they are hoisted above this table.
 */
export const CODE_DETECTORS = [
  { language: 'jsx', detect: detectJsx },
  { language: 'typescript', detect: detectTypescript },
  { language: 'rust', detect: detectRust },
  { language: 'javascript', detect: detectJavascript },
  { language: 'python', detect: detectPython },
  { language: 'bash', detect: detectBash },
  { language: 'css', detect: detectCss },
  { language: 'html', detect: detectHtml },
  { language: 'json', detect: detectJson },
  { language: 'yaml', detect: detectYaml },
  { language: 'sql', detect: detectSql },
  { language: 'go', detect: detectGo },
];
51
/**
 * @param {string} code
 * @returns {boolean} true when the lower-cased code contains a JSX/React
 *   marker, otherwise the result of containsJsxTag on the original text.
 */
function detectJsx(code) {
  const lowered = code.toLowerCase();
  const markers = ['classname=', 'jsx:', "from 'react'", 'from "react"'];
  if (markers.some((marker) => lowered.includes(marker))) {
    return true;
  }
  return containsJsxTag(code);
}
62
/** `type Name =` or `type Name<`, i.e. a type-alias declaration (lower-cased input). */
const TS_TYPE_ALIAS_REGEX = /\btype\s+[a-z_$][\w$]*\s*[=<]/;

/** `import type …` / `export type …`, i.e. type-only module syntax. */
const TS_TYPE_ONLY_MODULE_REGEX = /\b(?:import|export)\s+type\b/;

/**
 * Heuristic TypeScript detector.
 *
 * The bare word "type" is not treated as evidence on its own: it also appears
 * in JSON keys, HTML attributes, Python calls and Go declarations. A type
 * alias or a type-only import/export is required instead.
 * @param {string} code
 * @returns {boolean}
 */
function detectTypescript(code) {
  const lower = code.toLowerCase();
  if (containsWord(lower, 'interface')) return true;
  if (TS_TYPE_ALIAS_REGEX.test(lower)) return true;
  if (TS_TYPE_ONLY_MODULE_REGEX.test(lower)) return true;
  // Annotations such as ": string" or ":number".
  return TYPE_HINTS.some(
    (hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`),
  );
}
70
/**
 * @param {string} code
 * @returns {boolean} true for a Rust keyword, `let mut`, or `use ` together with `::`.
 */
function detectRust(code) {
  const text = code.toLowerCase();
  if (RUST_WORD_REGEX.test(text)) {
    return true;
  }
  if (text.includes('let mut')) {
    return true;
  }
  return text.includes('use ') && text.includes('::');
}
76
/**
 * @param {string} code
 * @returns {boolean} true when the lower-cased code contains a keyword from JS_WORD_REGEX.
 */
function detectJavascript(code) {
  return JS_WORD_REGEX.test(code.toLowerCase());
}
80
/**
 * @param {string} code
 * @returns {boolean} true for a Python keyword, `print(` or `__name__` (case-insensitive).
 */
function detectPython(code) {
  const text = code.toLowerCase();
  if (PYTHON_WORD_REGEX.test(text)) {
    return true;
  }
  return ['print(', '__name__'].some((marker) => text.includes(marker));
}
86
/**
 * @param {string} code
 * @returns {boolean} true when the code has a CSS at-rule, or any non-blank
 *   line that looks like a selector or a property declaration.
 */
function detectCss(code) {
  if (CSS_DIRECTIVE_REGEX.test(code.toLowerCase())) {
    return true;
  }
  for (const rawLine of splitLines(code)) {
    const line = rawLine.trimStart();
    if (line === '') {
      continue;
    }
    if (isCssSelectorLine(line)) {
      return true;
    }
    if (isCssPropertyLine(line)) {
      return true;
    }
  }
  return false;
}
100
/**
 * @param {string} code
 * @returns {boolean} true when the lower-cased code contains any entry of HTML_TAGS.
 */
function detectHtml(code) {
  const text = code.toLowerCase();
  for (const tag of HTML_TAGS) {
    if (text.includes(tag)) {
      return true;
    }
  }
  return false;
}
104
/**
 * @param {string} code
 * @returns {boolean} true when the first non-whitespace character is `{` or `[`.
 */
function detectJson(code) {
  const first = code.trimStart().charAt(0);
  return first === '{' || first === '[';
}
110
/**
 * @param {string} code
 * @returns {boolean} true when some trimmed line has its first colon after
 *   position 0 and followed directly by a space or a tab.
 */
function detectYaml(code) {
  for (const rawLine of splitLines(code)) {
    const line = rawLine.trim();
    const colonAt = line.indexOf(':');
    // Covers blank lines (-1) and lines that start with the colon (0).
    if (colonAt < 1) {
      continue;
    }
    const next = line.charAt(colonAt + 1);
    if (next === ' ' || next === '\t') {
      return true;
    }
  }
  return false;
}
125
/**
 * @param {string} code
 * @returns {boolean} true when the lower-cased code contains any SQL keyword as a word.
 */
function detectSql(code) {
  const text = code.toLowerCase();
  return SQL_KEYWORDS.some((keyword) => containsWord(text, keyword));
}
129
/**
 * @param {string} code
 * @returns {boolean} true for the words `package` or `func`, or the text `import "`.
 */
function detectGo(code) {
  const text = code.toLowerCase();
  const hasKeyword = containsWord(text, 'package') || containsWord(text, 'func');
  return hasKeyword || text.includes('import "');
}
135
/**
 * @param {string} line
 * @returns {boolean} true when the line starts with `.` or `#` and contains `{`.
 */
function isCssSelectorLine(line) {
  const first = line.charAt(0);
  const startsLikeSelector = first === '.' || first === '#';
  return startsLikeSelector && line.includes('{');
}
140
/**
 * @param {string} line
 * @returns {boolean} true when the line contains both `:` and `;`.
 */
function isCssPropertyLine(line) {
  return [':', ';'].every((mark) => line.includes(mark));
}