@apitap/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236) hide show
  1. package/LICENSE +60 -0
  2. package/README.md +362 -0
  3. package/SKILL.md +270 -0
  4. package/dist/auth/crypto.d.ts +31 -0
  5. package/dist/auth/crypto.js +66 -0
  6. package/dist/auth/crypto.js.map +1 -0
  7. package/dist/auth/handoff.d.ts +29 -0
  8. package/dist/auth/handoff.js +180 -0
  9. package/dist/auth/handoff.js.map +1 -0
  10. package/dist/auth/manager.d.ts +46 -0
  11. package/dist/auth/manager.js +127 -0
  12. package/dist/auth/manager.js.map +1 -0
  13. package/dist/auth/oauth-refresh.d.ts +16 -0
  14. package/dist/auth/oauth-refresh.js +91 -0
  15. package/dist/auth/oauth-refresh.js.map +1 -0
  16. package/dist/auth/refresh.d.ts +43 -0
  17. package/dist/auth/refresh.js +217 -0
  18. package/dist/auth/refresh.js.map +1 -0
  19. package/dist/capture/anti-bot.d.ts +15 -0
  20. package/dist/capture/anti-bot.js +43 -0
  21. package/dist/capture/anti-bot.js.map +1 -0
  22. package/dist/capture/blocklist.d.ts +6 -0
  23. package/dist/capture/blocklist.js +70 -0
  24. package/dist/capture/blocklist.js.map +1 -0
  25. package/dist/capture/body-diff.d.ts +8 -0
  26. package/dist/capture/body-diff.js +102 -0
  27. package/dist/capture/body-diff.js.map +1 -0
  28. package/dist/capture/body-variables.d.ts +13 -0
  29. package/dist/capture/body-variables.js +142 -0
  30. package/dist/capture/body-variables.js.map +1 -0
  31. package/dist/capture/domain.d.ts +8 -0
  32. package/dist/capture/domain.js +34 -0
  33. package/dist/capture/domain.js.map +1 -0
  34. package/dist/capture/entropy.d.ts +33 -0
  35. package/dist/capture/entropy.js +100 -0
  36. package/dist/capture/entropy.js.map +1 -0
  37. package/dist/capture/filter.d.ts +11 -0
  38. package/dist/capture/filter.js +49 -0
  39. package/dist/capture/filter.js.map +1 -0
  40. package/dist/capture/graphql.d.ts +21 -0
  41. package/dist/capture/graphql.js +99 -0
  42. package/dist/capture/graphql.js.map +1 -0
  43. package/dist/capture/idle.d.ts +23 -0
  44. package/dist/capture/idle.js +44 -0
  45. package/dist/capture/idle.js.map +1 -0
  46. package/dist/capture/monitor.d.ts +26 -0
  47. package/dist/capture/monitor.js +183 -0
  48. package/dist/capture/monitor.js.map +1 -0
  49. package/dist/capture/oauth-detector.d.ts +18 -0
  50. package/dist/capture/oauth-detector.js +96 -0
  51. package/dist/capture/oauth-detector.js.map +1 -0
  52. package/dist/capture/pagination.d.ts +9 -0
  53. package/dist/capture/pagination.js +40 -0
  54. package/dist/capture/pagination.js.map +1 -0
  55. package/dist/capture/parameterize.d.ts +17 -0
  56. package/dist/capture/parameterize.js +63 -0
  57. package/dist/capture/parameterize.js.map +1 -0
  58. package/dist/capture/scrubber.d.ts +5 -0
  59. package/dist/capture/scrubber.js +38 -0
  60. package/dist/capture/scrubber.js.map +1 -0
  61. package/dist/capture/session.d.ts +46 -0
  62. package/dist/capture/session.js +445 -0
  63. package/dist/capture/session.js.map +1 -0
  64. package/dist/capture/token-detector.d.ts +16 -0
  65. package/dist/capture/token-detector.js +62 -0
  66. package/dist/capture/token-detector.js.map +1 -0
  67. package/dist/capture/verifier.d.ts +17 -0
  68. package/dist/capture/verifier.js +147 -0
  69. package/dist/capture/verifier.js.map +1 -0
  70. package/dist/cli.d.ts +2 -0
  71. package/dist/cli.js +930 -0
  72. package/dist/cli.js.map +1 -0
  73. package/dist/discovery/auth.d.ts +17 -0
  74. package/dist/discovery/auth.js +81 -0
  75. package/dist/discovery/auth.js.map +1 -0
  76. package/dist/discovery/fetch.d.ts +17 -0
  77. package/dist/discovery/fetch.js +59 -0
  78. package/dist/discovery/fetch.js.map +1 -0
  79. package/dist/discovery/frameworks.d.ts +11 -0
  80. package/dist/discovery/frameworks.js +249 -0
  81. package/dist/discovery/frameworks.js.map +1 -0
  82. package/dist/discovery/index.d.ts +21 -0
  83. package/dist/discovery/index.js +219 -0
  84. package/dist/discovery/index.js.map +1 -0
  85. package/dist/discovery/openapi.d.ts +13 -0
  86. package/dist/discovery/openapi.js +175 -0
  87. package/dist/discovery/openapi.js.map +1 -0
  88. package/dist/discovery/probes.d.ts +9 -0
  89. package/dist/discovery/probes.js +70 -0
  90. package/dist/discovery/probes.js.map +1 -0
  91. package/dist/index.d.ts +25 -0
  92. package/dist/index.js +25 -0
  93. package/dist/index.js.map +1 -0
  94. package/dist/inspect/report.d.ts +52 -0
  95. package/dist/inspect/report.js +191 -0
  96. package/dist/inspect/report.js.map +1 -0
  97. package/dist/mcp.d.ts +8 -0
  98. package/dist/mcp.js +526 -0
  99. package/dist/mcp.js.map +1 -0
  100. package/dist/orchestration/browse.d.ts +38 -0
  101. package/dist/orchestration/browse.js +198 -0
  102. package/dist/orchestration/browse.js.map +1 -0
  103. package/dist/orchestration/cache.d.ts +15 -0
  104. package/dist/orchestration/cache.js +24 -0
  105. package/dist/orchestration/cache.js.map +1 -0
  106. package/dist/plugin.d.ts +17 -0
  107. package/dist/plugin.js +158 -0
  108. package/dist/plugin.js.map +1 -0
  109. package/dist/read/decoders/deepwiki.d.ts +2 -0
  110. package/dist/read/decoders/deepwiki.js +148 -0
  111. package/dist/read/decoders/deepwiki.js.map +1 -0
  112. package/dist/read/decoders/grokipedia.d.ts +2 -0
  113. package/dist/read/decoders/grokipedia.js +210 -0
  114. package/dist/read/decoders/grokipedia.js.map +1 -0
  115. package/dist/read/decoders/hackernews.d.ts +2 -0
  116. package/dist/read/decoders/hackernews.js +168 -0
  117. package/dist/read/decoders/hackernews.js.map +1 -0
  118. package/dist/read/decoders/index.d.ts +2 -0
  119. package/dist/read/decoders/index.js +12 -0
  120. package/dist/read/decoders/index.js.map +1 -0
  121. package/dist/read/decoders/reddit.d.ts +2 -0
  122. package/dist/read/decoders/reddit.js +142 -0
  123. package/dist/read/decoders/reddit.js.map +1 -0
  124. package/dist/read/decoders/twitter.d.ts +12 -0
  125. package/dist/read/decoders/twitter.js +187 -0
  126. package/dist/read/decoders/twitter.js.map +1 -0
  127. package/dist/read/decoders/wikipedia.d.ts +2 -0
  128. package/dist/read/decoders/wikipedia.js +66 -0
  129. package/dist/read/decoders/wikipedia.js.map +1 -0
  130. package/dist/read/decoders/youtube.d.ts +2 -0
  131. package/dist/read/decoders/youtube.js +69 -0
  132. package/dist/read/decoders/youtube.js.map +1 -0
  133. package/dist/read/extract.d.ts +25 -0
  134. package/dist/read/extract.js +320 -0
  135. package/dist/read/extract.js.map +1 -0
  136. package/dist/read/index.d.ts +14 -0
  137. package/dist/read/index.js +66 -0
  138. package/dist/read/index.js.map +1 -0
  139. package/dist/read/peek.d.ts +9 -0
  140. package/dist/read/peek.js +137 -0
  141. package/dist/read/peek.js.map +1 -0
  142. package/dist/read/types.d.ts +44 -0
  143. package/dist/read/types.js +3 -0
  144. package/dist/read/types.js.map +1 -0
  145. package/dist/replay/engine.d.ts +53 -0
  146. package/dist/replay/engine.js +441 -0
  147. package/dist/replay/engine.js.map +1 -0
  148. package/dist/replay/truncate.d.ts +16 -0
  149. package/dist/replay/truncate.js +92 -0
  150. package/dist/replay/truncate.js.map +1 -0
  151. package/dist/serve.d.ts +31 -0
  152. package/dist/serve.js +149 -0
  153. package/dist/serve.js.map +1 -0
  154. package/dist/skill/generator.d.ts +44 -0
  155. package/dist/skill/generator.js +419 -0
  156. package/dist/skill/generator.js.map +1 -0
  157. package/dist/skill/importer.d.ts +26 -0
  158. package/dist/skill/importer.js +80 -0
  159. package/dist/skill/importer.js.map +1 -0
  160. package/dist/skill/search.d.ts +19 -0
  161. package/dist/skill/search.js +51 -0
  162. package/dist/skill/search.js.map +1 -0
  163. package/dist/skill/signing.d.ts +16 -0
  164. package/dist/skill/signing.js +34 -0
  165. package/dist/skill/signing.js.map +1 -0
  166. package/dist/skill/ssrf.d.ts +27 -0
  167. package/dist/skill/ssrf.js +210 -0
  168. package/dist/skill/ssrf.js.map +1 -0
  169. package/dist/skill/store.d.ts +7 -0
  170. package/dist/skill/store.js +93 -0
  171. package/dist/skill/store.js.map +1 -0
  172. package/dist/stats/report.d.ts +26 -0
  173. package/dist/stats/report.js +157 -0
  174. package/dist/stats/report.js.map +1 -0
  175. package/dist/types.d.ts +214 -0
  176. package/dist/types.js +3 -0
  177. package/dist/types.js.map +1 -0
  178. package/package.json +58 -0
  179. package/src/auth/crypto.ts +92 -0
  180. package/src/auth/handoff.ts +229 -0
  181. package/src/auth/manager.ts +140 -0
  182. package/src/auth/oauth-refresh.ts +120 -0
  183. package/src/auth/refresh.ts +300 -0
  184. package/src/capture/anti-bot.ts +63 -0
  185. package/src/capture/blocklist.ts +75 -0
  186. package/src/capture/body-diff.ts +109 -0
  187. package/src/capture/body-variables.ts +156 -0
  188. package/src/capture/domain.ts +34 -0
  189. package/src/capture/entropy.ts +121 -0
  190. package/src/capture/filter.ts +56 -0
  191. package/src/capture/graphql.ts +124 -0
  192. package/src/capture/idle.ts +45 -0
  193. package/src/capture/monitor.ts +224 -0
  194. package/src/capture/oauth-detector.ts +106 -0
  195. package/src/capture/pagination.ts +49 -0
  196. package/src/capture/parameterize.ts +68 -0
  197. package/src/capture/scrubber.ts +49 -0
  198. package/src/capture/session.ts +502 -0
  199. package/src/capture/token-detector.ts +76 -0
  200. package/src/capture/verifier.ts +171 -0
  201. package/src/cli.ts +1031 -0
  202. package/src/discovery/auth.ts +99 -0
  203. package/src/discovery/fetch.ts +85 -0
  204. package/src/discovery/frameworks.ts +231 -0
  205. package/src/discovery/index.ts +256 -0
  206. package/src/discovery/openapi.ts +230 -0
  207. package/src/discovery/probes.ts +76 -0
  208. package/src/index.ts +26 -0
  209. package/src/inspect/report.ts +247 -0
  210. package/src/mcp.ts +618 -0
  211. package/src/orchestration/browse.ts +250 -0
  212. package/src/orchestration/cache.ts +37 -0
  213. package/src/plugin.ts +188 -0
  214. package/src/read/decoders/deepwiki.ts +180 -0
  215. package/src/read/decoders/grokipedia.ts +246 -0
  216. package/src/read/decoders/hackernews.ts +198 -0
  217. package/src/read/decoders/index.ts +15 -0
  218. package/src/read/decoders/reddit.ts +158 -0
  219. package/src/read/decoders/twitter.ts +211 -0
  220. package/src/read/decoders/wikipedia.ts +75 -0
  221. package/src/read/decoders/youtube.ts +75 -0
  222. package/src/read/extract.ts +396 -0
  223. package/src/read/index.ts +78 -0
  224. package/src/read/peek.ts +175 -0
  225. package/src/read/types.ts +37 -0
  226. package/src/replay/engine.ts +559 -0
  227. package/src/replay/truncate.ts +116 -0
  228. package/src/serve.ts +189 -0
  229. package/src/skill/generator.ts +473 -0
  230. package/src/skill/importer.ts +107 -0
  231. package/src/skill/search.ts +76 -0
  232. package/src/skill/signing.ts +36 -0
  233. package/src/skill/ssrf.ts +238 -0
  234. package/src/skill/store.ts +107 -0
  235. package/src/stats/report.ts +208 -0
  236. package/src/types.ts +233 -0
@@ -0,0 +1,396 @@
1
+ // src/read/extract.ts
2
+
3
/**
 * Metadata read from a document's markup by `parseHead`.
 * Each field is `null` when the corresponding tag was not found.
 */
export interface HeadMeta {
  /** Trimmed, entity-decoded text of the `<title>` element. */
  title: string | null;
  /** `content` of the `og:title` meta property. */
  ogTitle: string | null;
  /** `content` of the `og:description` meta property. */
  ogDescription: string | null;
  /** `content` of the `og:image` meta property. */
  ogImage: string | null;
  /** `content` of the `og:type` meta property. */
  ogType: string | null;
  /** `content` of the `og:site_name` meta property. */
  ogSiteName: string | null;
  /** `href` of the `<link rel="canonical">` element. */
  canonical: string | null;
  /** `content` of the `<meta name="author">` element. */
  author: string | null;
  /** `content` of the `article:published_time` meta property. */
  publishedTime: string | null;
}
14
+
15
/** Output of `extractContent`: the Markdown body plus everything collected while converting. */
export interface ExtractResult {
  /** Markdown rendering of the selected content root. */
  content: string;
  /** Links emitted into `content`, in conversion order. */
  links: { text: string; href: string }[];
  /** Images emitted into `content`, in conversion order. */
  images: { alt: string; src: string }[];
  /** True when the page has very little text and carries a known SPA marker. */
  isSpaShell: boolean;
}
21
+
22
+ // ---- HTML entity decoding ----
23
+
24
/** Entities with a fixed replacement. `&nbsp;` is deliberately folded to a plain space. */
const ENTITY_MAP: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
};

/**
 * Decodes HTML character references in `text` in a single pass.
 *
 * Handles the named entities listed in `ENTITY_MAP` and any decimal (`&#8217;`)
 * or hexadecimal (`&#x2019;`) numeric reference. Because the replacement is a
 * single pass, `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * References that are unknown, out of the Unicode range, zero, or inside the
 * surrogate range are left in the output unchanged.
 *
 * @param text - Text that may contain character references.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeEntities(text: string): string {
  return text.replace(/&(?:amp|lt|gt|quot|apos|nbsp|#\d+|#[xX][0-9a-fA-F]+);/g, (ref) => {
    const mapped = ENTITY_MAP[ref];
    if (mapped !== undefined) return mapped;

    // Only numeric references can reach this point; ref looks like "&#123;" or "&#x7b;".
    const isHex = ref[2] === 'x' || ref[2] === 'X';
    const codePoint = isHex ? parseInt(ref.slice(3, -1), 16) : parseInt(ref.slice(2, -1), 10);

    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    if (!Number.isFinite(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
      return ref;
    }
    // Keep the numeric no-break space consistent with the named `&nbsp;` mapping.
    if (codePoint === 0xa0) return ' ';
    return String.fromCodePoint(codePoint);
  });
}
37
+
38
+ // ---- parseHead ----
39
+
40
/**
 * Returns the entity-decoded `content` of the first `<meta>` tag whose
 * `attrName` attribute equals `attrValue` (case-insensitively).
 *
 * Each `<meta>` tag is parsed attribute by attribute, so the match does not
 * depend on attribute order, on extra attributes being present, or on which
 * quote character is used. A value quoted with `"` may contain `'` and vice versa.
 * Unquoted attribute values are not recognised.
 *
 * @param html - Markup to search.
 * @param attrName - Attribute that identifies the tag, e.g. `property` or `name`.
 * @param attrValue - Required value of that attribute, e.g. `og:title`.
 * @returns The decoded content, or `null` when no matching tag has a `content` attribute.
 */
function extractMetaContent(html: string, attrName: string, attrValue: string): string | null {
  const wantedAttr = attrName.toLowerCase();
  const wantedValue = new RegExp(`^${escapeRegex(attrValue)}$`, 'i');

  // Quoted values are consumed as a unit so a '>' inside one does not end the tag early.
  const metaTags = html.matchAll(/<meta\b(?:[^>"']|"[^"]*"|'[^']*')*>/gi);

  for (const [tag] of metaTags) {
    let keyMatches = false;
    let content: string | null = null;

    for (const attr of tag.matchAll(/([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
      const name = attr[1].toLowerCase();
      const value = attr[2] ?? attr[3] ?? '';
      if (name === wantedAttr) {
        if (wantedValue.test(value)) keyMatches = true;
      } else if (name === 'content' && content === null) {
        content = value;
      }
    }

    if (keyMatches && content !== null) return decodeEntities(content);
  }
  return null;
}
53
+
54
/**
 * Backslash-escapes every regular-expression metacharacter in `s` so the result
 * can be embedded in a `RegExp` source and match the text literally.
 */
function escapeRegex(s: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metacharacters, (ch) => `\\${ch}`);
}
57
+
58
/**
 * Collects title, canonical URL, Open Graph properties, author and publication
 * time from `html`. Fields that cannot be found are returned as `null`.
 *
 * @param html - Markup to scan (the whole string is searched, not only `<head>`).
 */
export function parseHead(html: string): HeadMeta {
  const property = (value: string): string | null => extractMetaContent(html, 'property', value);

  // <title>: first occurrence, trimmed before decoding.
  const titleHit = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);

  // <link rel="canonical">: accept rel-before-href first, then href-before-rel.
  const relFirst = /<link\s+[^>]*rel=["']canonical["'][^>]*href=["']([^"']*)["'][^>]*\/?>/i;
  const hrefFirst = /<link\s+[^>]*href=["']([^"']*)["'][^>]*rel=["']canonical["'][^>]*\/?>/i;
  const canonicalHit = relFirst.exec(html) ?? hrefFirst.exec(html);

  return {
    title: titleHit ? decodeEntities(titleHit[1].trim()) : null,
    ogTitle: property('og:title'),
    ogDescription: property('og:description'),
    ogImage: property('og:image'),
    ogType: property('og:type'),
    ogSiteName: property('og:site_name'),
    canonical: canonicalHit ? decodeEntities(canonicalHit[1]) : null,
    author: extractMetaContent(html, 'name', 'author'),
    publishedTime: property('article:published_time'),
  };
}
80
+
81
+ // ---- extractContent ----
82
+
83
/** Element names that `extractContent` removes, together with everything inside them. */
const NOISE_TAGS: string[] = [
  'script',
  'style',
  'noscript',
  'svg',
  'iframe',
  'nav',
  'header',
  'footer',
  'aside',
];
85
+
86
/**
 * Substrings whose presence in the raw HTML marks a page as a possible SPA shell.
 * They are matched with a plain `includes`, so each entry is case- and quote-sensitive.
 */
const SPA_MARKERS: string[] = [
  // Mount-point containers
  '<div id="root"',
  '<div id="app"',
  '<div id="__next"',
  // Bundle file names
  'bundle.js',
  'main.js',
  'app.js',
  // Serialized client state
  '__NEXT_DATA__',
  'window.__INITIAL_STATE__',
  'window.__NUXT__',
];
98
+
99
/**
 * Removes every element named in `tags` from `html`.
 *
 * For each tag name, in order, two passes run: the first deletes
 * `<tag ...>...</tag>` pairs (shortest match, case-insensitive), the second
 * deletes any remaining opening or self-closing `<tag ...>` left behind.
 *
 * @param html - Markup to clean.
 * @param tags - Element names to remove; they are inserted into the patterns as-is.
 * @returns The markup with those elements removed.
 */
function stripTags(html: string, tags: string[]): string {
  return tags.reduce((remaining, tag) => {
    const pairedElement = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'gi');
    const loneTag = new RegExp(`<${tag}[^>]*/?>`, 'gi');
    return remaining.replace(pairedElement, '').replace(loneTag, '');
  }, html);
}
111
+
112
/**
 * Picks the part of `html` most likely to hold the main content and returns its
 * inner markup.
 *
 * Candidates are tried in this order and the first hit wins:
 *   1. `<article>`, then `<main>` (shortest match up to the first closing tag);
 *   2. any element carrying `role="main"`;
 *   3. elements whose class list contains `post-content`, `article-body` or `entry-content`;
 *   4. an element with `id="content"`;
 *   5. `<body>`; and finally the input itself.
 *
 * Steps 2-4 locate the attribute and then use `extractTagContent`, which balances
 * nested tags of the same name, so inner `<div>`s do not cut the result short.
 *
 * @param html - Markup to search.
 * @returns Inner markup of the selected element, or `html` when nothing matches.
 */
function findContentRoot(html: string): string {
  const semanticElements = [
    /<article[^>]*>([\s\S]*?)<\/article>/i,
    /<main[^>]*>([\s\S]*?)<\/main>/i,
  ];
  for (const re of semanticElements) {
    const m = html.match(re);
    if (m) return m[1];
  }

  // role="main" can sit on any element and usually wraps nested markup, so it
  // needs nesting-aware extraction rather than "up to the first </div>".
  const roleMain = html.match(/\srole=["']main["']/i);
  if (roleMain && roleMain.index !== undefined) {
    const extracted = extractTagContent(html, roleMain.index);
    if (extracted) return extracted;
    // Unbalanced markup: fall back to the shortest-match pattern used previously.
    const legacy = html.match(/<[^>]+role=["']main["'][^>]*>([\s\S]*?)<\/div>/i);
    if (legacy) return legacy[1];
  }

  // Class- and id-based markers, in priority order.
  const attributeMarkers = [
    /class=["'][^"']*\bpost-content\b/i,
    /class=["'][^"']*\barticle-body\b/i,
    /class=["'][^"']*\bentry-content\b/i,
    /id=["']content["']/i,
  ];
  for (const marker of attributeMarkers) {
    const m = html.match(marker);
    if (m && m.index !== undefined) {
      const extracted = extractTagContent(html, m.index);
      if (extracted) return extracted;
    }
  }

  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  if (bodyMatch) return bodyMatch[1];

  return html;
}

/**
 * Returns the inner markup of the element whose opening tag contains position
 * `attrIndex`, balancing nested elements of the same name.
 *
 * @param html - Markup being searched.
 * @param attrIndex - Index of any character inside the element's opening tag.
 * @returns The inner markup, or `null` when no tag name can be read at the
 *   located `<`, the opening tag never ends, the content is empty, or no
 *   balancing close tag exists.
 */
function extractTagContent(html: string, attrIndex: number): string | null {
  // Walk backwards to the '<' that starts the tag holding the attribute.
  let tagStart = attrIndex;
  while (tagStart > 0 && html[tagStart] !== '<') tagStart--;

  const tagNameMatch = html.slice(tagStart).match(/^<(\w+)/);
  if (!tagNameMatch) return null;
  const tagName = tagNameMatch[1];

  // The opening tag ends at the first '>' after its start.
  const openTagEnd = html.indexOf('>', tagStart);
  if (openTagEnd === -1) return null;

  const openRe = new RegExp(`<${tagName}[\\s>]`, 'gi');
  const closeRe = new RegExp(`</${tagName}>`, 'gi');

  const contentStart = openTagEnd + 1;
  let pos = contentStart;
  let depth = 1;

  while (depth > 0 && pos < html.length) {
    openRe.lastIndex = pos;
    closeRe.lastIndex = pos;

    const nextOpen = openRe.exec(html);
    const nextClose = closeRe.exec(html);

    if (!nextClose) break; // no close tag left, so the element is unbalanced

    if (nextOpen && nextOpen.index < nextClose.index) {
      // A nested element of the same name opens before the next close.
      depth++;
      pos = nextOpen.index + nextOpen[0].length;
    } else {
      depth--;
      if (depth === 0) {
        return html.slice(contentStart, nextClose.index);
      }
      pos = nextClose.index + nextClose[0].length;
    }
  }

  return null;
}
202
+
203
/**
 * Converts an HTML fragment to Markdown with a fixed sequence of regex passes:
 * comments, headings, blockquotes, code, tables, images, links, bold, italic,
 * lists, paragraphs, line breaks, then a final tag strip and entity decode.
 *
 * Code text (`<pre>` and `<code>`) is decoded once and parked behind a
 * placeholder until the very end. Without this, decoded code such as
 * `<div>` would be removed by the final tag strip and decoded a second time.
 *
 * Bold, italic and paragraph patterns require the tag name to be followed by
 * whitespace or `>`, so `<br>`, `<button>`, `<picture>` and similar are not
 * mistaken for `<b>` or `<p>`.
 *
 * @param html - Fragment to convert.
 * @param links - Receives `{ text, href }` for every link that has both.
 * @param images - Receives `{ alt, src }` for every image with a non-empty `src`.
 * @returns Trimmed Markdown with at most one blank line between blocks
 *   (code text is restored verbatim).
 */
function htmlToMarkdown(
  html: string,
  links: Array<{ text: string; href: string }>,
  images: Array<{ alt: string; src: string }>,
): string {
  // Parked code text, indexed by the number embedded in its placeholder.
  const codeSpans: string[] = [];
  const protect = (code: string): string => {
    codeSpans.push(code);
    return `\u0000${codeSpans.length - 1}\u0000`;
  };
  const restore = (text: string): string =>
    text.replace(/\u0000(\d+)\u0000/g, (_m: string, index: string) => codeSpans[Number(index)] ?? '');

  // NUL delimits placeholders, so drop any that arrive in the input.
  let md = html.split('\u0000').join('');

  // Remove HTML comments
  md = md.replace(/<!--[\s\S]*?-->/g, '');

  // Convert headings
  for (let level = 1; level <= 6; level++) {
    const prefix = '#'.repeat(level);
    const re = new RegExp(`<h${level}[^>]*>([\\s\\S]*?)<\\/h${level}>`, 'gi');
    md = md.replace(re, (_m: string, content: string) => {
      const text = stripAllTags(content).trim();
      return `\n\n${prefix} ${text}\n\n`;
    });
  }

  // Convert blockquotes (before paragraphs)
  md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m: string, content: string) => {
    const text = stripAllTags(content).trim();
    const quoted = text.split('\n').map((l: string) => `> ${l}`).join('\n');
    return `\n\n${quoted}\n\n`;
  });

  // Convert code blocks: <pre><code>...</code></pre>.
  // Markup such as highlighter <span>s is stripped before decoding, so encoded
  // angle brackets in the code itself survive.
  md = md.replace(/<pre[^>]*>\s*<code[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m: string, content: string) => {
    const code = decodeEntities(stripAllTags(content).trim());
    return `\n\n\`\`\`\n${protect(code)}\n\`\`\`\n\n`;
  });

  // Convert standalone <pre> (without <code>)
  md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_m: string, content: string) => {
    const code = decodeEntities(stripAllTags(content).trim());
    return `\n\n\`\`\`\n${protect(code)}\n\`\`\`\n\n`;
  });

  // Convert inline code
  md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (_m: string, content: string) => {
    return `\`${protect(decodeEntities(stripAllTags(content)))}\``;
  });

  // Convert tables
  md = md.replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (_m: string, tableContent: string) => {
    return convertTable(tableContent);
  });

  // Convert images (before stripping tags, so we can extract src/alt)
  md = md.replace(/<img\s+[^>]*>/gi, (tag: string) => {
    const srcMatch = tag.match(/src=["']([^"']*)["']/i);
    const altMatch = tag.match(/alt=["']([^"']*)["']/i);
    const src = srcMatch ? decodeEntities(srcMatch[1]) : '';
    const alt = altMatch ? decodeEntities(altMatch[1]) : '';
    if (src) {
      images.push({ alt, src });
      return `![${alt}](${src})`;
    }
    return '';
  });

  // Convert links
  md = md.replace(
    /<a\s+[^>]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi,
    (_m: string, href: string, content: string) => {
      const text = stripAllTags(content).trim();
      const decodedHref = decodeEntities(href);
      if (text && decodedHref) {
        // The collected link gets real code text; the Markdown keeps the
        // placeholder so the code stays protected from the later passes.
        links.push({ text: restore(text), href: decodedHref });
        return `[${text}](${decodedHref})`;
      }
      return text;
    },
  );

  // Convert bold. The closing tag must match the opening tag name.
  md = md.replace(/<(strong|b)(?:\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m: string, _tag: string, content: string) => {
    return `**${stripAllTags(content)}**`;
  });

  // Convert italic. The closing tag must match the opening tag name.
  md = md.replace(/<(em|i)(?:\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m: string, _tag: string, content: string) => {
    return `*${stripAllTags(content)}*`;
  });

  // Convert ordered lists
  md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (_m: string, listContent: string) => {
    let counter = 0;
    const items = listContent.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_lm: string, item: string) => {
      counter++;
      return `${counter}. ${stripAllTags(item).trim()}\n`;
    });
    return `\n\n${stripAllTags(items).trim()}\n\n`;
  });

  // Convert unordered lists
  md = md.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, (_m: string, listContent: string) => {
    const items = listContent.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_lm: string, item: string) => {
      return `- ${stripAllTags(item).trim()}\n`;
    });
    return `\n\n${stripAllTags(items).trim()}\n\n`;
  });

  // Convert paragraphs
  md = md.replace(/<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/gi, (_m: string, content: string) => {
    return `\n\n${content.trim()}\n\n`;
  });

  // Convert <br> tags
  md = md.replace(/<br\s*\/?>/gi, '\n');

  // Strip remaining HTML tags
  md = stripAllTags(md);

  // Decode entities
  md = decodeEntities(md);

  // Collapse whitespace: no more than 2 consecutive newlines
  md = md.replace(/\n{3,}/g, '\n\n');

  // Put the parked code text back only now, so none of the passes above touch it.
  md = restore(md);

  // Trim leading/trailing whitespace
  return md.trim();
}
326
+
327
/**
 * Deletes everything that looks like a tag (`<` up to the next `>`) from `html`,
 * leaving the text between tags untouched. Entities are not decoded.
 */
function stripAllTags(html: string): string {
  const textBetweenTags = html.split(/<[^>]*>/);
  return textBetweenTags.join('');
}
330
+
331
/**
 * Renders the inner markup of a `<table>` as a Markdown pipe table.
 *
 * The first row becomes the header, followed by a `---` separator row. Shorter
 * rows are padded with empty cells to the widest row. Cell text is reduced to a
 * single line and literal `|` characters are escaped as `\|`, because a raw
 * pipe or newline inside a cell would split the Markdown row.
 *
 * @param tableHtml - Markup found between `<table ...>` and `</table>`.
 * @returns The table surrounded by blank lines, or `''` when no cells are found.
 */
function convertTable(tableHtml: string): string {
  const cellText = (cellHtml: string): string =>
    stripAllTags(cellHtml.replace(/<\/?(?:td|th)[^>]*>/gi, ''))
      .replace(/\s+/g, ' ')
      .trim()
      .replace(/\|/g, '\\|');

  // Extract rows, skipping any row without cells.
  const rows: string[][] = [];
  for (const row of tableHtml.match(/<tr[^>]*>[\s\S]*?<\/tr>/gi) ?? []) {
    const cells = (row.match(/<(?:td|th)[^>]*>[\s\S]*?<\/(?:td|th)>/gi) ?? []).map(cellText);
    if (cells.length > 0) rows.push(cells);
  }

  if (rows.length === 0) return '';

  // Normalize column count
  const maxCols = Math.max(...rows.map((r) => r.length));
  const normalized = rows.map((r) => {
    const padded = [...r];
    while (padded.length < maxCols) padded.push('');
    return padded;
  });

  // Build markdown table
  const toLine = (cells: string[]): string => '| ' + cells.join(' | ') + ' |';
  const [header, ...bodyRows] = normalized;
  const lines = [toLine(header), toLine(header.map(() => '---')), ...bodyRows.map(toLine)];

  return '\n\n' + lines.join('\n') + '\n\n';
}
367
+
368
/**
 * Returns the visible text of `html`: tags removed, every whitespace run
 * collapsed to a single space, and both ends trimmed.
 */
function getTextContent(html: string): string {
  const withoutTags = stripAllTags(html);
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
371
+
372
/**
 * Generic HTML-to-Markdown extraction.
 *
 * Steps: remove `NOISE_TAGS` elements, select a content root, convert it to
 * Markdown (collecting links and images along the way), then flag the page as
 * an SPA shell when the root has fewer than 200 characters of text and the raw
 * HTML contains one of `SPA_MARKERS`.
 *
 * @param html - Full page markup.
 * @returns Markdown content, collected links and images, and the SPA-shell flag.
 */
export function extractContent(html: string): ExtractResult {
  const collectedLinks: Array<{ text: string; href: string }> = [];
  const collectedImages: Array<{ alt: string; src: string }> = [];

  const rootHtml = findContentRoot(stripTags(html, NOISE_TAGS));
  const markdown = htmlToMarkdown(rootHtml, collectedLinks, collectedImages);

  // Text length is measured on the content root; markers are looked up in the raw page.
  const visibleTextLength = getTextContent(rootHtml).length;
  const markerPresent = SPA_MARKERS.some((marker) => html.includes(marker));

  return {
    content: markdown,
    links: collectedLinks,
    images: collectedImages,
    isSpaShell: visibleTextLength < 200 && markerPresent,
  };
}
@@ -0,0 +1,78 @@
1
+ // src/read/index.ts
2
+ export { peek } from './peek.js';
3
+ export type { PeekOptions } from './peek.js';
4
+ export type { PeekResult, ReadResult, Decoder } from './types.js';
5
+
6
+ import type { ReadResult } from './types.js';
7
+ import { safeFetch } from '../discovery/fetch.js';
8
+ import { findDecoder } from './decoders/index.js';
9
+ import { parseHead, extractContent } from './extract.js';
10
+
11
/** Options accepted by `read`. */
export interface ReadOptions {
  /** Forwarded unchanged to the decoder and to `safeFetch`. */
  skipSsrf?: boolean;
  /**
   * Upper bound on the returned content, applied with `String.prototype.slice`,
   * so it counts UTF-16 code units rather than encoded bytes.
   * A missing or zero value means no limit.
   */
  maxBytes?: number;
}
15
+
16
/**
 * Reads `url` and returns its content as Markdown plus metadata.
 *
 * A site-specific decoder selected by `findDecoder` is tried first. When there
 * is no decoder, or it yields nothing, the page is fetched and run through the
 * generic pipeline (`parseHead` + `extractContent`).
 *
 * `metadata.source` on the generic path is `'spa-shell'`, `'og-tags-only'`
 * (no body content extracted) or `'readability'`.
 *
 * @param url - Address to read.
 * @param options - See `ReadOptions`; `maxBytes` truncates `content` on both paths.
 * @returns The result, or `null` when the fetch fails or the status is not 200.
 */
export async function read(url: string, options: ReadOptions = {}): Promise<ReadResult | null> {
  const { skipSsrf, maxBytes } = options;

  // Site-specific path
  const decoder = findDecoder(url);
  if (decoder) {
    const decoded = await decoder.decode(url, { skipSsrf });
    if (decoded) {
      if (maxBytes && decoded.content.length > maxBytes) {
        decoded.content = decoded.content.slice(0, maxBytes);
        // Token estimate follows the truncated content.
        decoded.cost.tokens = Math.ceil(decoded.content.length / 4);
      }
      return decoded;
    }
    // A decoder that yields nothing hands over to the generic pipeline below.
  }

  // Generic path: fetch HTML, then parse head and body
  const response = await safeFetch(url, { skipSsrf });
  if (!response || response.status !== 200) return null;

  const head = parseHead(response.body);
  const body = extractContent(response.body);

  const source: string = body.isSpaShell
    ? 'spa-shell'
    : body.content.trim().length === 0
      ? 'og-tags-only'
      : 'readability';

  const content =
    maxBytes && body.content.length > maxBytes ? body.content.slice(0, maxBytes) : body.content;

  return {
    url,
    title: head.ogTitle || head.title || null,
    author: head.author || null,
    description: head.ogDescription || null,
    content,
    links: body.links,
    images: body.images,
    metadata: {
      type: head.ogType || 'unknown',
      publishedAt: head.publishedTime || null,
      source,
      canonical: head.canonical || null,
      siteName: head.ogSiteName || null,
    },
    cost: { tokens: Math.ceil(content.length / 4) },
  };
}
@@ -0,0 +1,175 @@
1
+ // src/read/peek.ts
2
+ import type { PeekResult } from './types.js';
3
+ import { safeFetch } from '../discovery/fetch.js';
4
+
5
/** Options accepted by `peek`. */
export interface PeekOptions {
  /** Forwarded unchanged to `safeFetch` on every request `peek` makes. */
  skipSsrf?: boolean;
}
8
+
9
/**
 * Header-based triage of a URL: reports status, content type, server, and any
 * bot protection or framework recognisable from the response headers.
 *
 * A HEAD request is tried first. GET is used instead when HEAD yields no
 * response at all, or when the server answers HEAD with 405 (Method Not
 * Allowed) or 501 (Not Implemented), since those statuses describe the method
 * rather than the resource. If that GET yields no response, the HEAD response
 * is kept.
 *
 * @param url - Address to probe.
 * @param options - See `PeekOptions`.
 * @returns A `PeekResult`. When no response could be obtained the status is 0,
 *   `accessible` is false and the recommendation is `'blocked'`.
 */
export async function peek(url: string, options: PeekOptions = {}): Promise<PeekResult> {
  const signals: string[] = [];

  // Try HEAD first
  let result = await safeFetch(url, {
    method: 'HEAD',
    skipSsrf: options.skipSsrf,
  });

  // Fall back to GET when HEAD produced nothing (null = network/SSRF error)
  // or the server does not support HEAD for this resource.
  if (!result || result.status === 405 || result.status === 501) {
    const viaGet = await safeFetch(url, {
      method: 'GET',
      skipSsrf: options.skipSsrf,
    });
    result = viaGet ?? result;
  }

  // Neither request produced a response
  if (!result) {
    return {
      url,
      status: 0,
      accessible: false,
      contentType: null,
      server: null,
      framework: null,
      botProtection: null,
      signals: ['fetch failed'],
      recommendation: 'blocked',
    };
  }

  const { status, headers } = result;

  // Extract basic metadata
  const contentType = headers['content-type'] || null;
  const server = headers['server'] || null;

  // Both detectors append a description of what they matched to `signals`.
  const botProtection = detectBotProtection(headers, signals);
  const framework = detectFramework(headers, signals);

  // Accessible means a 2xx/3xx status with no bot-protection headers seen.
  const accessible = status >= 200 && status < 400 && !botProtection;
  const recommendation = computeRecommendation(status, botProtection);

  return {
    url,
    status,
    accessible,
    contentType,
    server,
    framework,
    botProtection,
    signals,
    recommendation,
  };
}
73
+
74
/**
 * Names the bot-protection vendor suggested by the response headers.
 *
 * Checks run in this order and stop at the first hit: Cloudflare (`cf-ray`,
 * then `cf-cache-status`), PerimeterX (any `x-px-*` header), DataDome (any
 * `x-datadome*` header). Header names are compared exactly as given.
 *
 * @param headers - Response headers keyed by name.
 * @param signals - Receives one entry describing the header that matched.
 * @returns `'cloudflare'`, `'perimeterx'`, `'datadome'`, or `null` when nothing matched.
 */
function detectBotProtection(
  headers: Record<string, string>,
  signals: string[],
): string | null {
  // Cloudflare: exact header names, which must carry a non-empty value.
  for (const name of ['cf-ray', 'cf-cache-status']) {
    if (headers[name]) {
      signals.push(`${name} header`);
      return 'cloudflare';
    }
  }

  const headerNames = Object.keys(headers);

  // PerimeterX: x-px-* headers
  const perimeterXHeader = headerNames.find((name) => name.startsWith('x-px-'));
  if (perimeterXHeader !== undefined) {
    signals.push(`${perimeterXHeader} header`);
    return 'perimeterx';
  }

  // DataDome: x-datadome* headers
  const dataDomeHeader = headerNames.find((name) => name.startsWith('x-datadome'));
  if (dataDomeHeader !== undefined) {
    signals.push(`${dataDomeHeader} header`);
    return 'datadome';
  }

  return null;
}
106
+
107
/**
 * Names the framework or platform suggested by the response headers.
 *
 * Checks run in this order and stop at the first hit: `x-powered-by`
 * containing Next.js, Express, or PHP (case-insensitive); a `link` header
 * mentioning `api.w.org` (WordPress); `x-shopify-stage` (Shopify); any
 * `x-drupal-*` header (Drupal).
 *
 * @param headers - Response headers keyed by name.
 * @param signals - Receives one entry describing what matched.
 * @returns The framework identifier, or `null` when nothing matched.
 */
function detectFramework(
  headers: Record<string, string>,
  signals: string[],
): string | null {
  const report = (signal: string, framework: string): string => {
    signals.push(signal);
    return framework;
  };

  // x-powered-by rules: [pattern, signal text, framework id], in priority order.
  const poweredBy = headers['x-powered-by'];
  if (poweredBy) {
    const poweredByRules: Array<[RegExp, string, string]> = [
      [/next\.js/i, 'x-powered-by: Next.js', 'next.js'],
      [/express/i, 'x-powered-by: Express', 'express'],
      [/php/i, 'x-powered-by: PHP', 'php'],
    ];
    for (const [pattern, signal, framework] of poweredByRules) {
      if (pattern.test(poweredBy)) return report(signal, framework);
    }
  }

  // WordPress: link header containing api.w.org
  const link = headers['link'];
  if (link && link.includes('api.w.org')) return report('link: api.w.org', 'wordpress');

  // Shopify: x-shopify-stage header
  if (headers['x-shopify-stage']) return report('x-shopify-stage header', 'shopify');

  // Drupal: x-drupal-* headers
  const drupalHeader = Object.keys(headers).find((name) => name.startsWith('x-drupal-'));
  if (drupalHeader !== undefined) return report(`${drupalHeader} header`, 'drupal');

  return null;
}
153
+
154
/**
 * Maps a status code and bot-protection finding to a next-step recommendation.
 *
 * 401 and 407 take precedence and yield `'auth_required'`. Otherwise detected
 * bot protection, 403, 429, or any status of 500 and above yields `'blocked'`.
 * Everything else yields `'read'`.
 *
 * @param status - HTTP status code of the probe.
 * @param botProtection - Vendor name from `detectBotProtection`, or `null`.
 */
function computeRecommendation(
  status: number,
  botProtection: string | null,
): PeekResult['recommendation'] {
  const needsAuth = status === 401 || status === 407;
  if (needsAuth) return 'auth_required';

  const isBlocked =
    Boolean(botProtection) || status === 403 || status === 429 || status >= 500;
  return isBlocked ? 'blocked' : 'read';
}
@@ -0,0 +1,37 @@
1
+ // src/read/types.ts
2
+
3
/** Outcome of a header-based probe of a URL. */
export interface PeekResult {
  /** The URL that was probed. */
  url: string;
  /** HTTP status code of the response. */
  status: number;
  /** Whether the resource is considered directly readable. */
  accessible: boolean;
  /** Value of the `content-type` response header, if any. */
  contentType: string | null;
  /** Value of the `server` response header, if any. */
  server: string | null;
  /** Detected framework or platform identifier, if any. */
  framework: string | null;
  /** Detected bot-protection vendor identifier, if any. */
  botProtection: string | null;
  /** Human-readable notes on what the detections were based on. */
  signals: string[];
  /** Suggested next step for the caller. */
  recommendation: 'read' | 'capture' | 'auth_required' | 'blocked';
}
14
+
15
/** Content and metadata produced by reading a URL. */
export interface ReadResult {
  /** The URL that was read. */
  url: string;
  title: string | null;
  author: string | null;
  description: string | null;
  /** Extracted body text. */
  content: string;
  /** Links found in the content. */
  links: { text: string; href: string }[];
  /** Images found in the content. */
  images: { alt: string; src: string }[];
  metadata: {
    /** Content type label. */
    type: string;
    publishedAt: string | null;
    /** Identifies which extraction path produced the result. */
    source: string;
    canonical: string | null;
    siteName: string | null;
  };
  /** Size estimate for the returned content. */
  cost: { tokens: number };
}
32
+
33
/** A site-specific reader that can turn certain URLs into a `ReadResult`. */
export interface Decoder {
  /** Identifier of the decoder. */
  name: string;
  /** URL patterns associated with this decoder. */
  patterns: RegExp[];
  /**
   * Decodes `url`.
   *
   * @param url - Address to decode.
   * @param options - `skipSsrf` plus any decoder-specific extras.
   * @returns The decoded result, or `null` when nothing could be produced.
   */
  decode(
    url: string,
    options?: { skipSsrf?: boolean; [key: string]: any },
  ): Promise<ReadResult | null>;
}