@apitap/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236) hide show
  1. package/LICENSE +60 -0
  2. package/README.md +362 -0
  3. package/SKILL.md +270 -0
  4. package/dist/auth/crypto.d.ts +31 -0
  5. package/dist/auth/crypto.js +66 -0
  6. package/dist/auth/crypto.js.map +1 -0
  7. package/dist/auth/handoff.d.ts +29 -0
  8. package/dist/auth/handoff.js +180 -0
  9. package/dist/auth/handoff.js.map +1 -0
  10. package/dist/auth/manager.d.ts +46 -0
  11. package/dist/auth/manager.js +127 -0
  12. package/dist/auth/manager.js.map +1 -0
  13. package/dist/auth/oauth-refresh.d.ts +16 -0
  14. package/dist/auth/oauth-refresh.js +91 -0
  15. package/dist/auth/oauth-refresh.js.map +1 -0
  16. package/dist/auth/refresh.d.ts +43 -0
  17. package/dist/auth/refresh.js +217 -0
  18. package/dist/auth/refresh.js.map +1 -0
  19. package/dist/capture/anti-bot.d.ts +15 -0
  20. package/dist/capture/anti-bot.js +43 -0
  21. package/dist/capture/anti-bot.js.map +1 -0
  22. package/dist/capture/blocklist.d.ts +6 -0
  23. package/dist/capture/blocklist.js +70 -0
  24. package/dist/capture/blocklist.js.map +1 -0
  25. package/dist/capture/body-diff.d.ts +8 -0
  26. package/dist/capture/body-diff.js +102 -0
  27. package/dist/capture/body-diff.js.map +1 -0
  28. package/dist/capture/body-variables.d.ts +13 -0
  29. package/dist/capture/body-variables.js +142 -0
  30. package/dist/capture/body-variables.js.map +1 -0
  31. package/dist/capture/domain.d.ts +8 -0
  32. package/dist/capture/domain.js +34 -0
  33. package/dist/capture/domain.js.map +1 -0
  34. package/dist/capture/entropy.d.ts +33 -0
  35. package/dist/capture/entropy.js +100 -0
  36. package/dist/capture/entropy.js.map +1 -0
  37. package/dist/capture/filter.d.ts +11 -0
  38. package/dist/capture/filter.js +49 -0
  39. package/dist/capture/filter.js.map +1 -0
  40. package/dist/capture/graphql.d.ts +21 -0
  41. package/dist/capture/graphql.js +99 -0
  42. package/dist/capture/graphql.js.map +1 -0
  43. package/dist/capture/idle.d.ts +23 -0
  44. package/dist/capture/idle.js +44 -0
  45. package/dist/capture/idle.js.map +1 -0
  46. package/dist/capture/monitor.d.ts +26 -0
  47. package/dist/capture/monitor.js +183 -0
  48. package/dist/capture/monitor.js.map +1 -0
  49. package/dist/capture/oauth-detector.d.ts +18 -0
  50. package/dist/capture/oauth-detector.js +96 -0
  51. package/dist/capture/oauth-detector.js.map +1 -0
  52. package/dist/capture/pagination.d.ts +9 -0
  53. package/dist/capture/pagination.js +40 -0
  54. package/dist/capture/pagination.js.map +1 -0
  55. package/dist/capture/parameterize.d.ts +17 -0
  56. package/dist/capture/parameterize.js +63 -0
  57. package/dist/capture/parameterize.js.map +1 -0
  58. package/dist/capture/scrubber.d.ts +5 -0
  59. package/dist/capture/scrubber.js +38 -0
  60. package/dist/capture/scrubber.js.map +1 -0
  61. package/dist/capture/session.d.ts +46 -0
  62. package/dist/capture/session.js +445 -0
  63. package/dist/capture/session.js.map +1 -0
  64. package/dist/capture/token-detector.d.ts +16 -0
  65. package/dist/capture/token-detector.js +62 -0
  66. package/dist/capture/token-detector.js.map +1 -0
  67. package/dist/capture/verifier.d.ts +17 -0
  68. package/dist/capture/verifier.js +147 -0
  69. package/dist/capture/verifier.js.map +1 -0
  70. package/dist/cli.d.ts +2 -0
  71. package/dist/cli.js +930 -0
  72. package/dist/cli.js.map +1 -0
  73. package/dist/discovery/auth.d.ts +17 -0
  74. package/dist/discovery/auth.js +81 -0
  75. package/dist/discovery/auth.js.map +1 -0
  76. package/dist/discovery/fetch.d.ts +17 -0
  77. package/dist/discovery/fetch.js +59 -0
  78. package/dist/discovery/fetch.js.map +1 -0
  79. package/dist/discovery/frameworks.d.ts +11 -0
  80. package/dist/discovery/frameworks.js +249 -0
  81. package/dist/discovery/frameworks.js.map +1 -0
  82. package/dist/discovery/index.d.ts +21 -0
  83. package/dist/discovery/index.js +219 -0
  84. package/dist/discovery/index.js.map +1 -0
  85. package/dist/discovery/openapi.d.ts +13 -0
  86. package/dist/discovery/openapi.js +175 -0
  87. package/dist/discovery/openapi.js.map +1 -0
  88. package/dist/discovery/probes.d.ts +9 -0
  89. package/dist/discovery/probes.js +70 -0
  90. package/dist/discovery/probes.js.map +1 -0
  91. package/dist/index.d.ts +25 -0
  92. package/dist/index.js +25 -0
  93. package/dist/index.js.map +1 -0
  94. package/dist/inspect/report.d.ts +52 -0
  95. package/dist/inspect/report.js +191 -0
  96. package/dist/inspect/report.js.map +1 -0
  97. package/dist/mcp.d.ts +8 -0
  98. package/dist/mcp.js +526 -0
  99. package/dist/mcp.js.map +1 -0
  100. package/dist/orchestration/browse.d.ts +38 -0
  101. package/dist/orchestration/browse.js +198 -0
  102. package/dist/orchestration/browse.js.map +1 -0
  103. package/dist/orchestration/cache.d.ts +15 -0
  104. package/dist/orchestration/cache.js +24 -0
  105. package/dist/orchestration/cache.js.map +1 -0
  106. package/dist/plugin.d.ts +17 -0
  107. package/dist/plugin.js +158 -0
  108. package/dist/plugin.js.map +1 -0
  109. package/dist/read/decoders/deepwiki.d.ts +2 -0
  110. package/dist/read/decoders/deepwiki.js +148 -0
  111. package/dist/read/decoders/deepwiki.js.map +1 -0
  112. package/dist/read/decoders/grokipedia.d.ts +2 -0
  113. package/dist/read/decoders/grokipedia.js +210 -0
  114. package/dist/read/decoders/grokipedia.js.map +1 -0
  115. package/dist/read/decoders/hackernews.d.ts +2 -0
  116. package/dist/read/decoders/hackernews.js +168 -0
  117. package/dist/read/decoders/hackernews.js.map +1 -0
  118. package/dist/read/decoders/index.d.ts +2 -0
  119. package/dist/read/decoders/index.js +12 -0
  120. package/dist/read/decoders/index.js.map +1 -0
  121. package/dist/read/decoders/reddit.d.ts +2 -0
  122. package/dist/read/decoders/reddit.js +142 -0
  123. package/dist/read/decoders/reddit.js.map +1 -0
  124. package/dist/read/decoders/twitter.d.ts +12 -0
  125. package/dist/read/decoders/twitter.js +187 -0
  126. package/dist/read/decoders/twitter.js.map +1 -0
  127. package/dist/read/decoders/wikipedia.d.ts +2 -0
  128. package/dist/read/decoders/wikipedia.js +66 -0
  129. package/dist/read/decoders/wikipedia.js.map +1 -0
  130. package/dist/read/decoders/youtube.d.ts +2 -0
  131. package/dist/read/decoders/youtube.js +69 -0
  132. package/dist/read/decoders/youtube.js.map +1 -0
  133. package/dist/read/extract.d.ts +25 -0
  134. package/dist/read/extract.js +320 -0
  135. package/dist/read/extract.js.map +1 -0
  136. package/dist/read/index.d.ts +14 -0
  137. package/dist/read/index.js +66 -0
  138. package/dist/read/index.js.map +1 -0
  139. package/dist/read/peek.d.ts +9 -0
  140. package/dist/read/peek.js +137 -0
  141. package/dist/read/peek.js.map +1 -0
  142. package/dist/read/types.d.ts +44 -0
  143. package/dist/read/types.js +3 -0
  144. package/dist/read/types.js.map +1 -0
  145. package/dist/replay/engine.d.ts +53 -0
  146. package/dist/replay/engine.js +441 -0
  147. package/dist/replay/engine.js.map +1 -0
  148. package/dist/replay/truncate.d.ts +16 -0
  149. package/dist/replay/truncate.js +92 -0
  150. package/dist/replay/truncate.js.map +1 -0
  151. package/dist/serve.d.ts +31 -0
  152. package/dist/serve.js +149 -0
  153. package/dist/serve.js.map +1 -0
  154. package/dist/skill/generator.d.ts +44 -0
  155. package/dist/skill/generator.js +419 -0
  156. package/dist/skill/generator.js.map +1 -0
  157. package/dist/skill/importer.d.ts +26 -0
  158. package/dist/skill/importer.js +80 -0
  159. package/dist/skill/importer.js.map +1 -0
  160. package/dist/skill/search.d.ts +19 -0
  161. package/dist/skill/search.js +51 -0
  162. package/dist/skill/search.js.map +1 -0
  163. package/dist/skill/signing.d.ts +16 -0
  164. package/dist/skill/signing.js +34 -0
  165. package/dist/skill/signing.js.map +1 -0
  166. package/dist/skill/ssrf.d.ts +27 -0
  167. package/dist/skill/ssrf.js +210 -0
  168. package/dist/skill/ssrf.js.map +1 -0
  169. package/dist/skill/store.d.ts +7 -0
  170. package/dist/skill/store.js +93 -0
  171. package/dist/skill/store.js.map +1 -0
  172. package/dist/stats/report.d.ts +26 -0
  173. package/dist/stats/report.js +157 -0
  174. package/dist/stats/report.js.map +1 -0
  175. package/dist/types.d.ts +214 -0
  176. package/dist/types.js +3 -0
  177. package/dist/types.js.map +1 -0
  178. package/package.json +58 -0
  179. package/src/auth/crypto.ts +92 -0
  180. package/src/auth/handoff.ts +229 -0
  181. package/src/auth/manager.ts +140 -0
  182. package/src/auth/oauth-refresh.ts +120 -0
  183. package/src/auth/refresh.ts +300 -0
  184. package/src/capture/anti-bot.ts +63 -0
  185. package/src/capture/blocklist.ts +75 -0
  186. package/src/capture/body-diff.ts +109 -0
  187. package/src/capture/body-variables.ts +156 -0
  188. package/src/capture/domain.ts +34 -0
  189. package/src/capture/entropy.ts +121 -0
  190. package/src/capture/filter.ts +56 -0
  191. package/src/capture/graphql.ts +124 -0
  192. package/src/capture/idle.ts +45 -0
  193. package/src/capture/monitor.ts +224 -0
  194. package/src/capture/oauth-detector.ts +106 -0
  195. package/src/capture/pagination.ts +49 -0
  196. package/src/capture/parameterize.ts +68 -0
  197. package/src/capture/scrubber.ts +49 -0
  198. package/src/capture/session.ts +502 -0
  199. package/src/capture/token-detector.ts +76 -0
  200. package/src/capture/verifier.ts +171 -0
  201. package/src/cli.ts +1031 -0
  202. package/src/discovery/auth.ts +99 -0
  203. package/src/discovery/fetch.ts +85 -0
  204. package/src/discovery/frameworks.ts +231 -0
  205. package/src/discovery/index.ts +256 -0
  206. package/src/discovery/openapi.ts +230 -0
  207. package/src/discovery/probes.ts +76 -0
  208. package/src/index.ts +26 -0
  209. package/src/inspect/report.ts +247 -0
  210. package/src/mcp.ts +618 -0
  211. package/src/orchestration/browse.ts +250 -0
  212. package/src/orchestration/cache.ts +37 -0
  213. package/src/plugin.ts +188 -0
  214. package/src/read/decoders/deepwiki.ts +180 -0
  215. package/src/read/decoders/grokipedia.ts +246 -0
  216. package/src/read/decoders/hackernews.ts +198 -0
  217. package/src/read/decoders/index.ts +15 -0
  218. package/src/read/decoders/reddit.ts +158 -0
  219. package/src/read/decoders/twitter.ts +211 -0
  220. package/src/read/decoders/wikipedia.ts +75 -0
  221. package/src/read/decoders/youtube.ts +75 -0
  222. package/src/read/extract.ts +396 -0
  223. package/src/read/index.ts +78 -0
  224. package/src/read/peek.ts +175 -0
  225. package/src/read/types.ts +37 -0
  226. package/src/replay/engine.ts +559 -0
  227. package/src/replay/truncate.ts +116 -0
  228. package/src/serve.ts +189 -0
  229. package/src/skill/generator.ts +473 -0
  230. package/src/skill/importer.ts +107 -0
  231. package/src/skill/search.ts +76 -0
  232. package/src/skill/signing.ts +36 -0
  233. package/src/skill/ssrf.ts +238 -0
  234. package/src/skill/store.ts +107 -0
  235. package/src/stats/report.ts +208 -0
  236. package/src/types.ts +233 -0
@@ -0,0 +1,320 @@
1
+ // src/read/extract.ts
2
+ // ---- HTML entity decoding ----
3
/**
 * Named entities (plus the common `&#39;`) recognised by decodeEntities().
 * Keys are the complete entity text, including the leading `&` and trailing `;`.
 */
const ENTITY_MAP = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&apos;': "'",
    '&nbsp;': ' ',
};
/**
 * Decode HTML character references in `text`.
 *
 * Handles the named entities listed in ENTITY_MAP and numeric references in
 * decimal (`&#8217;`) or hexadecimal (`&#x2019;`) form. Decoding is done in a
 * single pass, so `&amp;lt;` becomes the literal text `&lt;` rather than `<`.
 * References that are unknown or do not denote a valid code point are left
 * untouched.
 *
 * @param {string} text Text that may contain HTML entities.
 * @returns {string} The text with recognised entities replaced.
 */
function decodeEntities(text) {
    return text.replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|amp|lt|gt|quot|apos|nbsp);/g, (m, dec, hex) => {
        const named = ENTITY_MAP[m];
        if (named !== undefined)
            return named;
        let codePoint = NaN;
        if (dec !== undefined)
            codePoint = parseInt(dec, 10);
        else if (hex !== undefined)
            codePoint = parseInt(hex, 16);
        // Reject NUL, out-of-range values and lone surrogates: String.fromCodePoint
        // would throw or produce ill-formed text for them.
        const isValid = codePoint > 0
            && codePoint <= 0x10ffff
            && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
        if (!isValid)
            return m;
        // Keep numeric no-break space consistent with the `&nbsp;` mapping above.
        if (codePoint === 0xa0)
            return ' ';
        return String.fromCodePoint(codePoint);
    });
}
15
+ // ---- parseHead ----
16
/**
 * Return the decoded `content` attribute of the first `<meta>` tag whose
 * `attrName` attribute equals `attrValue` (both compared case-insensitively).
 *
 * Attributes may appear in any order, may be separated by other attributes,
 * and may use double quotes, single quotes or no quotes. A value is read up to
 * its own closing quote, so `content="It's here"` is returned in full.
 *
 * @param {string} html Document markup to search.
 * @param {string} attrName Selector attribute, e.g. `property` or `name`.
 * @param {string} attrValue Required value of that attribute, e.g. `og:title`.
 * @returns {string | null} Decoded content, or null when no matching tag with a
 *   `content` attribute exists.
 */
function extractMetaContent(html, attrName, attrValue) {
    const wantedName = attrName.toLowerCase();
    const wantedValue = attrValue.toLowerCase();
    // A <meta ...> tag; quoted attribute values may themselves contain '>'.
    const metaTagRe = /<meta\s(?:[^>"']|"[^"]*"|'[^']*')*>/gi;
    // name = "double" | 'single' | bare
    const attrRe = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
    for (const tagMatch of html.matchAll(metaTagRe)) {
        let selectorMatches = false;
        let content;
        for (const attr of tagMatch[0].matchAll(attrRe)) {
            const name = attr[1].toLowerCase();
            const value = attr[2] ?? attr[3] ?? attr[4] ?? '';
            if (name === wantedName && value.toLowerCase() === wantedValue) {
                selectorMatches = true;
            }
            else if (name === 'content' && content === undefined) {
                content = value;
            }
        }
        if (selectorMatches && content !== undefined)
            return decodeEntities(content);
    }
    return null;
}
30
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded in a RegExp source and match `s` literally.
 *
 * @param {string} s Literal text.
 * @returns {string} Regex-safe version of `s`.
 */
function escapeRegex(s) {
    const metaChars = /[.*+?^${}()|[\]\\]/g;
    return s.replace(metaChars, (ch) => `\\${ch}`);
}
33
/**
 * Pull page metadata out of raw HTML: the <title>, the canonical link,
 * several Open Graph properties, the author and the article publish time.
 * Every field is a decoded string, or null when the tag is not found.
 *
 * @param {string} html Full document markup.
 * @returns Object with title, og* fields, canonical, author and publishedTime.
 */
export function parseHead(html) {
    const property = (value) => extractMetaContent(html, 'property', value);
    // <title>: inner text is trimmed before entity decoding.
    const titleHit = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    let title = null;
    if (titleHit) {
        title = decodeEntities(titleHit[1].trim());
    }
    // Canonical link: rel-before-href is tried first, then href-before-rel.
    const relFirst = /<link\s+[^>]*rel=["']canonical["'][^>]*href=["']([^"']*)["'][^>]*\/?>/i;
    const hrefFirst = /<link\s+[^>]*href=["']([^"']*)["'][^>]*rel=["']canonical["'][^>]*\/?>/i;
    const canonicalHit = html.match(relFirst) ?? html.match(hrefFirst);
    let canonical = null;
    if (canonicalHit) {
        canonical = decodeEntities(canonicalHit[1]);
    }
    return {
        title,
        ogTitle: property('og:title'),
        ogDescription: property('og:description'),
        ogImage: property('og:image'),
        ogType: property('og:type'),
        ogSiteName: property('og:site_name'),
        canonical,
        author: extractMetaContent(html, 'name', 'author'),
        publishedTime: property('article:published_time'),
    };
}
53
+ // ---- extractContent ----
54
/** Element names that are dropped together with everything nested inside them. */
const NOISE_TAGS = [
    'script', 'style', 'noscript', 'svg', 'iframe',
    'nav', 'header', 'footer', 'aside',
];
/** Substrings searched for in the raw HTML as hints of a client-rendered (SPA) shell. */
const SPA_MARKERS = [
    '<div id="root"', '<div id="app"', '<div id="__next"',
    'bundle.js', 'main.js', 'app.js',
    '__NEXT_DATA__', 'window.__INITIAL_STATE__', 'window.__NUXT__',
];
68
/**
 * Remove the given elements from `html`.
 *
 * For each tag name, paired elements (`<tag ...> ... </tag>`) are removed
 * together with their content, then any remaining opening or self-closing
 * tag of that name is removed. Matching is case-insensitive and anchored on
 * the full tag name, so stripping `nav` leaves `<nav-menu>` or `<navigation>`
 * alone.
 *
 * @param {string} html Markup to clean.
 * @param {string[]} tags Element names to remove.
 * @returns {string} The markup without those elements.
 */
function stripTags(html, tags) {
    let result = html;
    for (const tag of tags) {
        // Tag names are plain words in practice; escaping keeps odd input from
        // being interpreted as regex syntax.
        const name = tag.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // Non-greedy match up to the first closing tag; [\s\S] spans newlines.
        const paired = new RegExp(`<${name}[\\s>][\\s\\S]*?<\\/${name}>`, 'gi');
        result = result.replace(paired, '');
        // Left-over opening / self-closing tags (e.g. <iframe ... />). The
        // lookahead requires the name to end here.
        const lone = new RegExp(`<${name}(?=[\\s/>])[^>]*>`, 'gi');
        result = result.replace(lone, '');
    }
    return result;
}
80
/**
 * Pick the part of the document most likely to hold the main content.
 *
 * Candidates are tried in priority order and the first hit wins:
 *   1. `<article>`
 *   2. `<main>`
 *   3. the element carrying `role="main"`
 *   4. an element whose class contains `post-content`, `article-body`
 *      or `entry-content`
 *   5. the element with `id="content"`
 *   6. `<body>`
 * If nothing matches, the input is returned unchanged.
 *
 * @param {string} html Document markup.
 * @returns {string} Inner HTML of the chosen content root.
 */
function findContentRoot(html) {
    const article = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
    if (article)
        return article[1];
    const main = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
    if (main)
        return main[1];
    // role="main": resolve the matching close tag with nesting taken into
    // account, so the content is not cut at the first nested </div> and the
    // container does not have to be a <div>.
    const roleMain = html.match(/<[^>]+role=["']main["']/i);
    if (roleMain) {
        // Point at the end of the attribute; extractTagContent walks back from
        // there to the '<' of the tag that owns it.
        const attrIndex = roleMain.index + roleMain[0].length - 1;
        const balanced = extractTagContent(html, attrIndex);
        if (balanced !== null)
            return balanced;
        // Unbalanced markup: fall back to "everything up to the next </div>".
        const upToDiv = html.match(/<[^>]+role=["']main["'][^>]*>([\s\S]*?)<\/div>/i);
        if (upToDiv)
            return upToDiv[1];
    }
    // Class-based selectors
    const classPatterns = [
        /class=["'][^"']*\bpost-content\b/i,
        /class=["'][^"']*\barticle-body\b/i,
        /class=["'][^"']*\bentry-content\b/i,
    ];
    for (const pattern of classPatterns) {
        const hit = html.match(pattern);
        if (hit) {
            const extracted = extractTagContent(html, hit.index);
            // An empty or unresolvable container is skipped in favour of the next rule.
            if (extracted)
                return extracted;
        }
    }
    // id="content"
    const contentId = html.match(/id=["']content["']/i);
    if (contentId) {
        const extracted = extractTagContent(html, contentId.index);
        if (extracted)
            return extracted;
    }
    // Fallback: <body>
    const body = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (body)
        return body[1];
    return html;
}
121
/**
 * Given the index of an attribute inside some opening tag, return the inner
 * HTML of that element.
 *
 * The tag start is found by scanning left for `<`; the matching close tag is
 * found by counting same-named opening and closing tags (case-insensitive).
 *
 * @param {string} html Markup to search.
 * @param {number} attrIndex Index somewhere inside the opening tag.
 * @returns {string | null} Inner HTML, or null when no tag name can be read at
 *   the tag start, the opening tag never ends, or the close tag is missing.
 */
function extractTagContent(html, attrIndex) {
    // Scan left until the '<' of the owning tag (or the start of the string).
    let openAt = attrIndex;
    while (!(openAt <= 0 || html[openAt] === '<')) {
        openAt -= 1;
    }
    const nameHit = /^<(\w+)/.exec(html.slice(openAt));
    if (nameHit === null) {
        return null;
    }
    const tagName = nameHit[1];
    // Content starts right after the first '>' following the tag start.
    const bodyStart = html.indexOf('>', openAt) + 1;
    if (bodyStart === 0) {
        return null;
    }
    // One scanner for both token kinds; they are visited in document order.
    const tokenRe = new RegExp(`<${tagName}[\\s>]|</${tagName}>`, 'gi');
    tokenRe.lastIndex = bodyStart;
    let openCount = 1;
    for (let token = tokenRe.exec(html); token !== null; token = tokenRe.exec(html)) {
        const isClose = token[0].charAt(1) === '/';
        if (!isClose) {
            openCount += 1;
            continue;
        }
        openCount -= 1;
        if (openCount === 0) {
            return html.slice(bodyStart, token.index);
        }
    }
    return null;
}
162
/**
 * Convert an HTML fragment to Markdown with a fixed sequence of regex rewrites:
 * comments, headings, blockquotes, code, tables, images, links, bold, italic,
 * lists, paragraphs and line breaks, followed by tag stripping, entity
 * decoding and whitespace clean-up.
 *
 * Code taken from `<pre>` / `<code>` is decoded once and then set aside behind
 * placeholders until the very end. Without that, the later passes would
 * decode it a second time and strip anything that looks like a tag, so an
 * HTML sample such as `&lt;div&gt;` would vanish from the output.
 *
 * @param {string} html Fragment to convert.
 * @param {{text: string, href: string}[]} links Output: every converted link is appended.
 * @param {{alt: string, src: string}[]} images Output: every converted image is appended.
 * @returns {string} Markdown text, trimmed, with at most one blank line in a row
 *   outside code.
 */
function htmlToMarkdown(html, links, images) {
    // Finished code snippets, referenced from the text as NUL<index>NUL.
    const protectedCode = [];
    const protect = (code) => `\u0000${protectedCode.push(code) - 1}\u0000`;
    // Leading whitespace is required so `data-src=` / `data-alt=` never match.
    const srcRe = /\ssrc=(?:"([^"]*)"|'([^']*)')/i;
    const altRe = /\salt=(?:"([^"]*)"|'([^']*)')/i;
    const readAttr = (tag, re) => {
        const m = tag.match(re);
        return m ? decodeEntities(m[1] ?? m[2]) : '';
    };
    // NUL is not valid in HTML text; dropping it keeps placeholders unforgeable.
    let md = html.replace(/\u0000/g, '');
    // Remove HTML comments
    md = md.replace(/<!--[\s\S]*?-->/g, '');
    // Convert headings
    for (let level = 1; level <= 6; level++) {
        const prefix = '#'.repeat(level);
        const re = new RegExp(`<h${level}[^>]*>([\\s\\S]*?)<\\/h${level}>`, 'gi');
        md = md.replace(re, (_m, content) => {
            const text = stripAllTags(content).trim();
            return `\n\n${prefix} ${text}\n\n`;
        });
    }
    // Convert blockquotes (before paragraphs)
    md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m, content) => {
        const text = stripAllTags(content).trim();
        const quoted = text.split('\n').map((line) => `> ${line}`).join('\n');
        return `\n\n${quoted}\n\n`;
    });
    // Convert code blocks: <pre><code>...</code></pre>. Markup inside the
    // code (e.g. highlighting spans) is dropped before decoding.
    md = md.replace(/<pre[^>]*>\s*<code[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, content) => {
        const code = decodeEntities(stripAllTags(content).trim());
        return `\n\n\`\`\`\n${protect(code)}\n\`\`\`\n\n`;
    });
    // Convert standalone <pre> (without <code>)
    md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_m, content) => {
        const code = decodeEntities(stripAllTags(content).trim());
        return `\n\n\`\`\`\n${protect(code)}\n\`\`\`\n\n`;
    });
    // Convert inline code
    md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (_m, content) => {
        return `\`${protect(decodeEntities(stripAllTags(content)))}\``;
    });
    // Convert tables
    md = md.replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (_m, tableContent) => {
        return convertTable(tableContent);
    });
    // Convert images (before stripping tags, so we can extract src/alt)
    md = md.replace(/<img\s+[^>]*>/gi, (tag) => {
        const src = readAttr(tag, srcRe);
        const alt = readAttr(tag, altRe);
        if (src) {
            images.push({ alt, src });
            return `![${alt}](${src})`;
        }
        return '';
    });
    // Convert links
    md = md.replace(/<a\s+[^>]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi, (_m, href, content) => {
        const text = stripAllTags(content).trim();
        const decodedHref = decodeEntities(href);
        if (text && decodedHref) {
            links.push({ text, href: decodedHref });
            return `[${text}](${decodedHref})`;
        }
        return text;
    });
    // Convert bold. The name must be followed by whitespace or '>' so that
    // <br>, <body> and <button> are not mistaken for <b>.
    md = md.replace(/<(?:strong|b)(?:\s[^>]*)?>([\s\S]*?)<\/(?:strong|b)>/gi, (_m, content) => {
        return `**${stripAllTags(content)}**`;
    });
    // Convert italic (same boundary rule keeps <img>, <iframe>, <input> out)
    md = md.replace(/<(?:em|i)(?:\s[^>]*)?>([\s\S]*?)<\/(?:em|i)>/gi, (_m, content) => {
        return `*${stripAllTags(content)}*`;
    });
    // Convert ordered lists
    md = md.replace(/<ol(?:\s[^>]*)?>([\s\S]*?)<\/ol>/gi, (_m, listContent) => {
        let counter = 0;
        const items = listContent.replace(/<li(?:\s[^>]*)?>([\s\S]*?)<\/li>/gi, (_lm, item) => {
            counter++;
            return `${counter}. ${stripAllTags(item).trim()}\n`;
        });
        return `\n\n${stripAllTags(items).trim()}\n\n`;
    });
    // Convert unordered lists
    md = md.replace(/<ul(?:\s[^>]*)?>([\s\S]*?)<\/ul>/gi, (_m, listContent) => {
        const items = listContent.replace(/<li(?:\s[^>]*)?>([\s\S]*?)<\/li>/gi, (_lm, item) => {
            return `- ${stripAllTags(item).trim()}\n`;
        });
        return `\n\n${stripAllTags(items).trim()}\n\n`;
    });
    // Convert paragraphs (boundary keeps <pre>, <picture>, <path> out)
    md = md.replace(/<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/gi, (_m, content) => {
        return `\n\n${content.trim()}\n\n`;
    });
    // Convert <br> tags
    md = md.replace(/<br\s*\/?>/gi, '\n');
    // Strip remaining HTML tags
    md = stripAllTags(md);
    // Decode entities
    md = decodeEntities(md);
    // Collapse whitespace: no more than 2 consecutive newlines
    md = md.replace(/\n{3,}/g, '\n\n');
    // Trim leading/trailing whitespace
    md = md.trim();
    // Put the protected code back. A function replacer is used so `$` in the
    // code is never treated as a replacement pattern.
    md = md.replace(/\u0000(\d+)\u0000/g, (token, index) => protectedCode[Number(index)] ?? token);
    return md;
}
261
/**
 * Delete everything that looks like a tag (`<` ... `>`) and keep the text
 * in between.
 *
 * @param {string} html Markup.
 * @returns {string} The markup with all `<...>` spans removed.
 */
function stripAllTags(html) {
    const textPieces = html.split(/<[^>]*>/);
    return textPieces.join('');
}
264
/**
 * Render the inner HTML of a `<table>` as a Markdown pipe table.
 *
 * The first row becomes the header. Shorter rows are padded with empty cells
 * to the widest row. Cell text has its tags removed, runs of whitespace
 * (including newlines) collapsed to one space, and `|` escaped as `\|`, so a
 * cell can never break the row it belongs to.
 *
 * @param {string} tableHtml Markup between `<table ...>` and `</table>`.
 * @returns {string} The table surrounded by blank lines, or '' when no cells are found.
 */
function convertTable(tableHtml) {
    const rows = [];
    for (const rowHtml of tableHtml.match(/<tr[^>]*>[\s\S]*?<\/tr>/gi) ?? []) {
        const cellHtml = rowHtml.match(/<(?:td|th)[^>]*>[\s\S]*?<\/(?:td|th)>/gi) ?? [];
        const cells = cellHtml.map((cell) => {
            const inner = cell.replace(/<\/?(?:td|th)[^>]*>/gi, '');
            return stripAllTags(inner)
                .replace(/\s+/g, ' ')
                .trim()
                .replace(/\|/g, '\\|');
        });
        if (cells.length > 0)
            rows.push(cells);
    }
    if (rows.length === 0)
        return '';
    // Normalize column count
    const columnCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
    const toLine = (cells) => {
        const padded = cells.concat(new Array(columnCount - cells.length).fill(''));
        return '| ' + padded.join(' | ') + ' |';
    };
    // Build markdown table: header, separator, then the remaining rows.
    const [header, ...bodyRows] = rows;
    const lines = [
        toLine(header),
        toLine(new Array(columnCount).fill('---')),
        ...bodyRows.map(toLine),
    ];
    return '\n\n' + lines.join('\n') + '\n\n';
}
297
/**
 * Plain-text view of a fragment: tags removed, every run of whitespace
 * reduced to a single space, ends trimmed.
 *
 * @param {string} html Markup.
 * @returns {string} Normalised text.
 */
function getTextContent(html) {
    const withoutTags = stripAllTags(html);
    const singleSpaced = withoutTags.replace(/\s+/g, ' ');
    return singleSpaced.trim();
}
300
/**
 * Generic HTML-to-Markdown extraction.
 *
 * Drops the NOISE_TAGS elements, selects the content root, converts it to
 * Markdown while collecting links and images, and flags the page as an SPA
 * shell when the root has fewer than 200 characters of text and the raw HTML
 * contains one of the SPA_MARKERS.
 *
 * @param {string} html Full document markup.
 * @returns Object with `content`, `links`, `images` and `isSpaShell`.
 */
export function extractContent(html) {
    const root = findContentRoot(stripTags(html, NOISE_TAGS));
    const result = {
        content: '',
        links: [],
        images: [],
        isSpaShell: false,
    };
    // htmlToMarkdown appends to the two arrays as it converts.
    result.content = htmlToMarkdown(root, result.links, result.images);
    // Markers are looked up in the original, unstripped HTML.
    const visibleTextLength = getTextContent(root).length;
    const markerPresent = SPA_MARKERS.some((marker) => html.includes(marker));
    result.isSpaShell = markerPresent && visibleTextLength < 200;
    return result;
}
320
+ //# sourceMappingURL=extract.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract.js","sourceRoot":"","sources":["../../src/read/extract.ts"],"names":[],"mappings":"AAAA,sBAAsB;AAqBtB,iCAAiC;AAEjC,MAAM,UAAU,GAA2B;IACzC,OAAO,EAAE,GAAG;IACZ,MAAM,EAAE,GAAG;IACX,MAAM,EAAE,GAAG;IACX,QAAQ,EAAE,GAAG;IACb,OAAO,EAAE,GAAG;IACZ,QAAQ,EAAE,GAAG;IACb,QAAQ,EAAE,GAAG;CACd,CAAC;AAEF,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,OAAO,CAAC,qCAAqC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;AACxF,CAAC;AAED,sBAAsB;AAEtB,SAAS,kBAAkB,CAAC,IAAY,EAAE,QAAgB,EAAE,SAAiB;IAC3E,4EAA4E;IAC5E,mDAAmD;IACnD,MAAM,QAAQ,GAAG;QACf,IAAI,MAAM,CAAC,YAAY,QAAQ,QAAQ,WAAW,CAAC,SAAS,CAAC,kCAAkC,EAAE,GAAG,CAAC;QACrG,IAAI,MAAM,CAAC,wCAAwC,QAAQ,QAAQ,WAAW,CAAC,SAAS,CAAC,MAAM,EAAE,GAAG,CAAC;KACtG,CAAC;IACF,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC;YAAE,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,WAAW,CAAC,CAAS;IAC5B,OAAO,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,kBAAkB;IAClB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IAClE,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEvE,oBAAoB;IACpB,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,wEAAwE,CAAC;WACtG,IAAI,CAAC,KAAK,CAAC,wEAAwE,CAAC,CAAC;IAC1F,MAAM,SAAS,GAAG,cAAc,CAAC,CAAC,CAAC,cAAc,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAE5E,OAAO;QACL,KAAK;QACL,OAAO,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,UAAU,CAAC;QACzD,aAAa,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,gBAAgB,CAAC;QACrE,OAAO,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,UAAU,CAAC;QACzD,MAAM,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,SAAS,CAAC;QACvD,UAAU,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,cAAc,CAAC;QAChE,SAAS;QACT,MAAM,EAAE,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC;QAClD,aAAa,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,wBAAwB,CAAC;KAC9E,CAAC;AACJ,CAAC;AAED,2BAA2B;AAE3B,uEAAuE;AACvE,MAAM,UAAU,GAAG,CAAC,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,KA
AK,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;AAExG,wBAAwB;AACxB,MAAM,WAAW,GAAG;IAClB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;IAClB,WAAW;IACX,SAAS;IACT,QAAQ;IACR,eAAe;IACf,0BAA0B;IAC1B,iBAAiB;CAClB,CAAC;AAEF,SAAS,SAAS,CAAC,IAAY,EAAE,IAAc;IAC7C,IAAI,MAAM,GAAG,IAAI,CAAC;IAClB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,uDAAuD;QACvD,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,uBAAuB,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAClE,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;QAChC,yDAAyD;QACzD,MAAM,SAAS,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,UAAU,EAAE,IAAI,CAAC,CAAC;QACtD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,eAAe,CAAC,IAAY;IACnC,kCAAkC;IAClC,MAAM,SAAS,GAA0B;QACvC,EAAE,EAAE,EAAE,sCAAsC,EAAE;QAC9C,EAAE,EAAE,EAAE,gCAAgC,EAAE;QACxC,EAAE,EAAE,EAAE,iDAAiD,EAAE;KAC1D,CAAC;IAEF,KAAK,MAAM,EAAE,EAAE,EAAE,IAAI,SAAS,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC;YAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;IAED,wBAAwB;IACxB,MAAM,aAAa,GAAG;QACpB,mCAAmC;QACnC,mCAAmC;QACnC,oCAAoC;KACrC,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC,EAAE,CAAC;YACN,iDAAiD;YACjD,MAAM,GAAG,GAAG,CAAC,CAAC,KAAM,CAAC;YACrB,MAAM,SAAS,GAAG,iBAAiB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAC/C,IAAI,SAAS;gBAAE,OAAO,SAAS,CAAC;QAClC,CAAC;IACH,CAAC;IAED,eAAe;IACf,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACpD,IAAI,SAAS,EAAE,CAAC;QACd,MAAM,SAAS,GAAG,iBAAiB,CAAC,IAAI,EAAE,SAAS,CAAC,KAAM,CAAC,CAAC;QAC5D,IAAI,SAAS;YAAE,OAAO,SAAS,CAAC;IAClC,CAAC;IAED,mBAAmB;IACnB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC/D,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC;IAEnC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,SAAiB;IACxD,kDAAkD;IAClD,IAAI,QAAQ,GAAG,SAAS,CAAC;IACzB,OAAO,QAAQ,GAAG,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,GAAG;QAAE,QAAQ,EAAE,CAAC;IAE1D,oBAAoB;IACpB,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAC3D,IAAI,
CAAC,YAAY;QAAE,OAAO,IAAI,CAAC;IAE/B,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;IAEhC,iDAAiD;IACjD,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,MAAM,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,OAAO,QAAQ,EAAE,IAAI,CAAC,CAAC;IACrD,MAAM,OAAO,GAAG,IAAI,MAAM,CAAC,KAAK,OAAO,GAAG,EAAE,IAAI,CAAC,CAAC;IAElD,+DAA+D;IAC/D,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAC/C,IAAI,UAAU,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAEnC,IAAI,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;IACzB,MAAM,YAAY,GAAG,GAAG,CAAC;IAEzB,OAAO,KAAK,GAAG,CAAC,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACtC,MAAM,CAAC,SAAS,GAAG,GAAG,CAAC;QACvB,OAAO,CAAC,SAAS,GAAG,GAAG,CAAC;QAExB,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,CAAC,SAAS;YAAE,MAAM,CAAC,qBAAqB;QAE5C,IAAI,QAAQ,IAAI,QAAQ,CAAC,KAAK,GAAG,SAAS,CAAC,KAAK,EAAE,CAAC;YACjD,KAAK,EAAE,CAAC;YACR,GAAG,GAAG,QAAQ,CAAC,KAAK,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC5C,CAAC;aAAM,CAAC;YACN,KAAK,EAAE,CAAC;YACR,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;gBAChB,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC;YACnD,CAAC;YACD,GAAG,GAAG,SAAS,CAAC,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,cAAc,CACrB,IAAY,EACZ,KAA4C,EAC5C,MAA2C;IAE3C,IAAI,EAAE,GAAG,IAAI,CAAC;IAEd,uBAAuB;IACvB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAExC,mBAAmB;IACnB,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,KAAK,EAAE,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACjC,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,KAAK,KAAK,0BAA0B,KAAK,GAAG,EAAE,IAAI,CAAC,CAAC;QAC1E,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;YAClC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;YAC1C,OAAO,OAAO,MAAM,IAAI,IAAI,MAAM,CAAC;QACrC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,0CAA0C;IAC1C,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,6CAA6C,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QAC7E,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAA
C,CAAS,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxE,OAAO,OAAO,MAAM,MAAM,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,mDAAmD;IACnD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wDAAwD,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACxF,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,OAAO,eAAe,OAAO,cAAc,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,4CAA4C;IAC5C,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,+BAA+B,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QAC/D,MAAM,OAAO,GAAG,cAAc,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7D,OAAO,eAAe,OAAO,cAAc,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,sBAAsB;IACtB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iCAAiC,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACjE,OAAO,KAAK,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC;IAC1C,CAAC,CAAC,CAAC;IAEH,iBAAiB;IACjB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,mCAAmC,EAAE,CAAC,EAAE,EAAE,YAAY,EAAE,EAAE;QACxE,OAAO,YAAY,CAAC,YAAY,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;IAEH,oEAAoE;IACpE,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iBAAiB,EAAE,CAAC,GAAG,EAAE,EAAE;QACzC,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACpD,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACpD,MAAM,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxD,MAAM,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxD,IAAI,GAAG,EAAE,CAAC;YACR,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC;YAC1B,OAAO,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;QAC7B,CAAC;QACD,OAAO,EAAE,CAAC;IACZ,CAAC,CAAC,CAAC;IAEH,gBAAgB;IAChB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wDAAwD,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE;QAC9F,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,WAAW,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACzC,IAAI,IAAI,IAAI,WAAW,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;YACxC,OAAO,IAAI,IAAI,KAAK,WAAW,GAAG,CAAC;QACrC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;IAEH,eAAe;IACf,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iDAAiD,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACjF,OAAO,KAAK,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,iBAAiB;IAC
jB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,sDAAsD,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACtF,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,GAAG,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,wBAAwB;IACxB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,EAAE,EAAE,WAAW,EAAE,EAAE;QACjE,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,MAAM,KAAK,GAAG,WAAW,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,GAAW,EAAE,IAAY,EAAE,EAAE;YAC7F,OAAO,EAAE,CAAC;YACV,OAAO,GAAG,OAAO,KAAK,YAAY,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC;QACtD,CAAC,CAAC,CAAC;QACH,OAAO,OAAO,YAAY,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC;IACjD,CAAC,CAAC,CAAC;IAEH,0BAA0B;IAC1B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,EAAE,EAAE,WAAW,EAAE,EAAE;QACjE,MAAM,KAAK,GAAG,WAAW,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,GAAW,EAAE,IAAY,EAAE,EAAE;YAC7F,OAAO,KAAK,YAAY,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC;QAC5C,CAAC,CAAC,CAAC;QACH,OAAO,OAAO,YAAY,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC;IACjD,CAAC,CAAC,CAAC;IAEH,qBAAqB;IACrB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,2BAA2B,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QAC3D,OAAO,OAAO,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,oBAAoB;IACpB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;IAEtC,4BAA4B;IAC5B,EAAE,GAAG,YAAY,CAAC,EAAE,CAAC,CAAC;IAEtB,kBAAkB;IAClB,EAAE,GAAG,cAAc,CAAC,EAAE,CAAC,CAAC;IAExB,2DAA2D;IAC3D,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEnC,mCAAmC;IACnC,EAAE,GAAG,EAAE,CAAC,IAAI,EAAE,CAAC;IAEf,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,YAAY,CAAC,IAAY;IAChC,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,YAAY,CAAC,SAAiB;IACrC,MAAM,IAAI,GAAe,EAAE,CAAC;IAE5B,eAAe;IACf,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,2BAA2B,CAAC,IAAI,EAAE,CAAC;IACtE,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,GAAG,CAAC,KAAK,CAAC,yCAAyC,CAAC,IAAI,EAAE,CAAC;QAC/E,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAC;YAC1D,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC3C,CAAC;QACD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,C
AAC;IACzC,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,yBAAyB;IACzB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;IACvD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAChC,OAAO,CAAC,CAAC,MAAM,GAAG,OAAO;YAAE,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACtC,OAAO,CAAC,CAAC;IACX,CAAC,CAAC,CAAC;IAEH,uBAAuB;IACvB,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC7B,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;IAC7C,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;IAE9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;IACtD,CAAC;IAED,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;AAC5C,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,YAAY,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACxD,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,KAAK,GAA0C,EAAE,CAAC;IACxD,MAAM,MAAM,GAAwC,EAAE,CAAC;IAEvD,yBAAyB;IACzB,MAAM,OAAO,GAAG,SAAS,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IAE5C,oBAAoB;IACpB,MAAM,WAAW,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAE7C,sBAAsB;IACtB,MAAM,OAAO,GAAG,cAAc,CAAC,WAAW,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC;IAE3D,mBAAmB;IACnB,MAAM,WAAW,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;IAChD,MAAM,YAAY,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC;IACzE,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC;IAE5D,OAAO;QACL,OAAO;QACP,KAAK;QACL,MAAM;QACN,UAAU;KACX,CAAC;AACJ,CAAC"}
@@ -0,0 +1,14 @@
1
+ export { peek } from './peek.js';
2
+ export type { PeekOptions } from './peek.js';
3
+ export type { PeekResult, ReadResult, Decoder } from './types.js';
4
+ import type { ReadResult } from './types.js';
5
+ export interface ReadOptions { // Options accepted by read().
6
+ skipSsrf?: boolean; // Forwarded unchanged to the site decoder and to safeFetch. NOTE(review): presumably disables an SSRF guard in the fetch layer - confirm in discovery/fetch.
7
+ maxBytes?: number; // When truthy, read() cuts `content` to this length with String.slice, so the limit counts UTF-16 code units rather than encoded bytes.
8
+ }
9
+ /**
10
+ * Universal content decoder. Routes to site-specific decoders for known sites
11
+ * (Reddit, YouTube, Wikipedia, HN), falls back to generic HTML extraction.
12
+ * Resolves to null when the generic fetch yields no response or a status other than 200; an empty extraction still returns a result (metadata.source 'og-tags-only').
13
+ */
14
+ export declare function read(url: string, options?: ReadOptions): Promise<ReadResult | null>;
@@ -0,0 +1,66 @@
1
+ // src/read/index.ts
2
+ export { peek } from './peek.js';
3
+ import { safeFetch } from '../discovery/fetch.js';
4
+ import { findDecoder } from './decoders/index.js';
5
+ import { parseHead, extractContent } from './extract.js';
6
+ /**
7
+ * Universal content decoder. Routes to site-specific decoders for known sites
8
+ * (Reddit, YouTube, Wikipedia, HN), falls back to generic HTML extraction.
9
+ * Returns null if content cannot be extracted.
10
+ */
11
+ export async function read(url, options = {}) {
12
+ // Try site-specific decoder first
13
+ const decoder = findDecoder(url);
14
+ if (decoder) {
15
+ const result = await decoder.decode(url, { skipSsrf: options.skipSsrf });
16
+ if (result) {
17
+ if (options.maxBytes && result.content.length > options.maxBytes) {
18
+ result.content = result.content.slice(0, options.maxBytes);
19
+ result.cost.tokens = Math.ceil(result.content.length / 4);
20
+ }
21
+ return result;
22
+ }
23
+ // Decoder returned null -- fall through to generic
24
+ }
25
+ // Generic pipeline: fetch HTML -> parse head -> extract body
26
+ const fetchResult = await safeFetch(url, { skipSsrf: options.skipSsrf });
27
+ if (!fetchResult || fetchResult.status !== 200)
28
+ return null;
29
+ const html = fetchResult.body;
30
+ const head = parseHead(html);
31
+ const body = extractContent(html);
32
+ // Determine source
33
+ let source;
34
+ if (body.isSpaShell) {
35
+ source = 'spa-shell';
36
+ }
37
+ else if (body.content.trim().length === 0) {
38
+ source = 'og-tags-only';
39
+ }
40
+ else {
41
+ source = 'readability';
42
+ }
43
+ let content = body.content;
44
+ if (options.maxBytes && content.length > options.maxBytes) {
45
+ content = content.slice(0, options.maxBytes);
46
+ }
47
+ const title = head.ogTitle || head.title || null;
48
+ return {
49
+ url,
50
+ title,
51
+ author: head.author || null,
52
+ description: head.ogDescription || null,
53
+ content,
54
+ links: body.links,
55
+ images: body.images,
56
+ metadata: {
57
+ type: head.ogType || 'unknown',
58
+ publishedAt: head.publishedTime || null,
59
+ source,
60
+ canonical: head.canonical || null,
61
+ siteName: head.ogSiteName || null,
62
+ },
63
+ cost: { tokens: Math.ceil(content.length / 4) },
64
+ };
65
+ }
66
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/read/index.ts"],"names":[],"mappings":"AAAA,oBAAoB;AACpB,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAKjC,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAOzD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,IAAI,CAAC,GAAW,EAAE,UAAuB,EAAE;IAC/D,kCAAkC;IAClC,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IACjC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,MAAM,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACzE,IAAI,MAAM,EAAE,CAAC;YACX,IAAI,OAAO,CAAC,QAAQ,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;gBACjE,MAAM,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;gBAC3D,MAAM,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAC5D,CAAC;YACD,OAAO,MAAM,CAAC;QAChB,CAAC;QACD,mDAAmD;IACrD,CAAC;IAED,6DAA6D;IAC7D,MAAM,WAAW,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;IACzE,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,GAAG;QAAE,OAAO,IAAI,CAAC;IAE5D,MAAM,IAAI,GAAG,WAAW,CAAC,IAAI,CAAC;IAC9B,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,mBAAmB;IACnB,IAAI,MAAc,CAAC;IACnB,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,MAAM,GAAG,WAAW,CAAC;IACvB,CAAC;SAAM,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,cAAc,CAAC;IAC1B,CAAC;SAAM,CAAC;QACN,MAAM,GAAG,aAAa,CAAC;IACzB,CAAC;IAED,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;IAC3B,IAAI,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC1D,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC/C,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC;IAEjD,OAAO;QACL,GAAG;QACH,KAAK;QACL,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,IAAI;QAC3B,WAAW,EAAE,IAAI,CAAC,aAAa,IAAI,IAAI;QACvC,OAAO;QACP,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,QAAQ,EAAE;YACR,IAAI,EAAE,IAAI,CAAC,MAAM,IAAI,SAAS;
YAC9B,WAAW,EAAE,IAAI,CAAC,aAAa,IAAI,IAAI;YACvC,MAAM;YACN,SAAS,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;YACjC,QAAQ,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;SAClC;QACD,IAAI,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE;KAChD,CAAC;AACJ,CAAC"}
@@ -0,0 +1,9 @@
1
+ import type { PeekResult } from './types.js';
2
+ export interface PeekOptions { // Options accepted by peek().
3
+ skipSsrf?: boolean; // Forwarded unchanged to safeFetch for the HEAD request and for the fallback GET request.
4
+ }
5
+ /**
6
+ * HTTP HEAD-only triage: checks accessibility, detects bot protection and frameworks.
7
+ * Falls back to GET if HEAD fails. When neither request yields a response, resolves with status 0, accessible false and recommendation 'blocked'.
8
+ */
9
+ export declare function peek(url: string, options?: PeekOptions): Promise<PeekResult>;
@@ -0,0 +1,137 @@
1
+ import { safeFetch } from '../discovery/fetch.js';
2
+ /**
3
+ * HTTP HEAD-only triage: checks accessibility, detects bot protection and frameworks.
4
+ * Falls back to GET if HEAD fails.
5
+ */
6
+ export async function peek(url, options = {}) {
7
+ const signals = [];
8
+ // Try HEAD first
9
+ let result = await safeFetch(url, {
10
+ method: 'HEAD',
11
+ skipSsrf: options.skipSsrf,
12
+ });
13
+ // Fall back to GET if HEAD fails (null = network/SSRF error)
14
+ if (!result) {
15
+ result = await safeFetch(url, {
16
+ method: 'GET',
17
+ skipSsrf: options.skipSsrf,
18
+ });
19
+ }
20
+ // Both HEAD and GET failed
21
+ if (!result) {
22
+ return {
23
+ url,
24
+ status: 0,
25
+ accessible: false,
26
+ contentType: null,
27
+ server: null,
28
+ framework: null,
29
+ botProtection: null,
30
+ signals: ['fetch failed'],
31
+ recommendation: 'blocked',
32
+ };
33
+ }
34
+ const { status, headers } = result;
35
+ // Extract basic metadata
36
+ const contentType = headers['content-type'] || null;
37
+ const server = headers['server'] || null;
38
+ // Detect bot protection
39
+ const botProtection = detectBotProtection(headers, signals);
40
+ // Detect framework
41
+ const framework = detectFramework(headers, signals);
42
+ // Determine accessibility and recommendation
43
+ const accessible = status >= 200 && status < 400 && !botProtection;
44
+ const recommendation = computeRecommendation(status, botProtection);
45
+ return {
46
+ url,
47
+ status,
48
+ accessible,
49
+ contentType,
50
+ server,
51
+ framework,
52
+ botProtection,
53
+ signals,
54
+ recommendation,
55
+ };
56
+ }
57
+ function detectBotProtection(headers, signals) {
58
+ // Cloudflare: cf-ray or cf-cache-status
59
+ if (headers['cf-ray']) {
60
+ signals.push('cf-ray header');
61
+ return 'cloudflare';
62
+ }
63
+ if (headers['cf-cache-status']) {
64
+ signals.push('cf-cache-status header');
65
+ return 'cloudflare';
66
+ }
67
+ // PerimeterX: x-px-* headers
68
+ for (const key of Object.keys(headers)) {
69
+ if (key.startsWith('x-px-')) {
70
+ signals.push(`${key} header`);
71
+ return 'perimeterx';
72
+ }
73
+ }
74
+ // DataDome: x-datadome* headers
75
+ for (const key of Object.keys(headers)) {
76
+ if (key.startsWith('x-datadome')) {
77
+ signals.push(`${key} header`);
78
+ return 'datadome';
79
+ }
80
+ }
81
+ return null;
82
+ }
83
+ function detectFramework(headers, signals) {
84
+ // Next.js: x-powered-by: Next.js
85
+ const poweredBy = headers['x-powered-by'];
86
+ if (poweredBy && /next\.js/i.test(poweredBy)) {
87
+ signals.push('x-powered-by: Next.js');
88
+ return 'next.js';
89
+ }
90
+ // Express: x-powered-by: Express
91
+ if (poweredBy && /express/i.test(poweredBy)) {
92
+ signals.push('x-powered-by: Express');
93
+ return 'express';
94
+ }
95
+ // PHP: x-powered-by: PHP/*
96
+ if (poweredBy && /php/i.test(poweredBy)) {
97
+ signals.push('x-powered-by: PHP');
98
+ return 'php';
99
+ }
100
+ // WordPress: link header containing api.w.org
101
+ const link = headers['link'];
102
+ if (link && link.includes('api.w.org')) {
103
+ signals.push('link: api.w.org');
104
+ return 'wordpress';
105
+ }
106
+ // Shopify: x-shopify-stage header
107
+ if (headers['x-shopify-stage']) {
108
+ signals.push('x-shopify-stage header');
109
+ return 'shopify';
110
+ }
111
+ // Drupal: x-drupal-* headers
112
+ for (const key of Object.keys(headers)) {
113
+ if (key.startsWith('x-drupal-')) {
114
+ signals.push(`${key} header`);
115
+ return 'drupal';
116
+ }
117
+ }
118
+ return null;
119
+ }
120
+ function computeRecommendation(status, botProtection) {
121
+ // Auth required
122
+ if (status === 401 || status === 407) {
123
+ return 'auth_required';
124
+ }
125
+ // Blocked: bot protection, 403, 429, or 5xx
126
+ if (botProtection) {
127
+ return 'blocked';
128
+ }
129
+ if (status === 403 || status === 429) {
130
+ return 'blocked';
131
+ }
132
+ if (status >= 500) {
133
+ return 'blocked';
134
+ }
135
+ return 'read';
136
+ }
137
+ //# sourceMappingURL=peek.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"peek.js","sourceRoot":"","sources":["../../src/read/peek.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAMlD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,IAAI,CAAC,GAAW,EAAE,UAAuB,EAAE;IAC/D,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,iBAAiB;IACjB,IAAI,MAAM,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE;QAChC,MAAM,EAAE,MAAM;QACd,QAAQ,EAAE,OAAO,CAAC,QAAQ;KAC3B,CAAC,CAAC;IAEH,6DAA6D;IAC7D,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE;YAC5B,MAAM,EAAE,KAAK;YACb,QAAQ,EAAE,OAAO,CAAC,QAAQ;SAC3B,CAAC,CAAC;IACL,CAAC;IAED,2BAA2B;IAC3B,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO;YACL,GAAG;YACH,MAAM,EAAE,CAAC;YACT,UAAU,EAAE,KAAK;YACjB,WAAW,EAAE,IAAI;YACjB,MAAM,EAAE,IAAI;YACZ,SAAS,EAAE,IAAI;YACf,aAAa,EAAE,IAAI;YACnB,OAAO,EAAE,CAAC,cAAc,CAAC;YACzB,cAAc,EAAE,SAAS;SAC1B,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC;IAEnC,yBAAyB;IACzB,MAAM,WAAW,GAAG,OAAO,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;IAEzC,wBAAwB;IACxB,MAAM,aAAa,GAAG,mBAAmB,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAE5D,mBAAmB;IACnB,MAAM,SAAS,GAAG,eAAe,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEpD,6CAA6C;IAC7C,MAAM,UAAU,GAAG,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;IACnE,MAAM,cAAc,GAAG,qBAAqB,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IAEpE,OAAO;QACL,GAAG;QACH,MAAM;QACN,UAAU;QACV,WAAW;QACX,MAAM;QACN,SAAS;QACT,aAAa;QACb,OAAO;QACP,cAAc;KACf,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB,CAC1B,OAA+B,EAC/B,OAAiB;IAEjB,wCAAwC;IACxC,IAAI,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QACtB,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC9B,OAAO,YAAY,CAAC;IACtB,CAAC;IACD,IAAI,OAAO,CAAC,iBAAiB,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACvC,OAAO,YAAY,CAAC;IACtB,CAAC;IAED,6BAA6B;IAC7B,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACvC,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;YAC9B,OAAO,YAAY,CAAC;QACtB,CAAC;IACH,CAAC;IAED,gCAAgC;IAChC,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACvC,IAAI,GAAG,CAAC,UAAU,CAAC,YAAY,CA
AC,EAAE,CAAC;YACjC,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;YAC9B,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CACtB,OAA+B,EAC/B,OAAiB;IAEjB,iCAAiC;IACjC,MAAM,SAAS,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAC1C,IAAI,SAAS,IAAI,WAAW,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QAC7C,OAAO,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;QACtC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,iCAAiC;IACjC,IAAI,SAAS,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5C,OAAO,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;QACtC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,2BAA2B;IAC3B,IAAI,SAAS,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAClC,OAAO,KAAK,CAAC;IACf,CAAC;IAED,8CAA8C;IAC9C,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAC7B,IAAI,IAAI,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAChC,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,kCAAkC;IAClC,IAAI,OAAO,CAAC,iBAAiB,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACvC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,6BAA6B;IAC7B,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACvC,IAAI,GAAG,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;YAC9B,OAAO,QAAQ,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,qBAAqB,CAC5B,MAAc,EACd,aAA4B;IAE5B,gBAAgB;IAChB,IAAI,MAAM,KAAK,GAAG,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;QACrC,OAAO,eAAe,CAAC;IACzB,CAAC;IAED,4CAA4C;IAC5C,IAAI,aAAa,EAAE,CAAC;QAClB,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,MAAM,KAAK,GAAG,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;QACrC,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;QAClB,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,44 @@
1
+ export interface PeekResult { // Outcome of peek(): header-based triage of a URL.
2
+ url: string; // The URL that was passed to peek().
3
+ status: number; // HTTP status of the response; 0 when no response was obtained.
4
+ accessible: boolean; // True only when status is in [200, 400) and no bot protection was detected.
5
+ contentType: string | null; // Value of the `content-type` response header, or null when absent/empty.
6
+ server: string | null; // Value of the `server` response header, or null when absent/empty.
7
+ framework: string | null; // peek() yields 'next.js', 'express', 'php', 'wordpress', 'shopify' or 'drupal'; null when nothing matched.
8
+ botProtection: string | null; // peek() yields 'cloudflare', 'perimeterx' or 'datadome'; null when nothing matched.
9
+ signals: string[]; // Reasons behind the framework/botProtection verdicts (e.g. 'cf-ray header'); ['fetch failed'] when no response was obtained.
10
+ recommendation: 'read' | 'capture' | 'auth_required' | 'blocked'; // peek() itself only produces 'read', 'auth_required' or 'blocked'.
11
+ }
12
+ export interface ReadResult { // Extracted page content as returned by read() and by Decoder.decode().
13
+ url: string; // In the generic pipeline: the URL exactly as passed to read().
14
+ title: string | null; // Generic pipeline: `ogTitle` from the parsed head, else `title`, else null.
15
+ author: string | null; // Generic pipeline: `author` from the parsed head, else null.
16
+ description: string | null; // Generic pipeline: `ogDescription` from the parsed head, else null.
17
+ content: string; // Extracted text; may have been truncated to ReadOptions.maxBytes by read().
18
+ links: Array<{
19
+ text: string;
20
+ href: string;
21
+ }>;
22
+ images: Array<{
23
+ alt: string;
24
+ src: string;
25
+ }>;
26
+ metadata: {
27
+ type: string; // Generic pipeline: `ogType` from the parsed head, else 'unknown'.
28
+ publishedAt: string | null; // Generic pipeline: `publishedTime` from the parsed head, else null.
29
+ source: string; // Generic pipeline sets 'spa-shell', 'og-tags-only' or 'readability'. NOTE(review): site decoders presumably set their own value - confirm.
30
+ canonical: string | null;
31
+ siteName: string | null; // Generic pipeline: `ogSiteName` from the parsed head, else null.
32
+ };
33
+ cost: {
34
+ tokens: number; // Estimate computed by read() as ceil(content.length / 4) whenever it builds or truncates content.
35
+ };
36
+ }
37
+ export interface Decoder { // Site-specific content decoder, as looked up by findDecoder() inside read().
38
+ name: string;
39
+ patterns: RegExp[]; // NOTE(review): presumably tested against the URL by findDecoder() to select this decoder - confirm in decoders/index.
40
+ decode(url: string, options?: { // read() calls this with { skipSsrf } only.
41
+ skipSsrf?: boolean;
42
+ [key: string]: any; // Open-ended bag for decoder-specific options; untyped.
43
+ }): Promise<ReadResult | null>; // A falsy result makes read() fall through to its generic HTML pipeline.
44
+ }
@@ -0,0 +1,3 @@
1
+ // src/read/types.ts
2
+ export {};
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/read/types.ts"],"names":[],"mappings":"AAAA,oBAAoB"}