@apitap/core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +60 -0
- package/README.md +362 -0
- package/SKILL.md +270 -0
- package/dist/auth/crypto.d.ts +31 -0
- package/dist/auth/crypto.js +66 -0
- package/dist/auth/crypto.js.map +1 -0
- package/dist/auth/handoff.d.ts +29 -0
- package/dist/auth/handoff.js +180 -0
- package/dist/auth/handoff.js.map +1 -0
- package/dist/auth/manager.d.ts +46 -0
- package/dist/auth/manager.js +127 -0
- package/dist/auth/manager.js.map +1 -0
- package/dist/auth/oauth-refresh.d.ts +16 -0
- package/dist/auth/oauth-refresh.js +91 -0
- package/dist/auth/oauth-refresh.js.map +1 -0
- package/dist/auth/refresh.d.ts +43 -0
- package/dist/auth/refresh.js +217 -0
- package/dist/auth/refresh.js.map +1 -0
- package/dist/capture/anti-bot.d.ts +15 -0
- package/dist/capture/anti-bot.js +43 -0
- package/dist/capture/anti-bot.js.map +1 -0
- package/dist/capture/blocklist.d.ts +6 -0
- package/dist/capture/blocklist.js +70 -0
- package/dist/capture/blocklist.js.map +1 -0
- package/dist/capture/body-diff.d.ts +8 -0
- package/dist/capture/body-diff.js +102 -0
- package/dist/capture/body-diff.js.map +1 -0
- package/dist/capture/body-variables.d.ts +13 -0
- package/dist/capture/body-variables.js +142 -0
- package/dist/capture/body-variables.js.map +1 -0
- package/dist/capture/domain.d.ts +8 -0
- package/dist/capture/domain.js +34 -0
- package/dist/capture/domain.js.map +1 -0
- package/dist/capture/entropy.d.ts +33 -0
- package/dist/capture/entropy.js +100 -0
- package/dist/capture/entropy.js.map +1 -0
- package/dist/capture/filter.d.ts +11 -0
- package/dist/capture/filter.js +49 -0
- package/dist/capture/filter.js.map +1 -0
- package/dist/capture/graphql.d.ts +21 -0
- package/dist/capture/graphql.js +99 -0
- package/dist/capture/graphql.js.map +1 -0
- package/dist/capture/idle.d.ts +23 -0
- package/dist/capture/idle.js +44 -0
- package/dist/capture/idle.js.map +1 -0
- package/dist/capture/monitor.d.ts +26 -0
- package/dist/capture/monitor.js +183 -0
- package/dist/capture/monitor.js.map +1 -0
- package/dist/capture/oauth-detector.d.ts +18 -0
- package/dist/capture/oauth-detector.js +96 -0
- package/dist/capture/oauth-detector.js.map +1 -0
- package/dist/capture/pagination.d.ts +9 -0
- package/dist/capture/pagination.js +40 -0
- package/dist/capture/pagination.js.map +1 -0
- package/dist/capture/parameterize.d.ts +17 -0
- package/dist/capture/parameterize.js +63 -0
- package/dist/capture/parameterize.js.map +1 -0
- package/dist/capture/scrubber.d.ts +5 -0
- package/dist/capture/scrubber.js +38 -0
- package/dist/capture/scrubber.js.map +1 -0
- package/dist/capture/session.d.ts +46 -0
- package/dist/capture/session.js +445 -0
- package/dist/capture/session.js.map +1 -0
- package/dist/capture/token-detector.d.ts +16 -0
- package/dist/capture/token-detector.js +62 -0
- package/dist/capture/token-detector.js.map +1 -0
- package/dist/capture/verifier.d.ts +17 -0
- package/dist/capture/verifier.js +147 -0
- package/dist/capture/verifier.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +930 -0
- package/dist/cli.js.map +1 -0
- package/dist/discovery/auth.d.ts +17 -0
- package/dist/discovery/auth.js +81 -0
- package/dist/discovery/auth.js.map +1 -0
- package/dist/discovery/fetch.d.ts +17 -0
- package/dist/discovery/fetch.js +59 -0
- package/dist/discovery/fetch.js.map +1 -0
- package/dist/discovery/frameworks.d.ts +11 -0
- package/dist/discovery/frameworks.js +249 -0
- package/dist/discovery/frameworks.js.map +1 -0
- package/dist/discovery/index.d.ts +21 -0
- package/dist/discovery/index.js +219 -0
- package/dist/discovery/index.js.map +1 -0
- package/dist/discovery/openapi.d.ts +13 -0
- package/dist/discovery/openapi.js +175 -0
- package/dist/discovery/openapi.js.map +1 -0
- package/dist/discovery/probes.d.ts +9 -0
- package/dist/discovery/probes.js +70 -0
- package/dist/discovery/probes.js.map +1 -0
- package/dist/index.d.ts +25 -0
- package/dist/index.js +25 -0
- package/dist/index.js.map +1 -0
- package/dist/inspect/report.d.ts +52 -0
- package/dist/inspect/report.js +191 -0
- package/dist/inspect/report.js.map +1 -0
- package/dist/mcp.d.ts +8 -0
- package/dist/mcp.js +526 -0
- package/dist/mcp.js.map +1 -0
- package/dist/orchestration/browse.d.ts +38 -0
- package/dist/orchestration/browse.js +198 -0
- package/dist/orchestration/browse.js.map +1 -0
- package/dist/orchestration/cache.d.ts +15 -0
- package/dist/orchestration/cache.js +24 -0
- package/dist/orchestration/cache.js.map +1 -0
- package/dist/plugin.d.ts +17 -0
- package/dist/plugin.js +158 -0
- package/dist/plugin.js.map +1 -0
- package/dist/read/decoders/deepwiki.d.ts +2 -0
- package/dist/read/decoders/deepwiki.js +148 -0
- package/dist/read/decoders/deepwiki.js.map +1 -0
- package/dist/read/decoders/grokipedia.d.ts +2 -0
- package/dist/read/decoders/grokipedia.js +210 -0
- package/dist/read/decoders/grokipedia.js.map +1 -0
- package/dist/read/decoders/hackernews.d.ts +2 -0
- package/dist/read/decoders/hackernews.js +168 -0
- package/dist/read/decoders/hackernews.js.map +1 -0
- package/dist/read/decoders/index.d.ts +2 -0
- package/dist/read/decoders/index.js +12 -0
- package/dist/read/decoders/index.js.map +1 -0
- package/dist/read/decoders/reddit.d.ts +2 -0
- package/dist/read/decoders/reddit.js +142 -0
- package/dist/read/decoders/reddit.js.map +1 -0
- package/dist/read/decoders/twitter.d.ts +12 -0
- package/dist/read/decoders/twitter.js +187 -0
- package/dist/read/decoders/twitter.js.map +1 -0
- package/dist/read/decoders/wikipedia.d.ts +2 -0
- package/dist/read/decoders/wikipedia.js +66 -0
- package/dist/read/decoders/wikipedia.js.map +1 -0
- package/dist/read/decoders/youtube.d.ts +2 -0
- package/dist/read/decoders/youtube.js +69 -0
- package/dist/read/decoders/youtube.js.map +1 -0
- package/dist/read/extract.d.ts +25 -0
- package/dist/read/extract.js +320 -0
- package/dist/read/extract.js.map +1 -0
- package/dist/read/index.d.ts +14 -0
- package/dist/read/index.js +66 -0
- package/dist/read/index.js.map +1 -0
- package/dist/read/peek.d.ts +9 -0
- package/dist/read/peek.js +137 -0
- package/dist/read/peek.js.map +1 -0
- package/dist/read/types.d.ts +44 -0
- package/dist/read/types.js +3 -0
- package/dist/read/types.js.map +1 -0
- package/dist/replay/engine.d.ts +53 -0
- package/dist/replay/engine.js +441 -0
- package/dist/replay/engine.js.map +1 -0
- package/dist/replay/truncate.d.ts +16 -0
- package/dist/replay/truncate.js +92 -0
- package/dist/replay/truncate.js.map +1 -0
- package/dist/serve.d.ts +31 -0
- package/dist/serve.js +149 -0
- package/dist/serve.js.map +1 -0
- package/dist/skill/generator.d.ts +44 -0
- package/dist/skill/generator.js +419 -0
- package/dist/skill/generator.js.map +1 -0
- package/dist/skill/importer.d.ts +26 -0
- package/dist/skill/importer.js +80 -0
- package/dist/skill/importer.js.map +1 -0
- package/dist/skill/search.d.ts +19 -0
- package/dist/skill/search.js +51 -0
- package/dist/skill/search.js.map +1 -0
- package/dist/skill/signing.d.ts +16 -0
- package/dist/skill/signing.js +34 -0
- package/dist/skill/signing.js.map +1 -0
- package/dist/skill/ssrf.d.ts +27 -0
- package/dist/skill/ssrf.js +210 -0
- package/dist/skill/ssrf.js.map +1 -0
- package/dist/skill/store.d.ts +7 -0
- package/dist/skill/store.js +93 -0
- package/dist/skill/store.js.map +1 -0
- package/dist/stats/report.d.ts +26 -0
- package/dist/stats/report.js +157 -0
- package/dist/stats/report.js.map +1 -0
- package/dist/types.d.ts +214 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +58 -0
- package/src/auth/crypto.ts +92 -0
- package/src/auth/handoff.ts +229 -0
- package/src/auth/manager.ts +140 -0
- package/src/auth/oauth-refresh.ts +120 -0
- package/src/auth/refresh.ts +300 -0
- package/src/capture/anti-bot.ts +63 -0
- package/src/capture/blocklist.ts +75 -0
- package/src/capture/body-diff.ts +109 -0
- package/src/capture/body-variables.ts +156 -0
- package/src/capture/domain.ts +34 -0
- package/src/capture/entropy.ts +121 -0
- package/src/capture/filter.ts +56 -0
- package/src/capture/graphql.ts +124 -0
- package/src/capture/idle.ts +45 -0
- package/src/capture/monitor.ts +224 -0
- package/src/capture/oauth-detector.ts +106 -0
- package/src/capture/pagination.ts +49 -0
- package/src/capture/parameterize.ts +68 -0
- package/src/capture/scrubber.ts +49 -0
- package/src/capture/session.ts +502 -0
- package/src/capture/token-detector.ts +76 -0
- package/src/capture/verifier.ts +171 -0
- package/src/cli.ts +1031 -0
- package/src/discovery/auth.ts +99 -0
- package/src/discovery/fetch.ts +85 -0
- package/src/discovery/frameworks.ts +231 -0
- package/src/discovery/index.ts +256 -0
- package/src/discovery/openapi.ts +230 -0
- package/src/discovery/probes.ts +76 -0
- package/src/index.ts +26 -0
- package/src/inspect/report.ts +247 -0
- package/src/mcp.ts +618 -0
- package/src/orchestration/browse.ts +250 -0
- package/src/orchestration/cache.ts +37 -0
- package/src/plugin.ts +188 -0
- package/src/read/decoders/deepwiki.ts +180 -0
- package/src/read/decoders/grokipedia.ts +246 -0
- package/src/read/decoders/hackernews.ts +198 -0
- package/src/read/decoders/index.ts +15 -0
- package/src/read/decoders/reddit.ts +158 -0
- package/src/read/decoders/twitter.ts +211 -0
- package/src/read/decoders/wikipedia.ts +75 -0
- package/src/read/decoders/youtube.ts +75 -0
- package/src/read/extract.ts +396 -0
- package/src/read/index.ts +78 -0
- package/src/read/peek.ts +175 -0
- package/src/read/types.ts +37 -0
- package/src/replay/engine.ts +559 -0
- package/src/replay/truncate.ts +116 -0
- package/src/serve.ts +189 -0
- package/src/skill/generator.ts +473 -0
- package/src/skill/importer.ts +107 -0
- package/src/skill/search.ts +76 -0
- package/src/skill/signing.ts +36 -0
- package/src/skill/ssrf.ts +238 -0
- package/src/skill/store.ts +107 -0
- package/src/stats/report.ts +208 -0
- package/src/types.ts +233 -0

package/src/read/extract.ts
ADDED
@@ -0,0 +1,396 @@
// src/read/extract.ts

export interface HeadMeta {
  title: string | null;
  ogTitle: string | null;
  ogDescription: string | null;
  ogImage: string | null;
  ogType: string | null;
  ogSiteName: string | null;
  canonical: string | null;
  author: string | null;
  publishedTime: string | null;
}

export interface ExtractResult {
  content: string;
  links: Array<{ text: string; href: string }>;
  images: Array<{ alt: string; src: string }>;
  isSpaShell: boolean;
}

// ---- HTML entity decoding ----

const ENTITY_MAP: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'",
  '&#39;': "'",
  '&nbsp;': ' ',
};

function decodeEntities(text: string): string {
  return text.replace(/&(?:amp|lt|gt|quot|apos|nbsp|#39);/g, (m) => ENTITY_MAP[m] ?? m);
}

// ---- parseHead ----

function extractMetaContent(html: string, attrName: string, attrValue: string): string | null {
  // Handle both orders: property="X" content="Y" and content="Y" property="X"
  // Also handle name="X" content="Y" for author etc.
  const patterns = [
    new RegExp(`<meta\\s+${attrName}=["']${escapeRegex(attrValue)}["']\\s+content=["']([^"']*)["']`, 'i'),
    new RegExp(`<meta\\s+content=["']([^"']*)["']\\s+${attrName}=["']${escapeRegex(attrValue)}["']`, 'i'),
  ];
  for (const re of patterns) {
    const m = html.match(re);
    if (m) return decodeEntities(m[1]);
  }
  return null;
}

function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

export function parseHead(html: string): HeadMeta {
  // Extract <title>
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch ? decodeEntities(titleMatch[1].trim()) : null;

  // Extract canonical
  const canonicalMatch = html.match(/<link\s+[^>]*rel=["']canonical["'][^>]*href=["']([^"']*)["'][^>]*\/?>/i)
    ?? html.match(/<link\s+[^>]*href=["']([^"']*)["'][^>]*rel=["']canonical["'][^>]*\/?>/i);
  const canonical = canonicalMatch ? decodeEntities(canonicalMatch[1]) : null;

  return {
    title,
    ogTitle: extractMetaContent(html, 'property', 'og:title'),
    ogDescription: extractMetaContent(html, 'property', 'og:description'),
    ogImage: extractMetaContent(html, 'property', 'og:image'),
    ogType: extractMetaContent(html, 'property', 'og:type'),
    ogSiteName: extractMetaContent(html, 'property', 'og:site_name'),
    canonical,
    author: extractMetaContent(html, 'name', 'author'),
    publishedTime: extractMetaContent(html, 'property', 'article:published_time'),
  };
}

// ---- extractContent ----

/** Tags whose entire content (including children) should be removed */
const NOISE_TAGS = ['script', 'style', 'noscript', 'svg', 'iframe', 'nav', 'header', 'footer', 'aside'];

/** SPA shell markers */
const SPA_MARKERS = [
  '<div id="root"',
  '<div id="app"',
  '<div id="__next"',
  'bundle.js',
  'main.js',
  'app.js',
  '__NEXT_DATA__',
  'window.__INITIAL_STATE__',
  'window.__NUXT__',
];

function stripTags(html: string, tags: string[]): string {
  let result = html;
  for (const tag of tags) {
    // Use non-greedy match with dotAll behavior via [\s\S]
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'gi');
    result = result.replace(re, '');
    // Also strip self-closing variants (e.g. <iframe ... />)
    const selfClose = new RegExp(`<${tag}[^>]*/?>`, 'gi');
    result = result.replace(selfClose, '');
  }
  return result;
}

function findContentRoot(html: string): string {
  // Priority order for content root
  const selectors: Array<{ re: RegExp }> = [
    { re: /<article[^>]*>([\s\S]*?)<\/article>/i },
    { re: /<main[^>]*>([\s\S]*?)<\/main>/i },
    { re: /<[^>]+role=["']main["'][^>]*>([\s\S]*?)<\/div>/i },
  ];

  for (const { re } of selectors) {
    const m = html.match(re);
    if (m) return m[1];
  }

  // Class-based selectors
  const classPatterns = [
    /class=["'][^"']*\bpost-content\b/i,
    /class=["'][^"']*\barticle-body\b/i,
    /class=["'][^"']*\bentry-content\b/i,
  ];

  for (const cp of classPatterns) {
    const m = html.match(cp);
    if (m) {
      // Find the enclosing tag and extract its content
      const idx = m.index!;
      const extracted = extractTagContent(html, idx);
      if (extracted) return extracted;
    }
  }

  // id="content"
  const contentId = html.match(/id=["']content["']/i);
  if (contentId) {
    const extracted = extractTagContent(html, contentId.index!);
    if (extracted) return extracted;
  }

  // Fallback: <body>
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  if (bodyMatch) return bodyMatch[1];

  return html;
}

function extractTagContent(html: string, attrIndex: number): string | null {
  // Walk backwards to find the opening < of the tag
  let tagStart = attrIndex;
  while (tagStart > 0 && html[tagStart] !== '<') tagStart--;

  // Find the tag name
  const tagNameMatch = html.slice(tagStart).match(/^<(\w+)/);
  if (!tagNameMatch) return null;

  const tagName = tagNameMatch[1];

  // Find matching close tag accounting for nesting
  let depth = 1;
  const openRe = new RegExp(`<${tagName}[\\s>]`, 'gi');
  const closeRe = new RegExp(`</${tagName}>`, 'gi');

  // Find where the opening tag ends (the first > after tagStart)
  const openTagEnd = html.indexOf('>', tagStart);
  if (openTagEnd === -1) return null;

  let pos = openTagEnd + 1;
  const contentStart = pos;

  while (depth > 0 && pos < html.length) {
    openRe.lastIndex = pos;
    closeRe.lastIndex = pos;

    const nextOpen = openRe.exec(html);
    const nextClose = closeRe.exec(html);

    if (!nextClose) break; // no more close tags

    if (nextOpen && nextOpen.index < nextClose.index) {
      depth++;
      pos = nextOpen.index + nextOpen[0].length;
    } else {
      depth--;
      if (depth === 0) {
        return html.slice(contentStart, nextClose.index);
      }
      pos = nextClose.index + nextClose[0].length;
    }
  }

  return null;
}

function htmlToMarkdown(
  html: string,
  links: Array<{ text: string; href: string }>,
  images: Array<{ alt: string; src: string }>,
): string {
  let md = html;

  // Remove HTML comments
  md = md.replace(/<!--[\s\S]*?-->/g, '');

  // Convert headings
  for (let level = 1; level <= 6; level++) {
    const prefix = '#'.repeat(level);
    const re = new RegExp(`<h${level}[^>]*>([\\s\\S]*?)<\\/h${level}>`, 'gi');
    md = md.replace(re, (_m, content) => {
      const text = stripAllTags(content).trim();
      return `\n\n${prefix} ${text}\n\n`;
    });
  }

  // Convert blockquotes (before paragraphs)
  md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m, content) => {
    const text = stripAllTags(content).trim();
    const quoted = text.split('\n').map((l: string) => `> ${l}`).join('\n');
    return `\n\n${quoted}\n\n`;
  });

  // Convert code blocks: <pre><code>...</code></pre>
  md = md.replace(/<pre[^>]*>\s*<code[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, content) => {
    const decoded = decodeEntities(content.trim());
    return `\n\n\`\`\`\n${decoded}\n\`\`\`\n\n`;
  });

  // Convert standalone <pre> (without <code>)
  md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_m, content) => {
    const decoded = decodeEntities(stripAllTags(content).trim());
    return `\n\n\`\`\`\n${decoded}\n\`\`\`\n\n`;
  });

  // Convert inline code
  md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (_m, content) => {
    return `\`${decodeEntities(content)}\``;
  });

  // Convert tables
  md = md.replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (_m, tableContent) => {
    return convertTable(tableContent);
  });

  // Convert images (before stripping tags, so we can extract src/alt)
  md = md.replace(/<img\s+[^>]*>/gi, (tag) => {
    const srcMatch = tag.match(/src=["']([^"']*)["']/i);
    const altMatch = tag.match(/alt=["']([^"']*)["']/i);
    const src = srcMatch ? decodeEntities(srcMatch[1]) : '';
    const alt = altMatch ? decodeEntities(altMatch[1]) : '';
    if (src) {
      images.push({ alt, src });
      return `![${alt}](${src})`;
    }
    return '';
  });

  // Convert links
  md = md.replace(/<a\s+[^>]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi, (_m, href, content) => {
    const text = stripAllTags(content).trim();
    const decodedHref = decodeEntities(href);
    if (text && decodedHref) {
      links.push({ text, href: decodedHref });
      return `[${text}](${decodedHref})`;
    }
    return text;
  });

  // Convert bold
  md = md.replace(/<(?:strong|b)[^>]*>([\s\S]*?)<\/(?:strong|b)>/gi, (_m, content) => {
    return `**${stripAllTags(content)}**`;
  });

  // Convert italic
  md = md.replace(/<(?:em|i)(?:\s[^>]*)?>(?!mg)([\s\S]*?)<\/(?:em|i)>/gi, (_m, content) => {
    return `*${stripAllTags(content)}*`;
  });

  // Convert ordered lists
  md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (_m, listContent) => {
    let counter = 0;
    const items = listContent.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_lm: string, item: string) => {
      counter++;
      return `${counter}. ${stripAllTags(item).trim()}\n`;
    });
    return `\n\n${stripAllTags(items).trim()}\n\n`;
  });

  // Convert unordered lists
  md = md.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, (_m, listContent) => {
    const items = listContent.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_lm: string, item: string) => {
      return `- ${stripAllTags(item).trim()}\n`;
    });
    return `\n\n${stripAllTags(items).trim()}\n\n`;
  });

  // Convert paragraphs
  md = md.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, (_m, content) => {
    return `\n\n${content.trim()}\n\n`;
  });

  // Convert <br> tags
  md = md.replace(/<br\s*\/?>/gi, '\n');

  // Strip remaining HTML tags
  md = stripAllTags(md);

  // Decode entities
  md = decodeEntities(md);

  // Collapse whitespace: no more than 2 consecutive newlines
  md = md.replace(/\n{3,}/g, '\n\n');

  // Trim leading/trailing whitespace
  md = md.trim();

  return md;
}

function stripAllTags(html: string): string {
  return html.replace(/<[^>]*>/g, '');
}

function convertTable(tableHtml: string): string {
  const rows: string[][] = [];

  // Extract rows
  const rowMatches = tableHtml.match(/<tr[^>]*>[\s\S]*?<\/tr>/gi) || [];
  for (const row of rowMatches) {
    const cells: string[] = [];
    const cellMatches = row.match(/<(?:td|th)[^>]*>[\s\S]*?<\/(?:td|th)>/gi) || [];
    for (const cell of cellMatches) {
      const content = cell.replace(/<\/?(?:td|th)[^>]*>/gi, '');
      cells.push(stripAllTags(content).trim());
    }
    if (cells.length > 0) rows.push(cells);
  }

  if (rows.length === 0) return '';

  // Normalize column count
  const maxCols = Math.max(...rows.map((r) => r.length));
  const normalized = rows.map((r) => {
    while (r.length < maxCols) r.push('');
    return r;
  });

  // Build markdown table
  const lines: string[] = [];
  const header = normalized[0];
  lines.push('| ' + header.join(' | ') + ' |');
  lines.push('| ' + header.map(() => '---').join(' | ') + ' |');

  for (let i = 1; i < normalized.length; i++) {
    lines.push('| ' + normalized[i].join(' | ') + ' |');
  }

  return '\n\n' + lines.join('\n') + '\n\n';
}

function getTextContent(html: string): string {
  return stripAllTags(html).replace(/\s+/g, ' ').trim();
}

export function extractContent(html: string): ExtractResult {
  const links: Array<{ text: string; href: string }> = [];
  const images: Array<{ alt: string; src: string }> = [];

  // Strip noise tags first
  const cleaned = stripTags(html, NOISE_TAGS);

  // Find content root
  const contentHtml = findContentRoot(cleaned);

  // Convert to markdown
  const content = htmlToMarkdown(contentHtml, links, images);

  // Detect SPA shell
  const textContent = getTextContent(contentHtml);
  const hasSpaMarker = SPA_MARKERS.some((marker) => html.includes(marker));
  const isSpaShell = textContent.length < 200 && hasSpaMarker;

  return {
    content,
    links,
    images,
    isSpaShell,
  };
}
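
Illustrative usage sketch for the two exported helpers above. This is not part of the package; the sample HTML and the commented outputs are assumptions traced from the code as shown.

import { parseHead, extractContent } from './extract.js';

const html = `<html><head><title>Hello &amp; welcome</title>
<meta property="og:title" content="Hello"></head>
<body><article><h1>Hello</h1><p>First <strong>post</strong>.</p></article></body></html>`;

const head = parseHead(html);
// head.title === 'Hello & welcome', head.ogTitle === 'Hello'; the remaining fields are null here

const body = extractContent(html);
// body.content === '# Hello\n\nFirst **post**.', body.isSpaShell === false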

package/src/read/index.ts
ADDED
@@ -0,0 +1,78 @@
// src/read/index.ts
export { peek } from './peek.js';
export type { PeekOptions } from './peek.js';
export type { PeekResult, ReadResult, Decoder } from './types.js';

import type { ReadResult } from './types.js';
import { safeFetch } from '../discovery/fetch.js';
import { findDecoder } from './decoders/index.js';
import { parseHead, extractContent } from './extract.js';

export interface ReadOptions {
  skipSsrf?: boolean;
  maxBytes?: number;
}

/**
 * Universal content decoder. Routes to site-specific decoders for known sites
 * (Reddit, YouTube, Wikipedia, HN), falls back to generic HTML extraction.
 * Returns null if content cannot be extracted.
 */
export async function read(url: string, options: ReadOptions = {}): Promise<ReadResult | null> {
  // Try site-specific decoder first
  const decoder = findDecoder(url);
  if (decoder) {
    const result = await decoder.decode(url, { skipSsrf: options.skipSsrf });
    if (result) {
      if (options.maxBytes && result.content.length > options.maxBytes) {
        result.content = result.content.slice(0, options.maxBytes);
        result.cost.tokens = Math.ceil(result.content.length / 4);
      }
      return result;
    }
    // Decoder returned null -- fall through to generic
  }

  // Generic pipeline: fetch HTML -> parse head -> extract body
  const fetchResult = await safeFetch(url, { skipSsrf: options.skipSsrf });
  if (!fetchResult || fetchResult.status !== 200) return null;

  const html = fetchResult.body;
  const head = parseHead(html);
  const body = extractContent(html);

  // Determine source
  let source: string;
  if (body.isSpaShell) {
    source = 'spa-shell';
  } else if (body.content.trim().length === 0) {
    source = 'og-tags-only';
  } else {
    source = 'readability';
  }

  let content = body.content;
  if (options.maxBytes && content.length > options.maxBytes) {
    content = content.slice(0, options.maxBytes);
  }

  const title = head.ogTitle || head.title || null;

  return {
    url,
    title,
    author: head.author || null,
    description: head.ogDescription || null,
    content,
    links: body.links,
    images: body.images,
    metadata: {
      type: head.ogType || 'unknown',
      publishedAt: head.publishedTime || null,
      source,
      canonical: head.canonical || null,
      siteName: head.ogSiteName || null,
    },
    cost: { tokens: Math.ceil(content.length / 4) },
  };
}
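
Illustrative usage sketch for read(). Not part of the package; the URL is a placeholder, the import path is relative to src/, and the commented values describe the generic HTML pipeline shown above.

import { read } from './read/index.js';

// Placeholder URL; a real call fetches the page via safeFetch (SSRF-checked unless skipSsrf is set).
const article = await read('https://example.com/blog/post', { maxBytes: 20_000 });
if (article) {
  console.log(article.title);            // og:title, falling back to <title>
  console.log(article.metadata.source);  // 'readability', 'og-tags-only', or 'spa-shell' on the generic path
  console.log(article.cost.tokens);      // rough estimate: content length / 4
}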
package/src/read/peek.ts
ADDED
@@ -0,0 +1,175 @@
// src/read/peek.ts
import type { PeekResult } from './types.js';
import { safeFetch } from '../discovery/fetch.js';

export interface PeekOptions {
  skipSsrf?: boolean;
}

/**
 * HTTP HEAD-only triage: checks accessibility, detects bot protection and frameworks.
 * Falls back to GET if HEAD fails.
 */
export async function peek(url: string, options: PeekOptions = {}): Promise<PeekResult> {
  const signals: string[] = [];

  // Try HEAD first
  let result = await safeFetch(url, {
    method: 'HEAD',
    skipSsrf: options.skipSsrf,
  });

  // Fall back to GET if HEAD fails (null = network/SSRF error)
  if (!result) {
    result = await safeFetch(url, {
      method: 'GET',
      skipSsrf: options.skipSsrf,
    });
  }

  // Both HEAD and GET failed
  if (!result) {
    return {
      url,
      status: 0,
      accessible: false,
      contentType: null,
      server: null,
      framework: null,
      botProtection: null,
      signals: ['fetch failed'],
      recommendation: 'blocked',
    };
  }

  const { status, headers } = result;

  // Extract basic metadata
  const contentType = headers['content-type'] || null;
  const server = headers['server'] || null;

  // Detect bot protection
  const botProtection = detectBotProtection(headers, signals);

  // Detect framework
  const framework = detectFramework(headers, signals);

  // Determine accessibility and recommendation
  const accessible = status >= 200 && status < 400 && !botProtection;
  const recommendation = computeRecommendation(status, botProtection);

  return {
    url,
    status,
    accessible,
    contentType,
    server,
    framework,
    botProtection,
    signals,
    recommendation,
  };
}

function detectBotProtection(
  headers: Record<string, string>,
  signals: string[],
): string | null {
  // Cloudflare: cf-ray or cf-cache-status
  if (headers['cf-ray']) {
    signals.push('cf-ray header');
    return 'cloudflare';
  }
  if (headers['cf-cache-status']) {
    signals.push('cf-cache-status header');
    return 'cloudflare';
  }

  // PerimeterX: x-px-* headers
  for (const key of Object.keys(headers)) {
    if (key.startsWith('x-px-')) {
      signals.push(`${key} header`);
      return 'perimeterx';
    }
  }

  // DataDome: x-datadome* headers
  for (const key of Object.keys(headers)) {
    if (key.startsWith('x-datadome')) {
      signals.push(`${key} header`);
      return 'datadome';
    }
  }

  return null;
}

function detectFramework(
  headers: Record<string, string>,
  signals: string[],
): string | null {
  // Next.js: x-powered-by: Next.js
  const poweredBy = headers['x-powered-by'];
  if (poweredBy && /next\.js/i.test(poweredBy)) {
    signals.push('x-powered-by: Next.js');
    return 'next.js';
  }

  // Express: x-powered-by: Express
  if (poweredBy && /express/i.test(poweredBy)) {
    signals.push('x-powered-by: Express');
    return 'express';
  }

  // PHP: x-powered-by: PHP/*
  if (poweredBy && /php/i.test(poweredBy)) {
    signals.push('x-powered-by: PHP');
    return 'php';
  }

  // WordPress: link header containing api.w.org
  const link = headers['link'];
  if (link && link.includes('api.w.org')) {
    signals.push('link: api.w.org');
    return 'wordpress';
  }

  // Shopify: x-shopify-stage header
  if (headers['x-shopify-stage']) {
    signals.push('x-shopify-stage header');
    return 'shopify';
  }

  // Drupal: x-drupal-* headers
  for (const key of Object.keys(headers)) {
    if (key.startsWith('x-drupal-')) {
      signals.push(`${key} header`);
      return 'drupal';
    }
  }

  return null;
}

function computeRecommendation(
  status: number,
  botProtection: string | null,
): PeekResult['recommendation'] {
  // Auth required
  if (status === 401 || status === 407) {
    return 'auth_required';
  }

  // Blocked: bot protection, 403, 429, or 5xx
  if (botProtection) {
    return 'blocked';
  }
  if (status === 403 || status === 429) {
    return 'blocked';
  }
  if (status >= 500) {
    return 'blocked';
  }

  return 'read';
}
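
Illustrative usage sketch for peek(). Not part of the package; the URL is a placeholder and the import path is relative to src/.

import { peek } from './read/peek.js';

const triage = await peek('https://example.com');
// triage.recommendation comes from computeRecommendation above
if (triage.recommendation === 'read') {
  // 2xx/3xx with no bot protection detected: safe to follow up with read()
} else if (triage.recommendation === 'auth_required') {
  // 401 or 407
} else {
  // 'blocked': cloudflare/perimeterx/datadome headers, 403/429, or 5xx
}
console.log(triage.framework, triage.signals); // e.g. 'wordpress', ['link: api.w.org']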
package/src/read/types.ts
ADDED
@@ -0,0 +1,37 @@
// src/read/types.ts

export interface PeekResult {
  url: string;
  status: number;
  accessible: boolean;
  contentType: string | null;
  server: string | null;
  framework: string | null;
  botProtection: string | null;
  signals: string[];
  recommendation: 'read' | 'capture' | 'auth_required' | 'blocked';
}

export interface ReadResult {
  url: string;
  title: string | null;
  author: string | null;
  description: string | null;
  content: string;
  links: Array<{ text: string; href: string }>;
  images: Array<{ alt: string; src: string }>;
  metadata: {
    type: string;
    publishedAt: string | null;
    source: string;
    canonical: string | null;
    siteName: string | null;
  };
  cost: { tokens: number };
}

export interface Decoder {
  name: string;
  patterns: RegExp[];
  decode(url: string, options?: { skipSsrf?: boolean; [key: string]: any }): Promise<ReadResult | null>;
}
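
Illustrative sketch of a site-specific decoder satisfying the Decoder interface above. The site, patterns, and returned values are invented for illustration; the package's real decoders live under src/read/decoders/.

import type { Decoder, ReadResult } from './read/types.js';

const exampleDecoder: Decoder = {
  name: 'example',
  patterns: [/^https?:\/\/(www\.)?example\.com\//i],
  async decode(url, _options): Promise<ReadResult | null> {
    // A real decoder would fetch and parse the site's own API or HTML here,
    // honouring _options?.skipSsrf, and return null when it cannot decode.
    const content = 'Decoded article text.';
    return {
      url,
      title: 'Example title',
      author: null,
      description: null,
      content,
      links: [],
      images: [],
      metadata: { type: 'article', publishedAt: null, source: 'example', canonical: null, siteName: 'Example' },
      cost: { tokens: Math.ceil(content.length / 4) },
    };
  },
};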