@apitap/core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +60 -0
- package/README.md +362 -0
- package/SKILL.md +270 -0
- package/dist/auth/crypto.d.ts +31 -0
- package/dist/auth/crypto.js +66 -0
- package/dist/auth/crypto.js.map +1 -0
- package/dist/auth/handoff.d.ts +29 -0
- package/dist/auth/handoff.js +180 -0
- package/dist/auth/handoff.js.map +1 -0
- package/dist/auth/manager.d.ts +46 -0
- package/dist/auth/manager.js +127 -0
- package/dist/auth/manager.js.map +1 -0
- package/dist/auth/oauth-refresh.d.ts +16 -0
- package/dist/auth/oauth-refresh.js +91 -0
- package/dist/auth/oauth-refresh.js.map +1 -0
- package/dist/auth/refresh.d.ts +43 -0
- package/dist/auth/refresh.js +217 -0
- package/dist/auth/refresh.js.map +1 -0
- package/dist/capture/anti-bot.d.ts +15 -0
- package/dist/capture/anti-bot.js +43 -0
- package/dist/capture/anti-bot.js.map +1 -0
- package/dist/capture/blocklist.d.ts +6 -0
- package/dist/capture/blocklist.js +70 -0
- package/dist/capture/blocklist.js.map +1 -0
- package/dist/capture/body-diff.d.ts +8 -0
- package/dist/capture/body-diff.js +102 -0
- package/dist/capture/body-diff.js.map +1 -0
- package/dist/capture/body-variables.d.ts +13 -0
- package/dist/capture/body-variables.js +142 -0
- package/dist/capture/body-variables.js.map +1 -0
- package/dist/capture/domain.d.ts +8 -0
- package/dist/capture/domain.js +34 -0
- package/dist/capture/domain.js.map +1 -0
- package/dist/capture/entropy.d.ts +33 -0
- package/dist/capture/entropy.js +100 -0
- package/dist/capture/entropy.js.map +1 -0
- package/dist/capture/filter.d.ts +11 -0
- package/dist/capture/filter.js +49 -0
- package/dist/capture/filter.js.map +1 -0
- package/dist/capture/graphql.d.ts +21 -0
- package/dist/capture/graphql.js +99 -0
- package/dist/capture/graphql.js.map +1 -0
- package/dist/capture/idle.d.ts +23 -0
- package/dist/capture/idle.js +44 -0
- package/dist/capture/idle.js.map +1 -0
- package/dist/capture/monitor.d.ts +26 -0
- package/dist/capture/monitor.js +183 -0
- package/dist/capture/monitor.js.map +1 -0
- package/dist/capture/oauth-detector.d.ts +18 -0
- package/dist/capture/oauth-detector.js +96 -0
- package/dist/capture/oauth-detector.js.map +1 -0
- package/dist/capture/pagination.d.ts +9 -0
- package/dist/capture/pagination.js +40 -0
- package/dist/capture/pagination.js.map +1 -0
- package/dist/capture/parameterize.d.ts +17 -0
- package/dist/capture/parameterize.js +63 -0
- package/dist/capture/parameterize.js.map +1 -0
- package/dist/capture/scrubber.d.ts +5 -0
- package/dist/capture/scrubber.js +38 -0
- package/dist/capture/scrubber.js.map +1 -0
- package/dist/capture/session.d.ts +46 -0
- package/dist/capture/session.js +445 -0
- package/dist/capture/session.js.map +1 -0
- package/dist/capture/token-detector.d.ts +16 -0
- package/dist/capture/token-detector.js +62 -0
- package/dist/capture/token-detector.js.map +1 -0
- package/dist/capture/verifier.d.ts +17 -0
- package/dist/capture/verifier.js +147 -0
- package/dist/capture/verifier.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +930 -0
- package/dist/cli.js.map +1 -0
- package/dist/discovery/auth.d.ts +17 -0
- package/dist/discovery/auth.js +81 -0
- package/dist/discovery/auth.js.map +1 -0
- package/dist/discovery/fetch.d.ts +17 -0
- package/dist/discovery/fetch.js +59 -0
- package/dist/discovery/fetch.js.map +1 -0
- package/dist/discovery/frameworks.d.ts +11 -0
- package/dist/discovery/frameworks.js +249 -0
- package/dist/discovery/frameworks.js.map +1 -0
- package/dist/discovery/index.d.ts +21 -0
- package/dist/discovery/index.js +219 -0
- package/dist/discovery/index.js.map +1 -0
- package/dist/discovery/openapi.d.ts +13 -0
- package/dist/discovery/openapi.js +175 -0
- package/dist/discovery/openapi.js.map +1 -0
- package/dist/discovery/probes.d.ts +9 -0
- package/dist/discovery/probes.js +70 -0
- package/dist/discovery/probes.js.map +1 -0
- package/dist/index.d.ts +25 -0
- package/dist/index.js +25 -0
- package/dist/index.js.map +1 -0
- package/dist/inspect/report.d.ts +52 -0
- package/dist/inspect/report.js +191 -0
- package/dist/inspect/report.js.map +1 -0
- package/dist/mcp.d.ts +8 -0
- package/dist/mcp.js +526 -0
- package/dist/mcp.js.map +1 -0
- package/dist/orchestration/browse.d.ts +38 -0
- package/dist/orchestration/browse.js +198 -0
- package/dist/orchestration/browse.js.map +1 -0
- package/dist/orchestration/cache.d.ts +15 -0
- package/dist/orchestration/cache.js +24 -0
- package/dist/orchestration/cache.js.map +1 -0
- package/dist/plugin.d.ts +17 -0
- package/dist/plugin.js +158 -0
- package/dist/plugin.js.map +1 -0
- package/dist/read/decoders/deepwiki.d.ts +2 -0
- package/dist/read/decoders/deepwiki.js +148 -0
- package/dist/read/decoders/deepwiki.js.map +1 -0
- package/dist/read/decoders/grokipedia.d.ts +2 -0
- package/dist/read/decoders/grokipedia.js +210 -0
- package/dist/read/decoders/grokipedia.js.map +1 -0
- package/dist/read/decoders/hackernews.d.ts +2 -0
- package/dist/read/decoders/hackernews.js +168 -0
- package/dist/read/decoders/hackernews.js.map +1 -0
- package/dist/read/decoders/index.d.ts +2 -0
- package/dist/read/decoders/index.js +12 -0
- package/dist/read/decoders/index.js.map +1 -0
- package/dist/read/decoders/reddit.d.ts +2 -0
- package/dist/read/decoders/reddit.js +142 -0
- package/dist/read/decoders/reddit.js.map +1 -0
- package/dist/read/decoders/twitter.d.ts +12 -0
- package/dist/read/decoders/twitter.js +187 -0
- package/dist/read/decoders/twitter.js.map +1 -0
- package/dist/read/decoders/wikipedia.d.ts +2 -0
- package/dist/read/decoders/wikipedia.js +66 -0
- package/dist/read/decoders/wikipedia.js.map +1 -0
- package/dist/read/decoders/youtube.d.ts +2 -0
- package/dist/read/decoders/youtube.js +69 -0
- package/dist/read/decoders/youtube.js.map +1 -0
- package/dist/read/extract.d.ts +25 -0
- package/dist/read/extract.js +320 -0
- package/dist/read/extract.js.map +1 -0
- package/dist/read/index.d.ts +14 -0
- package/dist/read/index.js +66 -0
- package/dist/read/index.js.map +1 -0
- package/dist/read/peek.d.ts +9 -0
- package/dist/read/peek.js +137 -0
- package/dist/read/peek.js.map +1 -0
- package/dist/read/types.d.ts +44 -0
- package/dist/read/types.js +3 -0
- package/dist/read/types.js.map +1 -0
- package/dist/replay/engine.d.ts +53 -0
- package/dist/replay/engine.js +441 -0
- package/dist/replay/engine.js.map +1 -0
- package/dist/replay/truncate.d.ts +16 -0
- package/dist/replay/truncate.js +92 -0
- package/dist/replay/truncate.js.map +1 -0
- package/dist/serve.d.ts +31 -0
- package/dist/serve.js +149 -0
- package/dist/serve.js.map +1 -0
- package/dist/skill/generator.d.ts +44 -0
- package/dist/skill/generator.js +419 -0
- package/dist/skill/generator.js.map +1 -0
- package/dist/skill/importer.d.ts +26 -0
- package/dist/skill/importer.js +80 -0
- package/dist/skill/importer.js.map +1 -0
- package/dist/skill/search.d.ts +19 -0
- package/dist/skill/search.js +51 -0
- package/dist/skill/search.js.map +1 -0
- package/dist/skill/signing.d.ts +16 -0
- package/dist/skill/signing.js +34 -0
- package/dist/skill/signing.js.map +1 -0
- package/dist/skill/ssrf.d.ts +27 -0
- package/dist/skill/ssrf.js +210 -0
- package/dist/skill/ssrf.js.map +1 -0
- package/dist/skill/store.d.ts +7 -0
- package/dist/skill/store.js +93 -0
- package/dist/skill/store.js.map +1 -0
- package/dist/stats/report.d.ts +26 -0
- package/dist/stats/report.js +157 -0
- package/dist/stats/report.js.map +1 -0
- package/dist/types.d.ts +214 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +58 -0
- package/src/auth/crypto.ts +92 -0
- package/src/auth/handoff.ts +229 -0
- package/src/auth/manager.ts +140 -0
- package/src/auth/oauth-refresh.ts +120 -0
- package/src/auth/refresh.ts +300 -0
- package/src/capture/anti-bot.ts +63 -0
- package/src/capture/blocklist.ts +75 -0
- package/src/capture/body-diff.ts +109 -0
- package/src/capture/body-variables.ts +156 -0
- package/src/capture/domain.ts +34 -0
- package/src/capture/entropy.ts +121 -0
- package/src/capture/filter.ts +56 -0
- package/src/capture/graphql.ts +124 -0
- package/src/capture/idle.ts +45 -0
- package/src/capture/monitor.ts +224 -0
- package/src/capture/oauth-detector.ts +106 -0
- package/src/capture/pagination.ts +49 -0
- package/src/capture/parameterize.ts +68 -0
- package/src/capture/scrubber.ts +49 -0
- package/src/capture/session.ts +502 -0
- package/src/capture/token-detector.ts +76 -0
- package/src/capture/verifier.ts +171 -0
- package/src/cli.ts +1031 -0
- package/src/discovery/auth.ts +99 -0
- package/src/discovery/fetch.ts +85 -0
- package/src/discovery/frameworks.ts +231 -0
- package/src/discovery/index.ts +256 -0
- package/src/discovery/openapi.ts +230 -0
- package/src/discovery/probes.ts +76 -0
- package/src/index.ts +26 -0
- package/src/inspect/report.ts +247 -0
- package/src/mcp.ts +618 -0
- package/src/orchestration/browse.ts +250 -0
- package/src/orchestration/cache.ts +37 -0
- package/src/plugin.ts +188 -0
- package/src/read/decoders/deepwiki.ts +180 -0
- package/src/read/decoders/grokipedia.ts +246 -0
- package/src/read/decoders/hackernews.ts +198 -0
- package/src/read/decoders/index.ts +15 -0
- package/src/read/decoders/reddit.ts +158 -0
- package/src/read/decoders/twitter.ts +211 -0
- package/src/read/decoders/wikipedia.ts +75 -0
- package/src/read/decoders/youtube.ts +75 -0
- package/src/read/extract.ts +396 -0
- package/src/read/index.ts +78 -0
- package/src/read/peek.ts +175 -0
- package/src/read/types.ts +37 -0
- package/src/replay/engine.ts +559 -0
- package/src/replay/truncate.ts +116 -0
- package/src/serve.ts +189 -0
- package/src/skill/generator.ts +473 -0
- package/src/skill/importer.ts +107 -0
- package/src/skill/search.ts +76 -0
- package/src/skill/signing.ts +36 -0
- package/src/skill/ssrf.ts +238 -0
- package/src/skill/store.ts +107 -0
- package/src/stats/report.ts +208 -0
- package/src/types.ts +233 -0
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
// src/read/extract.ts
|
|
2
|
+
// ---- HTML entity decoding ----
|
|
3
|
+
/**
 * Lookup table from the HTML entities this module understands to the literal
 * character each one represents. Keys are the complete entity text (leading
 * `&` and trailing `;` included) because decodeEntities indexes the table
 * with the whole regex match.
 */
const ENTITY_MAP = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&apos;': "'",
    '&nbsp;': ' ',
};
/**
 * Replace the entities listed in ENTITY_MAP with their literal characters.
 *
 * Decoding is a single pass over the input, so an escaped entity such as
 * `&amp;lt;` becomes the text `&lt;` rather than being decoded twice.
 * Entities that are not in the table are left untouched.
 *
 * @param text String that may contain HTML entities.
 * @returns The string with every recognised entity decoded.
 */
function decodeEntities(text) {
    // The alternation must stay in sync with the keys of ENTITY_MAP; the
    // `?? m` fallback keeps the original text if the two ever drift apart.
    return text.replace(/&(?:amp|lt|gt|quot|apos|nbsp|#39);/g, (m) => ENTITY_MAP[m] ?? m);
}
|
|
15
|
+
// ---- parseHead ----
|
|
16
|
+
/**
 * Read the `content` value of the first <meta> tag whose `attrName`
 * attribute equals `attrValue`.
 *
 * Two attribute orders are recognised: the identifying attribute followed by
 * `content`, and `content` followed by the identifying attribute. In both
 * cases the two attributes must be the first two attributes of the tag and
 * separated only by whitespace. Matching is case-insensitive.
 *
 * @param html Document markup to search.
 * @param attrName Identifying attribute name (e.g. `property` or `name`); inserted into the pattern verbatim.
 * @param attrValue Value the identifying attribute must have; regex-escaped before use.
 * @returns The entity-decoded content value, or null when no tag matches.
 */
function extractMetaContent(html, attrName, attrValue) {
    const identifier = `${attrName}=["']${escapeRegex(attrValue)}["']`;
    const contentAttr = `content=["']([^"']*)["']`;
    const orderings = [
        `<meta\\s+${identifier}\\s+${contentAttr}`,
        `<meta\\s+${contentAttr}\\s+${identifier}`,
    ];
    for (const source of orderings) {
        const found = new RegExp(source, 'i').exec(html);
        if (found !== null) {
            return decodeEntities(found[1]);
        }
    }
    return null;
}
|
|
30
|
+
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded in a RegExp source and match `s` literally.
 *
 * @param s Arbitrary text.
 * @returns The text with `. * + ? ^ $ { } ( ) | [ ] \` each preceded by a backslash.
 */
function escapeRegex(s) {
    const metaChars = /[.*+?^${}()|[\]\\]/g;
    return s.replace(metaChars, (ch) => `\\${ch}`);
}
|
|
33
|
+
/**
 * Pull document-level metadata out of raw HTML using regular expressions.
 *
 * @param html Full document markup.
 * @returns An object with `title`, the Open Graph fields (`ogTitle`,
 *   `ogDescription`, `ogImage`, `ogType`, `ogSiteName`), `canonical`,
 *   `author` and `publishedTime`. Each field is an entity-decoded string, or
 *   null when the corresponding tag was not found.
 */
export function parseHead(html) {
    const openGraph = (property) => extractMetaContent(html, 'property', property);
    // <title>: first occurrence, surrounding whitespace trimmed before decoding.
    let title = null;
    const titleTag = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    if (titleTag) {
        title = decodeEntities(titleTag[1].trim());
    }
    // <link rel="canonical">: accept rel-before-href, then href-before-rel.
    const relThenHref = /<link\s+[^>]*rel=["']canonical["'][^>]*href=["']([^"']*)["'][^>]*\/?>/i;
    const hrefThenRel = /<link\s+[^>]*href=["']([^"']*)["'][^>]*rel=["']canonical["'][^>]*\/?>/i;
    const canonicalLink = relThenHref.exec(html) ?? hrefThenRel.exec(html);
    let canonical = null;
    if (canonicalLink) {
        canonical = decodeEntities(canonicalLink[1]);
    }
    return {
        title,
        ogTitle: openGraph('og:title'),
        ogDescription: openGraph('og:description'),
        ogImage: openGraph('og:image'),
        ogType: openGraph('og:type'),
        ogSiteName: openGraph('og:site_name'),
        canonical,
        author: extractMetaContent(html, 'name', 'author'),
        publishedTime: openGraph('article:published_time'),
    };
}
|
|
53
|
+
// ---- extractContent ----
|
|
54
|
+
/**
 * Element names treated as page chrome or non-content: each is removed
 * together with everything nested inside it before content extraction.
 */
const NOISE_TAGS = [
    'script',
    'style',
    'noscript',
    'svg',
    'iframe',
    'nav',
    'header',
    'footer',
    'aside',
];
/**
 * Substrings whose presence in the raw HTML suggests a client-rendered
 * (single-page application) shell: common mount-point divs, conventional
 * bundle file names, and framework bootstrap globals.
 */
const SPA_MARKERS = [
    // Mount points
    '<div id="root"',
    '<div id="app"',
    '<div id="__next"',
    // Bundle file names
    'bundle.js',
    'main.js',
    'app.js',
    // Framework bootstrap data
    '__NEXT_DATA__',
    'window.__INITIAL_STATE__',
    'window.__NUXT__',
];
|
|
68
|
+
/**
 * Remove every element named in `tags` from `html`.
 *
 * For each tag name two passes are made, both case-insensitive:
 *   1. paired elements (`<tag ...> ... </tag>`) are removed together with
 *      their content, matching up to the nearest closing tag;
 *   2. any remaining opening or self-closing `<tag ...>` / `<tag ... />`
 *      is removed on its own.
 *
 * The second pass only matches when the tag name is followed by whitespace,
 * `/` or `>`, so stripping `nav` does not also delete the opening tag of an
 * unrelated element such as `<navbar>` or `<header-bar>`.
 *
 * @param html Markup to clean.
 * @param tags Element names to remove (plain names, no angle brackets).
 * @returns The markup with the listed elements removed.
 */
function stripTags(html, tags) {
    let result = html;
    for (const tag of tags) {
        // Paired form; [\s\S] lets the non-greedy body span newlines.
        const paired = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'gi');
        result = result.replace(paired, '');
        // Orphan opening / self-closing form. The lookahead is the name
        // boundary; `[^>]*` already consumes an optional trailing slash.
        const orphan = new RegExp(`<${tag}(?=[\\s/>])[^>]*>`, 'gi');
        result = result.replace(orphan, '');
    }
    return result;
}
|
|
80
|
+
/**
 * Pick the fragment of `html` most likely to hold the page's main content.
 *
 * Strategies are tried in priority order and the first hit wins:
 *   1. the first <article> element, then the first <main> element
 *      (non-greedy: content runs to the nearest matching closing tag);
 *   2. the first element carrying role="main";
 *   3. the first element whose class list contains `post-content`,
 *      `article-body` or `entry-content` (checked in that order);
 *   4. the first element with id="content";
 *   5. the <body> element;
 *   6. the input itself.
 *
 * Strategies 2-4 delegate to extractTagContent, which balances nested tags
 * of the same name. For role="main" this matters: the container is usually
 * a <div> that itself contains <div>s, so stopping at the first `</div>`
 * would truncate the content, and the element need not be a <div> at all.
 *
 * @param html Markup to search (callers pass HTML with noise tags removed).
 * @returns The inner HTML of the chosen container.
 */
function findContentRoot(html) {
    // 1. Semantic containers.
    const semanticContainers = [
        /<article[^>]*>([\s\S]*?)<\/article>/i,
        /<main[^>]*>([\s\S]*?)<\/main>/i,
    ];
    for (const re of semanticContainers) {
        const m = html.match(re);
        if (m)
            return m[1];
    }
    // 2. ARIA landmark. The match starts at the tag's own '<' (no other
    //    '<' or '>' may sit between it and the attribute).
    const roleMain = html.match(/<[a-z][^<>]*\srole=["']main["']/i);
    if (roleMain) {
        const extracted = extractTagContent(html, roleMain.index);
        // An empty container is still a match here; only an unbalanced or
        // unparseable tag (null) falls through to the next strategy.
        if (extracted !== null)
            return extracted;
    }
    // 3. Well-known content class names.
    const classPatterns = [
        /class=["'][^"']*\bpost-content\b/i,
        /class=["'][^"']*\barticle-body\b/i,
        /class=["'][^"']*\bentry-content\b/i,
    ];
    for (const cp of classPatterns) {
        const m = html.match(cp);
        if (m) {
            // m.index points at the attribute; extractTagContent walks back
            // to the enclosing tag and returns its balanced content.
            const extracted = extractTagContent(html, m.index);
            if (extracted)
                return extracted;
        }
    }
    // 4. id="content"
    const contentId = html.match(/id=["']content["']/i);
    if (contentId) {
        const extracted = extractTagContent(html, contentId.index);
        if (extracted)
            return extracted;
    }
    // 5. Fallback: <body>
    const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch)
        return bodyMatch[1];
    // 6. Nothing recognisable: hand back the input unchanged.
    return html;
}
|
|
121
|
+
/**
 * Return the inner HTML of the element whose opening tag contains the
 * character at `attrIndex`.
 *
 * The opening tag is located by scanning backwards from `attrIndex` to the
 * nearest `<`. Its name is read as a run of word characters, and the
 * matching closing tag is found by counting nested opening and closing tags
 * of that same name (case-insensitively).
 *
 * @param html Markup to read from.
 * @param attrIndex Index of any character inside the opening tag (typically the start of an attribute match).
 * @returns The markup between the opening tag and its balanced closing tag,
 *   or null when no tag name can be read, the opening tag has no `>`, or
 *   the element is never closed.
 */
function extractTagContent(html, attrIndex) {
    // Back up to the '<' that starts the tag holding the attribute.
    let openAt = attrIndex;
    while (openAt > 0 && html[openAt] !== '<') {
        openAt -= 1;
    }
    const nameMatch = /^<(\w+)/.exec(html.slice(openAt));
    if (nameMatch === null) {
        return null;
    }
    const name = nameMatch[1];
    // Content begins right after the first '>' following the tag start.
    const headEnd = html.indexOf('>', openAt);
    if (headEnd === -1) {
        return null;
    }
    const bodyStart = headEnd + 1;
    // Walk every opening/closing tag of this name in document order,
    // tracking nesting depth until the element that started it all closes.
    const boundary = new RegExp(`<${name}[\\s>]|</${name}>`, 'gi');
    boundary.lastIndex = bodyStart;
    let open = 1;
    for (let hit = boundary.exec(html); hit !== null; hit = boundary.exec(html)) {
        const isClosing = hit[0][1] === '/';
        if (!isClosing) {
            open += 1;
            continue;
        }
        open -= 1;
        if (open === 0) {
            return html.slice(bodyStart, hit.index);
        }
    }
    return null;
}
|
|
162
|
+
/**
 * Convert an HTML fragment to Markdown with a fixed sequence of regex
 * rewrites, collecting the links and images met along the way.
 *
 * The order of the steps matters: block constructs whose content must be
 * kept verbatim (headings, blockquotes, code) are rewritten first, images
 * and links are extracted while their attributes are still present, and
 * only then are inline styles, lists and paragraphs handled. Whatever tags
 * remain are stripped, entities are decoded, and blank lines are collapsed.
 *
 * Tag patterns for bold, italic and paragraphs require the tag name to be
 * followed by whitespace or `>`, so `<b>` does not also match `<br>`,
 * `<body>` or `<button>`, and `<p>` does not match `<picture>` or `<param>`.
 *
 * @param html HTML fragment to convert.
 * @param links Output array; a `{ text, href }` entry is pushed for every link that has both visible text and an href.
 * @param images Output array; an `{ alt, src }` entry is pushed for every <img> with a non-empty src.
 * @returns The Markdown text, trimmed, with at most one blank line between blocks.
 */
function htmlToMarkdown(html, links, images) {
    let md = html;
    // Remove HTML comments
    md = md.replace(/<!--[\s\S]*?-->/g, '');
    // Headings: <h1>..<h6> become '#'..'######' lines, inner markup dropped.
    for (let level = 1; level <= 6; level++) {
        const prefix = '#'.repeat(level);
        const re = new RegExp(`<h${level}[^>]*>([\\s\\S]*?)<\\/h${level}>`, 'gi');
        md = md.replace(re, (_m, content) => {
            const text = stripAllTags(content).trim();
            return `\n\n${prefix} ${text}\n\n`;
        });
    }
    // Blockquotes (before paragraphs, so nested <p> tags are flattened here).
    md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m, content) => {
        const text = stripAllTags(content).trim();
        const quoted = text.split('\n').map((l) => `> ${l}`).join('\n');
        return `\n\n${quoted}\n\n`;
    });
    // Fenced code blocks: <pre><code>...</code></pre>
    md = md.replace(/<pre[^>]*>\s*<code[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, content) => {
        const decoded = decodeEntities(content.trim());
        return `\n\n\`\`\`\n${decoded}\n\`\`\`\n\n`;
    });
    // Standalone <pre> (without <code>)
    md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_m, content) => {
        const decoded = decodeEntities(stripAllTags(content).trim());
        return `\n\n\`\`\`\n${decoded}\n\`\`\`\n\n`;
    });
    // Inline code
    md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (_m, content) => {
        return `\`${decodeEntities(content)}\``;
    });
    // Tables
    md = md.replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (_m, tableContent) => {
        return convertTable(tableContent);
    });
    // Images (before generic tag stripping, while src/alt are still readable).
    md = md.replace(/<img\s+[^>]*>/gi, (tag) => {
        const srcMatch = tag.match(/src=["']([^"']*)["']/i);
        const altMatch = tag.match(/alt=["']([^"']*)["']/i);
        const src = srcMatch ? decodeEntities(srcMatch[1]) : '';
        const alt = altMatch ? decodeEntities(altMatch[1]) : '';
        if (src) {
            images.push({ alt, src });
            // Emit the Markdown image so it survives in the output text too.
            return `![${alt}](${src})`;
        }
        // An <img> without a usable src contributes nothing.
        return '';
    });
    // Links
    md = md.replace(/<a\s+[^>]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi, (_m, href, content) => {
        const text = stripAllTags(content).trim();
        const decodedHref = decodeEntities(href);
        if (text && decodedHref) {
            links.push({ text, href: decodedHref });
            return `[${text}](${decodedHref})`;
        }
        // No text or no target: keep whatever text there is, unlinked.
        return text;
    });
    // Bold. `(?:\s[^>]*)?` is the tag-name boundary that keeps <br>,
    // <body>, <button> etc. from being read as <b>.
    md = md.replace(/<(?:strong|b)(?:\s[^>]*)?>([\s\S]*?)<\/(?:strong|b)>/gi, (_m, content) => {
        return `**${stripAllTags(content)}**`;
    });
    // Italic. The same boundary already excludes <img>/<iframe>, so no
    // restriction is placed on what the emphasised text may start with.
    md = md.replace(/<(?:em|i)(?:\s[^>]*)?>([\s\S]*?)<\/(?:em|i)>/gi, (_m, content) => {
        return `*${stripAllTags(content)}*`;
    });
    // Ordered lists: items are numbered from 1 within each list.
    md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (_m, listContent) => {
        let counter = 0;
        const items = listContent.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_lm, item) => {
            counter++;
            return `${counter}. ${stripAllTags(item).trim()}\n`;
        });
        return `\n\n${stripAllTags(items).trim()}\n\n`;
    });
    // Unordered lists
    md = md.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, (_m, listContent) => {
        const items = listContent.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_lm, item) => {
            return `- ${stripAllTags(item).trim()}\n`;
        });
        return `\n\n${stripAllTags(items).trim()}\n\n`;
    });
    // Paragraphs (boundary keeps <picture>, <param>, <progress> out).
    md = md.replace(/<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/gi, (_m, content) => {
        return `\n\n${content.trim()}\n\n`;
    });
    // Line breaks
    md = md.replace(/<br\s*\/?>/gi, '\n');
    // Strip remaining HTML tags
    md = stripAllTags(md);
    // Decode entities last, so decoded '<' / '>' are never mistaken for tags.
    md = decodeEntities(md);
    // Collapse whitespace: no more than 2 consecutive newlines
    md = md.replace(/\n{3,}/g, '\n\n');
    // Trim leading/trailing whitespace
    md = md.trim();
    return md;
}
|
|
261
|
+
/**
 * Delete everything that looks like a tag - any `<...>` run up to the next
 * `>` - and keep the text in between exactly as it was.
 *
 * @param html Markup or text.
 * @returns The input with every `<...>` span removed.
 */
function stripAllTags(html) {
    const anyTag = /<[^>]*>/;
    // Splitting on the tags and re-joining the pieces drops the tags.
    return html.split(anyTag).join('');
}
|
|
264
|
+
/**
 * Render the inner HTML of a <table> as a Markdown pipe table.
 *
 * Every <tr> becomes a row and every <td>/<th> inside it a cell (markup
 * stripped, whitespace trimmed). Rows without cells are dropped. The first
 * remaining row is used as the header, and shorter rows are padded with
 * empty cells up to the widest row.
 *
 * @param tableHtml Markup found between <table> and </table>.
 * @returns The Markdown table wrapped in blank lines, or '' when the table has no cells.
 */
function convertTable(tableHtml) {
    const rowPattern = /<tr[^>]*>[\s\S]*?<\/tr>/gi;
    const cellPattern = /<(?:td|th)[^>]*>[\s\S]*?<\/(?:td|th)>/gi;
    const grid = (tableHtml.match(rowPattern) ?? [])
        .map((rowHtml) => (rowHtml.match(cellPattern) ?? []).map((cellHtml) => {
            // Drop the cell's own wrapper tags, then any markup inside it.
            const inner = cellHtml.replace(/<\/?(?:td|th)[^>]*>/gi, '');
            return stripAllTags(inner).trim();
        }))
        .filter((cells) => cells.length > 0);
    if (grid.length === 0) {
        return '';
    }
    // Width of the widest row; narrower rows are right-padded to it.
    const width = grid.reduce((widest, cells) => Math.max(widest, cells.length), 0);
    const renderRow = (cells) => {
        const padded = cells.concat(Array(width - cells.length).fill(''));
        return '| ' + padded.join(' | ') + ' |';
    };
    const [head, ...body] = grid;
    const separator = '| ' + Array(width).fill('---').join(' | ') + ' |';
    const lines = [renderRow(head), separator, ...body.map(renderRow)];
    return '\n\n' + lines.join('\n') + '\n\n';
}
|
|
297
|
+
/**
 * Reduce markup to its plain text: tags removed, every run of whitespace
 * collapsed to a single space, and the ends trimmed.
 *
 * @param html Markup to flatten.
 * @returns The visible text on a single line.
 */
function getTextContent(html) {
    const withoutTags = stripAllTags(html);
    const singleSpaced = withoutTags.replace(/\s+/g, ' ');
    return singleSpaced.trim();
}
|
|
300
|
+
/**
 * Generic HTML-to-Markdown extraction for an arbitrary page.
 *
 * Noise elements are stripped, the most likely content container is
 * selected, and that container is converted to Markdown. The page is flagged
 * as an SPA shell when the selected container holds fewer than 200
 * characters of text and the raw HTML contains at least one SPA marker.
 *
 * @param html Full page markup.
 * @returns `{ content, links, images, isSpaShell }`: the Markdown text, the
 *   links and images collected during conversion, and the SPA-shell flag.
 */
export function extractContent(html) {
    const collected = { links: [], images: [] };
    // Noise removal comes first so page chrome cannot be chosen as the root.
    const root = findContentRoot(stripTags(html, NOISE_TAGS));
    const content = htmlToMarkdown(root, collected.links, collected.images);
    // SPA heuristic: almost no text in the root, plus a marker in the
    // original (unstripped) HTML - markers often sit inside <script> tags.
    const hasLittleText = getTextContent(root).length < 200;
    const isSpaShell = hasLittleText && SPA_MARKERS.some((marker) => html.includes(marker));
    return {
        content,
        links: collected.links,
        images: collected.images,
        isSpaShell,
    };
}
|
|
320
|
+
//# sourceMappingURL=extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../src/read/extract.ts"],"names":[],"mappings":"AAAA,sBAAsB;AAqBtB,iCAAiC;AAEjC,MAAM,UAAU,GAA2B;IACzC,OAAO,EAAE,GAAG;IACZ,MAAM,EAAE,GAAG;IACX,MAAM,EAAE,GAAG;IACX,QAAQ,EAAE,GAAG;IACb,OAAO,EAAE,GAAG;IACZ,QAAQ,EAAE,GAAG;IACb,QAAQ,EAAE,GAAG;CACd,CAAC;AAEF,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,OAAO,CAAC,qCAAqC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;AACxF,CAAC;AAED,sBAAsB;AAEtB,SAAS,kBAAkB,CAAC,IAAY,EAAE,QAAgB,EAAE,SAAiB;IAC3E,4EAA4E;IAC5E,mDAAmD;IACnD,MAAM,QAAQ,GAAG;QACf,IAAI,MAAM,CAAC,YAAY,QAAQ,QAAQ,WAAW,CAAC,SAAS,CAAC,kCAAkC,EAAE,GAAG,CAAC;QACrG,IAAI,MAAM,CAAC,wCAAwC,QAAQ,QAAQ,WAAW,CAAC,SAAS,CAAC,MAAM,EAAE,GAAG,CAAC;KACtG,CAAC;IACF,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC;YAAE,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,WAAW,CAAC,CAAS;IAC5B,OAAO,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,kBAAkB;IAClB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IAClE,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEvE,oBAAoB;IACpB,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,wEAAwE,CAAC;WACtG,IAAI,CAAC,KAAK,CAAC,wEAAwE,CAAC,CAAC;IAC1F,MAAM,SAAS,GAAG,cAAc,CAAC,CAAC,CAAC,cAAc,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAE5E,OAAO;QACL,KAAK;QACL,OAAO,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,UAAU,CAAC;QACzD,aAAa,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,gBAAgB,CAAC;QACrE,OAAO,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,UAAU,CAAC;QACzD,MAAM,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,SAAS,CAAC;QACvD,UAAU,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,cAAc,CAAC;QAChE,SAAS;QACT,MAAM,EAAE,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC;QAClD,aAAa,EAAE,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,wBAAwB,CAAC;KAC9E,CAAC;AACJ,CAAC;AAED,2BAA2B;AAE3B,uEAAuE;AACvE,MAAM,UAAU,GAAG,CAAC,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,KAAK
,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;AAExG,wBAAwB;AACxB,MAAM,WAAW,GAAG;IAClB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;IAClB,WAAW;IACX,SAAS;IACT,QAAQ;IACR,eAAe;IACf,0BAA0B;IAC1B,iBAAiB;CAClB,CAAC;AAEF,SAAS,SAAS,CAAC,IAAY,EAAE,IAAc;IAC7C,IAAI,MAAM,GAAG,IAAI,CAAC;IAClB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,uDAAuD;QACvD,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,uBAAuB,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAClE,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;QAChC,yDAAyD;QACzD,MAAM,SAAS,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,UAAU,EAAE,IAAI,CAAC,CAAC;QACtD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,eAAe,CAAC,IAAY;IACnC,kCAAkC;IAClC,MAAM,SAAS,GAA0B;QACvC,EAAE,EAAE,EAAE,sCAAsC,EAAE;QAC9C,EAAE,EAAE,EAAE,gCAAgC,EAAE;QACxC,EAAE,EAAE,EAAE,iDAAiD,EAAE;KAC1D,CAAC;IAEF,KAAK,MAAM,EAAE,EAAE,EAAE,IAAI,SAAS,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC;YAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;IAED,wBAAwB;IACxB,MAAM,aAAa,GAAG;QACpB,mCAAmC;QACnC,mCAAmC;QACnC,oCAAoC;KACrC,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC,EAAE,CAAC;YACN,iDAAiD;YACjD,MAAM,GAAG,GAAG,CAAC,CAAC,KAAM,CAAC;YACrB,MAAM,SAAS,GAAG,iBAAiB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAC/C,IAAI,SAAS;gBAAE,OAAO,SAAS,CAAC;QAClC,CAAC;IACH,CAAC;IAED,eAAe;IACf,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACpD,IAAI,SAAS,EAAE,CAAC;QACd,MAAM,SAAS,GAAG,iBAAiB,CAAC,IAAI,EAAE,SAAS,CAAC,KAAM,CAAC,CAAC;QAC5D,IAAI,SAAS;YAAE,OAAO,SAAS,CAAC;IAClC,CAAC;IAED,mBAAmB;IACnB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC/D,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC;IAEnC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,SAAiB;IACxD,kDAAkD;IAClD,IAAI,QAAQ,GAAG,SAAS,CAAC;IACzB,OAAO,QAAQ,GAAG,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,GAAG;QAAE,QAAQ,EAAE,CAAC;IAE1D,oBAAoB;IACpB,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAC3D,IAAI,CA
AC,YAAY;QAAE,OAAO,IAAI,CAAC;IAE/B,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;IAEhC,iDAAiD;IACjD,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,MAAM,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,OAAO,QAAQ,EAAE,IAAI,CAAC,CAAC;IACrD,MAAM,OAAO,GAAG,IAAI,MAAM,CAAC,KAAK,OAAO,GAAG,EAAE,IAAI,CAAC,CAAC;IAElD,+DAA+D;IAC/D,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAC/C,IAAI,UAAU,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAEnC,IAAI,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;IACzB,MAAM,YAAY,GAAG,GAAG,CAAC;IAEzB,OAAO,KAAK,GAAG,CAAC,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACtC,MAAM,CAAC,SAAS,GAAG,GAAG,CAAC;QACvB,OAAO,CAAC,SAAS,GAAG,GAAG,CAAC;QAExB,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,CAAC,SAAS;YAAE,MAAM,CAAC,qBAAqB;QAE5C,IAAI,QAAQ,IAAI,QAAQ,CAAC,KAAK,GAAG,SAAS,CAAC,KAAK,EAAE,CAAC;YACjD,KAAK,EAAE,CAAC;YACR,GAAG,GAAG,QAAQ,CAAC,KAAK,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC5C,CAAC;aAAM,CAAC;YACN,KAAK,EAAE,CAAC;YACR,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;gBAChB,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC;YACnD,CAAC;YACD,GAAG,GAAG,SAAS,CAAC,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,cAAc,CACrB,IAAY,EACZ,KAA4C,EAC5C,MAA2C;IAE3C,IAAI,EAAE,GAAG,IAAI,CAAC;IAEd,uBAAuB;IACvB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAExC,mBAAmB;IACnB,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,KAAK,EAAE,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACjC,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,KAAK,KAAK,0BAA0B,KAAK,GAAG,EAAE,IAAI,CAAC,CAAC;QAC1E,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;YAClC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;YAC1C,OAAO,OAAO,MAAM,IAAI,IAAI,MAAM,CAAC;QACrC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,0CAA0C;IAC1C,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,6CAA6C,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QAC7E,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,
CAAS,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxE,OAAO,OAAO,MAAM,MAAM,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,mDAAmD;IACnD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wDAAwD,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACxF,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,OAAO,eAAe,OAAO,cAAc,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,4CAA4C;IAC5C,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,+BAA+B,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QAC/D,MAAM,OAAO,GAAG,cAAc,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7D,OAAO,eAAe,OAAO,cAAc,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,sBAAsB;IACtB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iCAAiC,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACjE,OAAO,KAAK,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC;IAC1C,CAAC,CAAC,CAAC;IAEH,iBAAiB;IACjB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,mCAAmC,EAAE,CAAC,EAAE,EAAE,YAAY,EAAE,EAAE;QACxE,OAAO,YAAY,CAAC,YAAY,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;IAEH,oEAAoE;IACpE,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iBAAiB,EAAE,CAAC,GAAG,EAAE,EAAE;QACzC,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACpD,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACpD,MAAM,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxD,MAAM,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxD,IAAI,GAAG,EAAE,CAAC;YACR,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC;YAC1B,OAAO,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;QAC7B,CAAC;QACD,OAAO,EAAE,CAAC;IACZ,CAAC,CAAC,CAAC;IAEH,gBAAgB;IAChB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,wDAAwD,EAAE,CAAC,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE;QAC9F,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,WAAW,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACzC,IAAI,IAAI,IAAI,WAAW,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;YACxC,OAAO,IAAI,IAAI,KAAK,WAAW,GAAG,CAAC;QACrC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;IAEH,eAAe;IACf,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iDAAiD,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACjF,OAAO,KAAK,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,iBAAiB;IACjB
,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,sDAAsD,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QACtF,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,GAAG,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,wBAAwB;IACxB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,EAAE,EAAE,WAAW,EAAE,EAAE;QACjE,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,MAAM,KAAK,GAAG,WAAW,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,GAAW,EAAE,IAAY,EAAE,EAAE;YAC7F,OAAO,EAAE,CAAC;YACV,OAAO,GAAG,OAAO,KAAK,YAAY,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC;QACtD,CAAC,CAAC,CAAC;QACH,OAAO,OAAO,YAAY,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC;IACjD,CAAC,CAAC,CAAC;IAEH,0BAA0B;IAC1B,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,EAAE,EAAE,WAAW,EAAE,EAAE;QACjE,MAAM,KAAK,GAAG,WAAW,CAAC,OAAO,CAAC,6BAA6B,EAAE,CAAC,GAAW,EAAE,IAAY,EAAE,EAAE;YAC7F,OAAO,KAAK,YAAY,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC;QAC5C,CAAC,CAAC,CAAC;QACH,OAAO,OAAO,YAAY,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC;IACjD,CAAC,CAAC,CAAC;IAEH,qBAAqB;IACrB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,2BAA2B,EAAE,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;QAC3D,OAAO,OAAO,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,oBAAoB;IACpB,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;IAEtC,4BAA4B;IAC5B,EAAE,GAAG,YAAY,CAAC,EAAE,CAAC,CAAC;IAEtB,kBAAkB;IAClB,EAAE,GAAG,cAAc,CAAC,EAAE,CAAC,CAAC;IAExB,2DAA2D;IAC3D,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEnC,mCAAmC;IACnC,EAAE,GAAG,EAAE,CAAC,IAAI,EAAE,CAAC;IAEf,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,YAAY,CAAC,IAAY;IAChC,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,YAAY,CAAC,SAAiB;IACrC,MAAM,IAAI,GAAe,EAAE,CAAC;IAE5B,eAAe;IACf,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,2BAA2B,CAAC,IAAI,EAAE,CAAC;IACtE,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,GAAG,CAAC,KAAK,CAAC,yCAAyC,CAAC,IAAI,EAAE,CAAC;QAC/E,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAC;YAC1D,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC3C,CAAC;QACD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAA
C;IACzC,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,yBAAyB;IACzB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;IACvD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAChC,OAAO,CAAC,CAAC,MAAM,GAAG,OAAO;YAAE,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACtC,OAAO,CAAC,CAAC;IACX,CAAC,CAAC,CAAC;IAEH,uBAAuB;IACvB,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC7B,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;IAC7C,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;IAE9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;IACtD,CAAC;IAED,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;AAC5C,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,YAAY,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACxD,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,KAAK,GAA0C,EAAE,CAAC;IACxD,MAAM,MAAM,GAAwC,EAAE,CAAC;IAEvD,yBAAyB;IACzB,MAAM,OAAO,GAAG,SAAS,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IAE5C,oBAAoB;IACpB,MAAM,WAAW,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAE7C,sBAAsB;IACtB,MAAM,OAAO,GAAG,cAAc,CAAC,WAAW,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC;IAE3D,mBAAmB;IACnB,MAAM,WAAW,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;IAChD,MAAM,YAAY,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC;IACzE,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC;IAE5D,OAAO;QACL,OAAO;QACP,KAAK;QACL,MAAM;QACN,UAAU;KACX,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export { peek } from './peek.js';
|
|
2
|
+
export type { PeekOptions } from './peek.js';
|
|
3
|
+
export type { PeekResult, ReadResult, Decoder } from './types.js';
|
|
4
|
+
import type { ReadResult } from './types.js';
|
|
5
|
+
/**
 * Options accepted by `read`.
 */
export interface ReadOptions {
    /**
     * Forwarded unchanged as `skipSsrf` to the site-specific decoder and to the
     * generic fetch. Presumably disables the SSRF guard of the fetch helper —
     * confirm against `safeFetch`.
     */
    skipSsrf?: boolean;
    /**
     * Upper bound on the returned `content`. Content longer than this is cut
     * with `String.prototype.slice`, so the limit is counted in string length
     * units rather than encoded bytes. A missing or zero value disables
     * truncation.
     */
    maxBytes?: number;
}
|
|
9
|
+
/**
 * Universal content reader.
 *
 * A site-specific decoder is tried first for known sites (Reddit, YouTube,
 * Wikipedia, HN); when none matches, or the decoder yields nothing, the page
 * is fetched and run through generic HTML extraction.
 *
 * @param url - Address of the page to read.
 * @param options - Optional SSRF and size-limit settings.
 * @returns The extracted content, or `null` if content cannot be extracted.
 */
export declare function read(url: string, options?: ReadOptions): Promise<ReadResult | null>;
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
// src/read/index.ts
|
|
2
|
+
export { peek } from './peek.js';
|
|
3
|
+
import { safeFetch } from '../discovery/fetch.js';
|
|
4
|
+
import { findDecoder } from './decoders/index.js';
|
|
5
|
+
import { parseHead, extractContent } from './extract.js';
|
|
6
|
+
/**
 * Universal content reader.
 *
 * Strategy:
 *   1. Ask `findDecoder` for a site-specific decoder (the original notes list
 *      Reddit, YouTube, Wikipedia and HN) and return its result when it
 *      produces one.
 *   2. Otherwise fetch the page, parse the `<head>` metadata and extract the
 *      body content generically.
 *
 * `options.maxBytes`, when truthy, caps `content` via `slice`, and the token
 * estimate (`ceil(length / 4)`) is computed from the capped content.
 *
 * @param url Address of the page to read.
 * @param options Optional `{ skipSsrf, maxBytes }`.
 * @returns The read result, or `null` when the generic fetch fails or does
 *          not answer with status 200.
 */
export async function read(url, options = {}) {
    // True when a limit is configured and `text` is longer than it.
    const overLimit = (text) => Boolean(options.maxBytes) && text.length > options.maxBytes;
    const estimateTokens = (text) => Math.ceil(text.length / 4);

    // --- Site-specific path -------------------------------------------------
    const decoder = findDecoder(url);
    const decoded = decoder
        ? await decoder.decode(url, { skipSsrf: options.skipSsrf })
        : null;
    if (decoded) {
        if (overLimit(decoded.content)) {
            // The decoder's own result object is trimmed in place.
            decoded.content = decoded.content.slice(0, options.maxBytes);
            decoded.cost.tokens = estimateTokens(decoded.content);
        }
        return decoded;
    }
    // No decoder, or the decoder gave up: continue with the generic pipeline.

    // --- Generic path: fetch HTML -> parse head -> extract body --------------
    const response = await safeFetch(url, { skipSsrf: options.skipSsrf });
    const fetchedOk = Boolean(response) && response.status === 200;
    if (!fetchedOk) {
        return null;
    }
    const head = parseHead(response.body);
    const body = extractContent(response.body);

    // Label describing where the content came from.
    const pickSource = () => {
        if (body.isSpaShell) {
            return 'spa-shell';
        }
        return body.content.trim().length === 0 ? 'og-tags-only' : 'readability';
    };
    const source = pickSource();

    const content = overLimit(body.content)
        ? body.content.slice(0, options.maxBytes)
        : body.content;

    return {
        url,
        title: head.ogTitle || head.title || null,
        author: head.author || null,
        description: head.ogDescription || null,
        content,
        links: body.links,
        images: body.images,
        metadata: {
            type: head.ogType || 'unknown',
            publishedAt: head.publishedTime || null,
            source,
            canonical: head.canonical || null,
            siteName: head.ogSiteName || null,
        },
        cost: { tokens: estimateTokens(content) },
    };
}
|
|
66
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/read/index.ts"],"names":[],"mappings":"AAAA,oBAAoB;AACpB,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAKjC,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAOzD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,IAAI,CAAC,GAAW,EAAE,UAAuB,EAAE;IAC/D,kCAAkC;IAClC,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IACjC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,MAAM,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACzE,IAAI,MAAM,EAAE,CAAC;YACX,IAAI,OAAO,CAAC,QAAQ,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;gBACjE,MAAM,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;gBAC3D,MAAM,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAC5D,CAAC;YACD,OAAO,MAAM,CAAC;QAChB,CAAC;QACD,mDAAmD;IACrD,CAAC;IAED,6DAA6D;IAC7D,MAAM,WAAW,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;IACzE,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,GAAG;QAAE,OAAO,IAAI,CAAC;IAE5D,MAAM,IAAI,GAAG,WAAW,CAAC,IAAI,CAAC;IAC9B,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,mBAAmB;IACnB,IAAI,MAAc,CAAC;IACnB,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,MAAM,GAAG,WAAW,CAAC;IACvB,CAAC;SAAM,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,cAAc,CAAC;IAC1B,CAAC;SAAM,CAAC;QACN,MAAM,GAAG,aAAa,CAAC;IACzB,CAAC;IAED,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;IAC3B,IAAI,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC1D,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC/C,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC;IAEjD,OAAO;QACL,GAAG;QACH,KAAK;QACL,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,IAAI;QAC3B,WAAW,EAAE,IAAI,CAAC,aAAa,IAAI,IAAI;QACvC,OAAO;QACP,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,QAAQ,EAAE;YACR,IAAI,EAAE,IAAI,CAAC,MAAM,IAAI,SAAS;YA
C9B,WAAW,EAAE,IAAI,CAAC,aAAa,IAAI,IAAI;YACvC,MAAM;YACN,SAAS,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;YACjC,QAAQ,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;SAClC;QACD,IAAI,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE;KAChD,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { PeekResult } from './types.js';
|
|
2
|
+
/**
 * Options accepted by `peek`.
 */
export interface PeekOptions {
    /**
     * Forwarded unchanged as `skipSsrf` to both the HEAD request and the GET
     * fallback. Presumably disables the SSRF guard of the fetch helper —
     * confirm against `safeFetch`.
     */
    skipSsrf?: boolean;
}
|
|
5
|
+
/**
 * Lightweight triage of a URL based on response status and headers only:
 * reports accessibility and detects bot protection and server frameworks.
 *
 * A HEAD request is issued first; GET is used as a fallback when HEAD fails.
 *
 * @param url - Address to probe.
 * @param options - Optional SSRF setting.
 * @returns A `PeekResult` describing what was observed.
 */
export declare function peek(url: string, options?: PeekOptions): Promise<PeekResult>;
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { safeFetch } from '../discovery/fetch.js';
|
|
2
|
+
/**
 * Lightweight triage of a URL based on response status and headers only:
 * reports accessibility and detects bot protection and server frameworks.
 *
 * A HEAD request is issued first. GET is attempted only when HEAD yields no
 * result at all (a falsy return from `safeFetch`, i.e. network/SSRF error);
 * a HEAD response with an error status is used as-is.
 *
 * @param url Address to probe.
 * @param options Optional `{ skipSsrf }`.
 * @returns The triage result; when both requests fail, a result with
 *          `status: 0`, `signals: ['fetch failed']` and
 *          `recommendation: 'blocked'`.
 */
export async function peek(url, options = {}) {
    const request = (method) => safeFetch(url, {
        method,
        skipSsrf: options.skipSsrf,
    });

    // HEAD first; the GET request is only sent when HEAD produced nothing.
    const response = (await request('HEAD')) || (await request('GET'));

    if (!response) {
        // Neither HEAD nor GET produced a response.
        return {
            url,
            status: 0,
            accessible: false,
            contentType: null,
            server: null,
            framework: null,
            botProtection: null,
            signals: ['fetch failed'],
            recommendation: 'blocked',
        };
    }

    const { status, headers } = response;
    // Both detectors append a human-readable reason to `signals` on a match.
    const signals = [];
    const botProtection = detectBotProtection(headers, signals);
    const framework = detectFramework(headers, signals);

    return {
        url,
        status,
        // Accessible means a 2xx/3xx status with no bot protection detected.
        accessible: status >= 200 && status < 400 && !botProtection,
        contentType: headers['content-type'] || null,
        server: headers['server'] || null,
        framework,
        botProtection,
        signals,
        recommendation: computeRecommendation(status, botProtection),
    };
}
|
|
57
|
+
function detectBotProtection(headers, signals) {
|
|
58
|
+
// Cloudflare: cf-ray or cf-cache-status
|
|
59
|
+
if (headers['cf-ray']) {
|
|
60
|
+
signals.push('cf-ray header');
|
|
61
|
+
return 'cloudflare';
|
|
62
|
+
}
|
|
63
|
+
if (headers['cf-cache-status']) {
|
|
64
|
+
signals.push('cf-cache-status header');
|
|
65
|
+
return 'cloudflare';
|
|
66
|
+
}
|
|
67
|
+
// PerimeterX: x-px-* headers
|
|
68
|
+
for (const key of Object.keys(headers)) {
|
|
69
|
+
if (key.startsWith('x-px-')) {
|
|
70
|
+
signals.push(`${key} header`);
|
|
71
|
+
return 'perimeterx';
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
// DataDome: x-datadome* headers
|
|
75
|
+
for (const key of Object.keys(headers)) {
|
|
76
|
+
if (key.startsWith('x-datadome')) {
|
|
77
|
+
signals.push(`${key} header`);
|
|
78
|
+
return 'datadome';
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
return null;
|
|
82
|
+
}
|
|
83
|
+
function detectFramework(headers, signals) {
|
|
84
|
+
// Next.js: x-powered-by: Next.js
|
|
85
|
+
const poweredBy = headers['x-powered-by'];
|
|
86
|
+
if (poweredBy && /next\.js/i.test(poweredBy)) {
|
|
87
|
+
signals.push('x-powered-by: Next.js');
|
|
88
|
+
return 'next.js';
|
|
89
|
+
}
|
|
90
|
+
// Express: x-powered-by: Express
|
|
91
|
+
if (poweredBy && /express/i.test(poweredBy)) {
|
|
92
|
+
signals.push('x-powered-by: Express');
|
|
93
|
+
return 'express';
|
|
94
|
+
}
|
|
95
|
+
// PHP: x-powered-by: PHP/*
|
|
96
|
+
if (poweredBy && /php/i.test(poweredBy)) {
|
|
97
|
+
signals.push('x-powered-by: PHP');
|
|
98
|
+
return 'php';
|
|
99
|
+
}
|
|
100
|
+
// WordPress: link header containing api.w.org
|
|
101
|
+
const link = headers['link'];
|
|
102
|
+
if (link && link.includes('api.w.org')) {
|
|
103
|
+
signals.push('link: api.w.org');
|
|
104
|
+
return 'wordpress';
|
|
105
|
+
}
|
|
106
|
+
// Shopify: x-shopify-stage header
|
|
107
|
+
if (headers['x-shopify-stage']) {
|
|
108
|
+
signals.push('x-shopify-stage header');
|
|
109
|
+
return 'shopify';
|
|
110
|
+
}
|
|
111
|
+
// Drupal: x-drupal-* headers
|
|
112
|
+
for (const key of Object.keys(headers)) {
|
|
113
|
+
if (key.startsWith('x-drupal-')) {
|
|
114
|
+
signals.push(`${key} header`);
|
|
115
|
+
return 'drupal';
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
return null;
|
|
119
|
+
}
|
|
120
|
+
/**
 * Maps a response status and bot-protection finding to a next-step hint.
 *
 * - 401 / 407                                  -> 'auth_required'
 * - bot protection detected, 403, 429 or >=500 -> 'blocked'
 * - everything else                            -> 'read'
 *
 * The auth check runs first, so a 401 behind bot protection still yields
 * 'auth_required'.
 *
 * NOTE(review): other 4xx statuses (e.g. 404) fall through to 'read' —
 * confirm this is intended.
 *
 * @param status HTTP status code of the response.
 * @param botProtection Detected vendor name, or a falsy value when none.
 * @returns 'auth_required', 'blocked' or 'read'.
 */
function computeRecommendation(status, botProtection) {
    const needsAuth = status === 401 || status === 407;
    if (needsAuth) {
        return 'auth_required';
    }
    const isBlocked = Boolean(botProtection)
        || status === 403
        || status === 429
        || status >= 500;
    return isBlocked ? 'blocked' : 'read';
}
|
|
137
|
+
//# sourceMappingURL=peek.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"peek.js","sourceRoot":"","sources":["../../src/read/peek.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAMlD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,IAAI,CAAC,GAAW,EAAE,UAAuB,EAAE;IAC/D,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,iBAAiB;IACjB,IAAI,MAAM,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE;QAChC,MAAM,EAAE,MAAM;QACd,QAAQ,EAAE,OAAO,CAAC,QAAQ;KAC3B,CAAC,CAAC;IAEH,6DAA6D;IAC7D,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE;YAC5B,MAAM,EAAE,KAAK;YACb,QAAQ,EAAE,OAAO,CAAC,QAAQ;SAC3B,CAAC,CAAC;IACL,CAAC;IAED,2BAA2B;IAC3B,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO;YACL,GAAG;YACH,MAAM,EAAE,CAAC;YACT,UAAU,EAAE,KAAK;YACjB,WAAW,EAAE,IAAI;YACjB,MAAM,EAAE,IAAI;YACZ,SAAS,EAAE,IAAI;YACf,aAAa,EAAE,IAAI;YACnB,OAAO,EAAE,CAAC,cAAc,CAAC;YACzB,cAAc,EAAE,SAAS;SAC1B,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC;IAEnC,yBAAyB;IACzB,MAAM,WAAW,GAAG,OAAO,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;IAEzC,wBAAwB;IACxB,MAAM,aAAa,GAAG,mBAAmB,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAE5D,mBAAmB;IACnB,MAAM,SAAS,GAAG,eAAe,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEpD,6CAA6C;IAC7C,MAAM,UAAU,GAAG,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;IACnE,MAAM,cAAc,GAAG,qBAAqB,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IAEpE,OAAO;QACL,GAAG;QACH,MAAM;QACN,UAAU;QACV,WAAW;QACX,MAAM;QACN,SAAS;QACT,aAAa;QACb,OAAO;QACP,cAAc;KACf,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB,CAC1B,OAA+B,EAC/B,OAAiB;IAEjB,wCAAwC;IACxC,IAAI,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QACtB,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC9B,OAAO,YAAY,CAAC;IACtB,CAAC;IACD,IAAI,OAAO,CAAC,iBAAiB,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACvC,OAAO,YAAY,CAAC;IACtB,CAAC;IAED,6BAA6B;IAC7B,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACvC,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;YAC9B,OAAO,YAAY,CAAC;QACtB,CAAC;IACH,CAAC;IAED,gCAAgC;IAChC,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACvC,IAAI,GAAG,CAAC,UAAU,CAAC,YAAY,CAAC
,EAAE,CAAC;YACjC,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;YAC9B,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CACtB,OAA+B,EAC/B,OAAiB;IAEjB,iCAAiC;IACjC,MAAM,SAAS,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAC1C,IAAI,SAAS,IAAI,WAAW,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QAC7C,OAAO,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;QACtC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,iCAAiC;IACjC,IAAI,SAAS,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5C,OAAO,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;QACtC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,2BAA2B;IAC3B,IAAI,SAAS,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAClC,OAAO,KAAK,CAAC;IACf,CAAC;IAED,8CAA8C;IAC9C,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAC7B,IAAI,IAAI,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAChC,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,kCAAkC;IAClC,IAAI,OAAO,CAAC,iBAAiB,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACvC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,6BAA6B;IAC7B,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACvC,IAAI,GAAG,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,SAAS,CAAC,CAAC;YAC9B,OAAO,QAAQ,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,qBAAqB,CAC5B,MAAc,EACd,aAA4B;IAE5B,gBAAgB;IAChB,IAAI,MAAM,KAAK,GAAG,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;QACrC,OAAO,eAAe,CAAC;IACzB,CAAC;IAED,4CAA4C;IAC5C,IAAI,aAAa,EAAE,CAAC;QAClB,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,MAAM,KAAK,GAAG,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;QACrC,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;QAClB,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
 * Outcome of a `peek` probe.
 */
export interface PeekResult {
    /** The URL that was probed, as passed in. */
    url: string;
    /** HTTP status of the response; `0` when no response was obtained. */
    status: number;
    /** `true` for a 2xx/3xx status with no bot protection detected. */
    accessible: boolean;
    /** Value of the `content-type` response header, or `null` if absent. */
    contentType: string | null;
    /** Value of the `server` response header, or `null` if absent. */
    server: string | null;
    /** Detected framework/platform identifier, or `null` when none matched. */
    framework: string | null;
    /** Detected bot-protection vendor identifier, or `null` when none matched. */
    botProtection: string | null;
    /** Human-readable reasons behind the detections (e.g. which header matched). */
    signals: string[];
    /** Suggested next step for the caller. */
    recommendation: 'read' | 'capture' | 'auth_required' | 'blocked';
}
|
|
12
|
+
/**
 * Content extracted from a page by `read` or by a site-specific decoder.
 */
export interface ReadResult {
    /** The URL that was read. */
    url: string;
    /** Page title, or `null` when none was found. */
    title: string | null;
    /** Author, or `null` when none was found. */
    author: string | null;
    /** Short description, or `null` when none was found. */
    description: string | null;
    /** Extracted main content; may have been truncated by `maxBytes`. */
    content: string;
    /** Links found in the content. */
    links: { text: string; href: string }[];
    /** Images found in the content. */
    images: { alt: string; src: string }[];
    /** Descriptive metadata about the page and the extraction. */
    metadata: {
        /** Content type label; the generic reader uses `'unknown'` as fallback. */
        type: string;
        /** Publication timestamp, or `null` when unavailable. */
        publishedAt: string | null;
        /**
         * How the content was obtained. The generic reader emits `'spa-shell'`,
         * `'og-tags-only'` or `'readability'`; decoders may use other values
         * (not visible here — verify against the decoders).
         */
        source: string;
        /** Canonical URL, or `null` when unavailable. */
        canonical: string | null;
        /** Site name, or `null` when unavailable. */
        siteName: string | null;
    };
    /** Rough size estimate of the result. */
    cost: {
        /** Estimated token count; `read` computes `ceil(content.length / 4)`. */
        tokens: number;
    };
}
|
|
37
|
+
/**
 * A site-specific content decoder that `read` tries before falling back to
 * generic HTML extraction.
 */
export interface Decoder {
    /** Identifier of the decoder. */
    name: string;
    /**
     * URL patterns associated with this decoder. Presumably used by
     * `findDecoder` to select the decoder for a URL — confirm against the
     * decoder registry.
     */
    patterns: RegExp[];
    /**
     * Produces a result for `url`.
     *
     * @param url - Address of the page to decode.
     * @param options - `read` passes `{ skipSsrf }`; extra keys are permitted.
     * @returns The decoded content, or `null`, in which case `read` continues
     *          with the generic pipeline.
     */
    decode(
        url: string,
        options?: {
            skipSsrf?: boolean;
            [key: string]: any;
        },
    ): Promise<ReadResult | null>;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/read/types.ts"],"names":[],"mappings":"AAAA,oBAAoB"}
|