@debriefer/sources 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
- package/dist/__tests__/archives/chronicling-america.test.js +151 -0
- package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
- package/dist/__tests__/archives/europeana.test.d.ts +8 -0
- package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
- package/dist/__tests__/archives/europeana.test.js +200 -0
- package/dist/__tests__/archives/europeana.test.js.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.js +189 -0
- package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
- package/dist/__tests__/archives/trove.test.d.ts +8 -0
- package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
- package/dist/__tests__/archives/trove.test.js +202 -0
- package/dist/__tests__/archives/trove.test.js.map +1 -0
- package/dist/__tests__/books/google-books.test.d.ts +8 -0
- package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
- package/dist/__tests__/books/google-books.test.js +221 -0
- package/dist/__tests__/books/google-books.test.js.map +1 -0
- package/dist/__tests__/books/open-library.test.d.ts +8 -0
- package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
- package/dist/__tests__/books/open-library.test.js +159 -0
- package/dist/__tests__/books/open-library.test.js.map +1 -0
- package/dist/__tests__/news/guardian.test.d.ts +9 -0
- package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
- package/dist/__tests__/news/guardian.test.js +224 -0
- package/dist/__tests__/news/guardian.test.js.map +1 -0
- package/dist/__tests__/news/nytimes.test.d.ts +9 -0
- package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
- package/dist/__tests__/news/nytimes.test.js +271 -0
- package/dist/__tests__/news/nytimes.test.js.map +1 -0
- package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
- package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
- package/dist/__tests__/news/site-search-source.test.js +342 -0
- package/dist/__tests__/news/site-search-source.test.js.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
- package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.js +281 -0
- package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
- package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/html-utils.test.js +169 -0
- package/dist/__tests__/shared/html-utils.test.js.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.js +107 -0
- package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.js +77 -0
- package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
- package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/search-utils.test.js +26 -0
- package/dist/__tests__/shared/search-utils.test.js.map +1 -0
- package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
- package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikidata.test.js +509 -0
- package/dist/__tests__/structured/wikidata.test.js.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.js +643 -0
- package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
- package/dist/__tests__/web-search/base.test.d.ts +9 -0
- package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/base.test.js +622 -0
- package/dist/__tests__/web-search/base.test.js.map +1 -0
- package/dist/__tests__/web-search/bing.test.d.ts +10 -0
- package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/bing.test.js +277 -0
- package/dist/__tests__/web-search/bing.test.js.map +1 -0
- package/dist/__tests__/web-search/brave.test.d.ts +10 -0
- package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/brave.test.js +264 -0
- package/dist/__tests__/web-search/brave.test.js.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
- package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
- package/dist/__tests__/web-search/google.test.d.ts +9 -0
- package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/google.test.js +189 -0
- package/dist/__tests__/web-search/google.test.js.map +1 -0
- package/dist/archives/chronicling-america.d.ts +33 -0
- package/dist/archives/chronicling-america.d.ts.map +1 -0
- package/dist/archives/chronicling-america.js +85 -0
- package/dist/archives/chronicling-america.js.map +1 -0
- package/dist/archives/europeana.d.ts +37 -0
- package/dist/archives/europeana.d.ts.map +1 -0
- package/dist/archives/europeana.js +92 -0
- package/dist/archives/europeana.js.map +1 -0
- package/dist/archives/internet-archive.d.ts +32 -0
- package/dist/archives/internet-archive.d.ts.map +1 -0
- package/dist/archives/internet-archive.js +90 -0
- package/dist/archives/internet-archive.js.map +1 -0
- package/dist/archives/trove.d.ts +37 -0
- package/dist/archives/trove.d.ts.map +1 -0
- package/dist/archives/trove.js +97 -0
- package/dist/archives/trove.js.map +1 -0
- package/dist/books/google-books.d.ts +48 -0
- package/dist/books/google-books.d.ts.map +1 -0
- package/dist/books/google-books.js +111 -0
- package/dist/books/google-books.js.map +1 -0
- package/dist/books/open-library.d.ts +44 -0
- package/dist/books/open-library.d.ts.map +1 -0
- package/dist/books/open-library.js +103 -0
- package/dist/books/open-library.js.map +1 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/index.js.map +1 -0
- package/dist/news/guardian.d.ts +51 -0
- package/dist/news/guardian.d.ts.map +1 -0
- package/dist/news/guardian.js +131 -0
- package/dist/news/guardian.js.map +1 -0
- package/dist/news/nytimes.d.ts +27 -0
- package/dist/news/nytimes.d.ts.map +1 -0
- package/dist/news/nytimes.js +104 -0
- package/dist/news/nytimes.js.map +1 -0
- package/dist/news/site-search-source.d.ts +89 -0
- package/dist/news/site-search-source.d.ts.map +1 -0
- package/dist/news/site-search-source.js +182 -0
- package/dist/news/site-search-source.js.map +1 -0
- package/dist/news/sources.d.ts +52 -0
- package/dist/news/sources.d.ts.map +1 -0
- package/dist/news/sources.js +276 -0
- package/dist/news/sources.js.map +1 -0
- package/dist/obituary/find-a-grave.d.ts +43 -0
- package/dist/obituary/find-a-grave.d.ts.map +1 -0
- package/dist/obituary/find-a-grave.js +173 -0
- package/dist/obituary/find-a-grave.js.map +1 -0
- package/dist/shared/duckduckgo-search.d.ts +86 -0
- package/dist/shared/duckduckgo-search.d.ts.map +1 -0
- package/dist/shared/duckduckgo-search.js +218 -0
- package/dist/shared/duckduckgo-search.js.map +1 -0
- package/dist/shared/fetch-page.d.ts +50 -0
- package/dist/shared/fetch-page.d.ts.map +1 -0
- package/dist/shared/fetch-page.js +212 -0
- package/dist/shared/fetch-page.js.map +1 -0
- package/dist/shared/html-utils.d.ts +99 -0
- package/dist/shared/html-utils.d.ts.map +1 -0
- package/dist/shared/html-utils.js +246 -0
- package/dist/shared/html-utils.js.map +1 -0
- package/dist/shared/readability-extract.d.ts +33 -0
- package/dist/shared/readability-extract.d.ts.map +1 -0
- package/dist/shared/readability-extract.js +45 -0
- package/dist/shared/readability-extract.js.map +1 -0
- package/dist/shared/sanitize-text.d.ts +24 -0
- package/dist/shared/sanitize-text.d.ts.map +1 -0
- package/dist/shared/sanitize-text.js +49 -0
- package/dist/shared/sanitize-text.js.map +1 -0
- package/dist/shared/search-utils.d.ts +18 -0
- package/dist/shared/search-utils.d.ts.map +1 -0
- package/dist/shared/search-utils.js +20 -0
- package/dist/shared/search-utils.js.map +1 -0
- package/dist/structured/wikidata.d.ts +128 -0
- package/dist/structured/wikidata.d.ts.map +1 -0
- package/dist/structured/wikidata.js +361 -0
- package/dist/structured/wikidata.js.map +1 -0
- package/dist/structured/wikipedia.d.ts +184 -0
- package/dist/structured/wikipedia.d.ts.map +1 -0
- package/dist/structured/wikipedia.js +275 -0
- package/dist/structured/wikipedia.js.map +1 -0
- package/dist/web-search/base.d.ts +128 -0
- package/dist/web-search/base.d.ts.map +1 -0
- package/dist/web-search/base.js +251 -0
- package/dist/web-search/base.js.map +1 -0
- package/dist/web-search/bing.d.ts +21 -0
- package/dist/web-search/bing.d.ts.map +1 -0
- package/dist/web-search/bing.js +53 -0
- package/dist/web-search/bing.js.map +1 -0
- package/dist/web-search/brave.d.ts +21 -0
- package/dist/web-search/brave.d.ts.map +1 -0
- package/dist/web-search/brave.js +56 -0
- package/dist/web-search/brave.js.map +1 -0
- package/dist/web-search/duckduckgo.d.ts +15 -0
- package/dist/web-search/duckduckgo.d.ts.map +1 -0
- package/dist/web-search/duckduckgo.js +21 -0
- package/dist/web-search/duckduckgo.js.map +1 -0
- package/dist/web-search/google.d.ts +24 -0
- package/dist/web-search/google.d.ts.map +1 -0
- package/dist/web-search/google.js +48 -0
- package/dist/web-search/google.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Page fetching utility with browser-like headers and archive.org fallback.
|
|
3
|
+
*
|
|
4
|
+
* Provides a resilient page fetching pipeline:
|
|
5
|
+
* 1. Direct fetch with browser-like headers (Chrome UA, Accept text/html)
|
|
6
|
+
* 2. Block detection (hard HTTP blocks + soft body pattern matching)
|
|
7
|
+
* 3. Automatic archive.org fallback when blocked or on network error
|
|
8
|
+
* 4. Non-blocking HTTP errors (404, 500) return immediately without fallback
|
|
9
|
+
*
|
|
10
|
+
* Used by WebSearchBase when following links from search results.
|
|
11
|
+
*/
|
|
12
|
+
/** User-Agent header value used when the caller does not supply one (desktop Chrome on Windows). */
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

/** Overall time budget in milliseconds; one signal covers both the direct and the archive attempt. */
const DEFAULT_TIMEOUT_MS = 15_000;

/** Response statuses treated as a hard block, which makes the page eligible for archive fallback. */
const HARD_BLOCK_STATUSES = new Set([401, 403, 429, 451]);

/** Bodies longer than this many characters skip soft-block detection and are taken to be real content. */
const SOFT_BLOCK_MAX_SIZE = 50_000;

/** Lowercase substrings that mark a (lowercased) response body as a soft block. */
const SOFT_BLOCK_PATTERNS = [
    // Challenge / verification wording
    "captcha",
    "please verify you are human",
    "access denied",
    "bot detection",
    "unusual traffic",
    "automated access",
    // Anti-bot interstitials
    "cloudflare",
    "ddos protection",
    "just a moment",
    // Captcha widgets
    "recaptcha",
    "hcaptcha",
];
|
|
34
|
+
/**
 * Produce the AbortSignal that governs a fetch.
 *
 * A timeout signal is always created (falling back to DEFAULT_TIMEOUT_MS when
 * `timeoutMs` is null or undefined). When the caller also passes a signal, the
 * two are merged with `AbortSignal.any()` so that whichever fires first aborts
 * the request.
 *
 * @param callerSignal - Optional signal supplied by the caller
 * @param timeoutMs - Optional timeout in milliseconds
 * @returns The timeout signal, or the combination of caller and timeout signals
 */
function buildSignal(callerSignal, timeoutMs) {
    const deadline = AbortSignal.timeout(timeoutMs ?? DEFAULT_TIMEOUT_MS);
    return callerSignal ? AbortSignal.any([callerSignal, deadline]) : deadline;
}
|
|
47
|
+
/**
 * Assemble the request headers sent with every page fetch.
 *
 * @param userAgent - Value for the User-Agent header
 * @returns Header map with User-Agent, Accept, Accept-Language and Cache-Control
 */
function buildHeaders(userAgent) {
    const headers = { "User-Agent": userAgent };
    headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    headers["Accept-Language"] = "en-US,en;q=0.9";
    headers["Cache-Control"] = "no-cache";
    return headers;
}
|
|
56
|
+
/**
 * Decide whether a response body looks like a soft block page
 * (captcha, bot-detection interstitial, and similar).
 *
 * Bodies longer than SOFT_BLOCK_MAX_SIZE are never flagged: a large page is
 * assumed to be genuine content that merely mentions one of the trigger words.
 *
 * @param body - Response body text
 * @returns True when a small body contains any SOFT_BLOCK_PATTERNS entry (case-insensitive)
 */
function isSoftBlocked(body) {
    if (body.length <= SOFT_BLOCK_MAX_SIZE) {
        const haystack = body.toLowerCase();
        for (const marker of SOFT_BLOCK_PATTERNS) {
            if (haystack.includes(marker)) {
                return true;
            }
        }
    }
    return false;
}
|
|
69
|
+
/**
 * Check whether an error is an abort or timeout error.
 *
 * The error is recognised by its `name` ("AbortError" or "TimeoutError")
 * rather than by `instanceof DOMException`. An `instanceof` check fails for a
 * DOMException created in another realm (for example a test environment that
 * swaps the global) and for abort errors that are plain Error objects named
 * "AbortError".
 *
 * @param error - Any thrown value
 * @returns True if the value is an object named "AbortError" or "TimeoutError"
 */
function isAbortError(error) {
    if (typeof error !== "object" || error === null) {
        return false;
    }
    return error.name === "AbortError" || error.name === "TimeoutError";
}
|
|
76
|
+
/**
 * Build the Wayback Machine lookup URL for a page.
 *
 * When the input parses as a URL, its fragment, username and password are
 * cleared first, since they are not part of the archived resource key. Input
 * that does not parse is appended to the Wayback prefix unchanged.
 *
 * @param url - Original page URL
 * @returns `https://web.archive.org/web/` followed by the (normalized) URL
 */
function archiveUrl(url) {
    let target = url;
    try {
        const parsed = new URL(url);
        parsed.hash = "";
        parsed.username = "";
        parsed.password = "";
        target = parsed.toString();
    }
    catch {
        // Unparseable input: keep the raw value as the lookup key.
        target = url;
    }
    return `https://web.archive.org/web/${target}`;
}
|
|
92
|
+
/**
 * Attempt to fetch a page from archive.org.
 *
 * Best-effort by design: any failure (network error, abort, timeout, non-OK
 * status, body read error) yields null so the caller can report its own error.
 *
 * @param url - Original page URL; converted to a Wayback Machine URL via archiveUrl()
 * @param headers - Request headers to send
 * @param signal - AbortSignal governing the request
 * @returns `{ content, url, fetchMethod: "archive.org" }` on success, otherwise null
 */
async function fetchFromArchive(url, headers, signal) {
    const aUrl = archiveUrl(url);
    try {
        const response = await fetch(aUrl, { headers, signal });
        if (!response.ok) {
            // Release the unread body so the underlying connection is not
            // held until garbage collection.
            await response.body?.cancel();
            return null;
        }
        const content = await response.text();
        return {
            content,
            // response.url reflects redirects to the concrete snapshot, when available.
            url: response.url || aUrl,
            fetchMethod: "archive.org",
        };
    }
    catch {
        // Archive fetch failed; a null result is the documented outcome.
        return null;
    }
}
|
|
115
|
+
/**
 * Fetch a page with browser-like headers and automatic archive.org fallback.
 *
 * Pipeline:
 * 1. Direct fetch with browser-like headers
 * 2. Block detection (hard HTTP status codes + soft body pattern matching)
 * 3. If blocked and archiveFallback enabled, try archive.org
 * 4. Non-blocking HTTP errors (404, 500) return "none" immediately
 * 5. Network errors (while connecting or while reading the body) trigger archive fallback
 * 6. Abort/timeout errors return "none" immediately, without fallback
 *
 * Failures are reported through the returned object (`fetchMethod: "none"` plus
 * an `error` message), including failures that occur while the response body
 * is being read.
 *
 * @param options - Fetch options
 * @param options.url - URL to fetch
 * @param options.signal - Optional caller AbortSignal, combined with the timeout
 * @param options.timeoutMs - Optional total timeout budget in milliseconds
 * @param options.userAgent - Optional User-Agent (defaults to DEFAULT_USER_AGENT)
 * @param options.archiveFallback - Whether to try archive.org when blocked (default true)
 * @returns Result with content, final URL, fetch method and, on failure, an error message
 */
export async function fetchPage(options) {
    const { url, signal: callerSignal, timeoutMs, userAgent = DEFAULT_USER_AGENT, archiveFallback = true, } = options;
    const signal = buildSignal(callerSignal, timeoutMs);
    const headers = buildHeaders(userAgent);

    // Uniform failure result: no content, the requested URL, and a reason.
    const failure = (error) => ({ content: "", url, fetchMethod: "none", error });

    // Try archive.org when enabled; report `error` if it is disabled or also fails.
    const fallbackOrFail = async (error) => {
        const archived = archiveFallback ? await fetchFromArchive(url, headers, signal) : null;
        return archived ?? failure(error);
    };

    // Translate an exception thrown by fetch() or by reading the body into a result.
    const handleThrown = (error) => {
        // Abort/timeout errors should not trigger archive fallback: the shared
        // signal has already fired, so the archive request could not succeed.
        if (isAbortError(error)) {
            return failure(error.name === "TimeoutError" ? "Request timed out" : "Request was aborted");
        }
        const message = error instanceof Error ? error.message : String(error);
        return fallbackOrFail(`Network error: ${message}`);
    };

    // Best-effort release of an unread body so its connection is not held until GC.
    const discardBody = async (res) => {
        try {
            await res.body?.cancel();
        }
        catch {
            // Cleanup only; the outcome of the request is decided elsewhere.
        }
    };

    // --- Direct fetch attempt ---
    let response;
    try {
        response = await fetch(url, { headers, signal });
    }
    catch (error) {
        return handleThrown(error);
    }

    // --- Hard block detection ---
    if (HARD_BLOCK_STATUSES.has(response.status)) {
        await discardBody(response);
        return fallbackOrFail(`HTTP ${response.status} (blocked)`);
    }

    // --- Non-blocking HTTP errors (404, 500, etc.) — return immediately ---
    if (!response.ok) {
        await discardBody(response);
        return failure(`HTTP ${response.status}`);
    }

    // --- Read body and check for soft blocks ---
    let body;
    try {
        body = await response.text();
    }
    catch (error) {
        // The timeout budget also covers the body read, and the connection can
        // drop mid-stream; treat both exactly like a failed direct fetch.
        return handleThrown(error);
    }
    if (isSoftBlocked(body)) {
        return fallbackOrFail("Soft block detected (captcha/bot detection)");
    }

    // --- Success ---
    return {
        content: body,
        url: response.url || url,
        fetchMethod: "direct",
    };
}
|
|
212
|
+
//# sourceMappingURL=fetch-page.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-page.js","sourceRoot":"","sources":["../../src/shared/fetch-page.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AA4BH,8CAA8C;AAC9C,MAAM,kBAAkB,GACtB,iHAAiH,CAAA;AAEnH,8FAA8F;AAC9F,MAAM,kBAAkB,GAAG,KAAK,CAAA;AAEhC,sFAAsF;AACtF,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAA;AAEzD,2HAA2H;AAC3H,MAAM,mBAAmB,GAAG,MAAM,CAAA;AAElC,6EAA6E;AAC7E,MAAM,mBAAmB,GAAG;IAC1B,SAAS;IACT,6BAA6B;IAC7B,eAAe;IACf,eAAe;IACf,iBAAiB;IACjB,kBAAkB;IAClB,YAAY;IACZ,iBAAiB;IACjB,eAAe;IACf,WAAW;IACX,UAAU;CACX,CAAA;AAED;;;;GAIG;AACH,SAAS,WAAW,CAAC,YAA0B,EAAE,SAAkB;IACjE,MAAM,OAAO,GAAG,SAAS,IAAI,kBAAkB,CAAA;IAC/C,MAAM,aAAa,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;IAElD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,OAAO,aAAa,CAAA;AACtB,CAAC;AAED,0CAA0C;AAC1C,SAAS,YAAY,CAAC,SAAiB;IACrC,OAAO;QACL,YAAY,EAAE,SAAS;QACvB,MAAM,EAAE,iEAAiE;QACzE,iBAAiB,EAAE,gBAAgB;QACnC,eAAe,EAAE,UAAU;KAC5B,CAAA;AACH,CAAC;AAED;;;;;GAKG;AACH,SAAS,aAAa,CAAC,IAAY;IACjC,IAAI,IAAI,CAAC,MAAM,GAAG,mBAAmB,EAAE,CAAC;QACtC,OAAO,KAAK,CAAA;IACd,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAChC,OAAO,mBAAmB,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAA;AACvE,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,KAAc;IAClC,OAAO,CACL,KAAK,YAAY,YAAY,IAAI,CAAC,KAAK,CAAC,IAAI,KAAK,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,CAAC,CAChG,CAAA;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;QAC/B,UAAU,CAAC,IAAI,GAAG,EAAE,CAAA;QACpB,UAAU,CAAC,QAAQ,GAAG,EAAE,CAAA;QACxB,UAAU,CAAC,QAAQ,GAAG,EAAE,CAAA;QACxB,OAAO,+BAA+B,UAAU,CAAC,QAAQ,EAAE,EAAE,CAAA;IAC/D,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,+BAA+B,GAAG,EAAE,CAAA;IAC7C,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,gBAAgB,CAC7B,GAAW,EACX,OAA+B,EAC/B,MAAmB;IAEnB,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAA;IAC5B,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAA;QACvD,IAAI,QAAQ,CAAC,E
AAE,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;YACrC,OAAO;gBACL,OAAO;gBACP,GAAG,EAAE,QAAQ,CAAC,GAAG,IAAI,IAAI;gBACzB,WAAW,EAAE,aAAa;aAC3B,CAAA;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,qDAAqD;IACvD,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,OAAyB;IACvD,MAAM,EACJ,GAAG,EACH,MAAM,EAAE,YAAY,EACpB,SAAS,EACT,SAAS,GAAG,kBAAkB,EAC9B,eAAe,GAAG,IAAI,GACvB,GAAG,OAAO,CAAA;IAEX,MAAM,MAAM,GAAG,WAAW,CAAC,YAAY,EAAE,SAAS,CAAC,CAAA;IACnD,MAAM,OAAO,GAAG,YAAY,CAAC,SAAS,CAAC,CAAA;IAEvC,+BAA+B;IAC/B,IAAI,QAAkB,CAAA;IACtB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAA;IAClD,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,2DAA2D;QAC3D,IAAI,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;YACxB,MAAM,MAAM,GACV,KAAK,YAAY,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc;gBAC5D,CAAC,CAAC,mBAAmB;gBACrB,CAAC,CAAC,qBAAqB,CAAA;YAC3B,OAAO;gBACL,OAAO,EAAE,EAAE;gBACX,GAAG;gBACH,WAAW,EAAE,MAAM;gBACnB,KAAK,EAAE,MAAM;aACd,CAAA;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,eAAe,EAAE,CAAC;YACpB,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,EAAE,MAAM,CAAC,CAAA;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO,aAAa,CAAA;YACtB,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QACtE,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,kBAAkB,OAAO,EAAE;SACnC,CAAA;IACH,CAAC;IAED,+BAA+B;IAC/B,IAAI,mBAAmB,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7C,IAAI,eAAe,EAAE,CAAC;YACpB,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,EAAE,MAAM,CAAC,CAAA;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO,aAAa,CAAA;YACtB,CAAC;QACH,CAAC;QAED,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,QAAQ,QAAQ,CAAC,MAAM,YAAY;SAC3C,CAAA;IACH,CAAC;IAED,yEAAyE;IACzE,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,QAAQ,QAAQ,CAAC,MAAM,EAAE;SACjC,CAAA;IACH,CAAC;IAED,8CAA8C;IAC9C,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;IAElC,IAAI,aAAa,CAAC,IAAI,C
AAC,EAAE,CAAC;QACxB,IAAI,eAAe,EAAE,CAAC;YACpB,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,EAAE,MAAM,CAAC,CAAA;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO,aAAa,CAAA;YACtB,CAAC;QACH,CAAC;QAED,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,6CAA6C;SACrD,CAAA;IACH,CAAC;IAED,kBAAkB;IAClB,OAAO;QACL,OAAO,EAAE,IAAI;QACb,GAAG,EAAE,QAAQ,CAAC,GAAG,IAAI,GAAG;QACxB,WAAW,EAAE,QAAQ;KACtB,CAAA;AACH,CAAC"}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML sanitization and text extraction utilities.
|
|
3
|
+
*
|
|
4
|
+
* Provides a complete pipeline for converting raw HTML to clean plain text:
|
|
5
|
+
* script/style tag removal (via state machines for robustness), HTML tag
|
|
6
|
+
* stripping, entity decoding, whitespace normalization, and optional
|
|
7
|
+
* code-fragment detection/removal.
|
|
8
|
+
*/
|
|
9
|
+
/**
|
|
10
|
+
* Decode HTML entities in a string using the `he` library.
|
|
11
|
+
*
|
|
12
|
+
* Handles all HTML entities including named (&amp;), decimal (&#38;),
|
|
13
|
+
* and hexadecimal (&#x26;) numeric entities.
|
|
14
|
+
*
|
|
15
|
+
* @param text - Text containing HTML entities
|
|
16
|
+
* @returns Decoded text
|
|
17
|
+
*/
|
|
18
|
+
export declare function decodeHtmlEntities(text: string): string;
|
|
19
|
+
/**
|
|
20
|
+
* Remove script tags and their content from HTML.
|
|
21
|
+
*
|
|
22
|
+
* Uses a state-machine approach that handles edge cases like malformed
|
|
23
|
+
* tags better than a single regex would.
|
|
24
|
+
*
|
|
25
|
+
* @param html - HTML string
|
|
26
|
+
* @returns HTML with script tags and their content removed
|
|
27
|
+
*/
|
|
28
|
+
export declare function removeScriptTags(html: string): string;
|
|
29
|
+
/**
|
|
30
|
+
* Remove style tags and their content from HTML.
|
|
31
|
+
*
|
|
32
|
+
* Uses the same state-machine approach as removeScriptTags.
|
|
33
|
+
*
|
|
34
|
+
* @param html - HTML string
|
|
35
|
+
* @returns HTML with style tags and their content removed
|
|
36
|
+
*/
|
|
37
|
+
export declare function removeStyleTags(html: string): string;
|
|
38
|
+
/**
|
|
39
|
+
* Strip all HTML tags from a string, replacing them with spaces.
|
|
40
|
+
*
|
|
41
|
+
* @param html - HTML string
|
|
42
|
+
* @returns Plain text with tags replaced by spaces
|
|
43
|
+
*/
|
|
44
|
+
export declare function stripHtmlTags(html: string): string;
|
|
45
|
+
/**
|
|
46
|
+
* Convert HTML to clean plain text.
|
|
47
|
+
*
|
|
48
|
+
* Applies the full sanitization pipeline:
|
|
49
|
+
* 1. Remove script tags and content (state machine)
|
|
50
|
+
* 2. Remove style tags and content (state machine)
|
|
51
|
+
* 3. Strip remaining HTML tags
|
|
52
|
+
* 4. Decode HTML entities
|
|
53
|
+
* 5. Normalize whitespace
|
|
54
|
+
*
|
|
55
|
+
* @param html - HTML string to clean
|
|
56
|
+
* @returns Clean plain text
|
|
57
|
+
*/
|
|
58
|
+
export declare function htmlToText(html: string): string;
|
|
59
|
+
/**
|
|
60
|
+
* Decode HTML entities and normalize whitespace without removing tags.
|
|
61
|
+
*
|
|
62
|
+
* Useful when you want to preserve HTML structure but decode entities.
|
|
63
|
+
*
|
|
64
|
+
* @param html - HTML string
|
|
65
|
+
* @returns HTML with decoded entities and normalized whitespace
|
|
66
|
+
*/
|
|
67
|
+
export declare function cleanHtmlEntities(html: string): string;
|
|
68
|
+
/**
|
|
69
|
+
* Detect if text looks like programming code using heuristics.
|
|
70
|
+
*
|
|
71
|
+
* Designed to catch JavaScript/TypeScript code fragments that might
|
|
72
|
+
* appear in scraped web pages from client-side rendered sites.
|
|
73
|
+
*
|
|
74
|
+
* @param text - Text to analyze
|
|
75
|
+
* @returns True if text appears to be programming code
|
|
76
|
+
*/
|
|
77
|
+
export declare function looksLikeCode(text: string): boolean;
|
|
78
|
+
/**
|
|
79
|
+
* Strip code segments from text, keeping natural language content.
|
|
80
|
+
*
|
|
81
|
+
* If the entire text looks like code, returns an empty string.
|
|
82
|
+
* Otherwise splits by sentence/code boundaries and filters out
|
|
83
|
+
* segments that match code heuristics.
|
|
84
|
+
*
|
|
85
|
+
* @param text - Text that may contain code segments
|
|
86
|
+
* @returns Text with code segments removed
|
|
87
|
+
*/
|
|
88
|
+
export declare function stripCodeFromText(text: string): string;
|
|
89
|
+
/**
|
|
90
|
+
* Convert HTML to clean plain text, also stripping code-like content.
|
|
91
|
+
*
|
|
92
|
+
* Combines the full HTML cleaning pipeline with code detection for
|
|
93
|
+
* maximum safety when processing scraped web pages.
|
|
94
|
+
*
|
|
95
|
+
* @param html - HTML string to clean
|
|
96
|
+
* @returns Clean plain text with code segments removed
|
|
97
|
+
*/
|
|
98
|
+
export declare function htmlToTextClean(html: string): string;
|
|
99
|
+
//# sourceMappingURL=html-utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-utils.d.ts","sourceRoot":"","sources":["../../src/shared/html-utils.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH;;;;;;;;GAQG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAED;;;;;;;;GAQG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAuCrD;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiCpD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAElD;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiB/C;AAED;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAItD;AAsCD;;;;;;;;GAQG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAcnD;AAED;;;;;;;;;GASG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAoBtD;AAED;;;;;;;;GAQG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAGpD"}
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML sanitization and text extraction utilities.
|
|
3
|
+
*
|
|
4
|
+
* Provides a complete pipeline for converting raw HTML to clean plain text:
|
|
5
|
+
* script/style tag removal (via state machines for robustness), HTML tag
|
|
6
|
+
* stripping, entity decoding, whitespace normalization, and optional
|
|
7
|
+
* code-fragment detection/removal.
|
|
8
|
+
*/
|
|
9
|
+
import he from "he";
|
|
10
|
+
/**
 * Translate HTML character references into the characters they stand for.
 *
 * Delegates to `he.decode`, covering named (`&amp;`), decimal (`&#38;`)
 * and hexadecimal (`&#x26;`) references.
 *
 * @param text - Text containing HTML entities
 * @returns The text with entities decoded
 */
export function decodeHtmlEntities(text) {
    const decoded = he.decode(text);
    return decoded;
}
|
|
22
|
+
/**
 * Remove script tags and their content from HTML.
 *
 * Scans for `<script` ... `>` ... `</script` ... `>` case-insensitively and
 * copies everything outside those spans. If an opening tag is malformed (no
 * `>`), or has no closing `</script ...>`, the rest of the document from that
 * opening tag onwards is dropped.
 *
 * @param html - HTML string
 * @returns HTML with script tags and their content removed
 */
export function removeScriptTags(html) {
    // Lowercase ASCII letters only. String.prototype.toLowerCase() can change the
    // string's length (e.g. "İ" becomes two code units), which would shift every
    // index found in the lowered copy relative to `html` and corrupt the slices.
    const lowerHtml = html.replace(/[A-Z]+/g, (run) => run.toLowerCase());
    let result = "";
    let i = 0;
    while (i < html.length) {
        const scriptStart = lowerHtml.indexOf("<script", i);
        if (scriptStart === -1) {
            // No further script tags: keep the remainder verbatim.
            result += html.slice(i);
            break;
        }
        // Add content before script tag
        result += html.slice(i, scriptStart);
        // Find the end of the opening script tag
        const tagEnd = html.indexOf(">", scriptStart);
        if (tagEnd === -1) {
            // Malformed - no closing bracket, skip rest
            break;
        }
        // Find closing </script> tag
        const scriptEnd = lowerHtml.indexOf("</script", tagEnd);
        if (scriptEnd === -1) {
            // No closing tag, skip rest of document
            break;
        }
        // Find end of closing tag
        const closeEnd = html.indexOf(">", scriptEnd);
        if (closeEnd === -1) {
            break;
        }
        // Resume scanning right after the closing tag.
        i = closeEnd + 1;
    }
    return result;
}
|
|
64
|
+
/**
 * Remove style tags and their content from HTML.
 *
 * Scans for `<style` ... `>` ... `</style` ... `>` case-insensitively and
 * copies everything outside those spans. If an opening tag is malformed (no
 * `>`), or has no closing `</style ...>`, the rest of the document from that
 * opening tag onwards is dropped.
 *
 * @param html - HTML string
 * @returns HTML with style tags and their content removed
 */
export function removeStyleTags(html) {
    // Lowercase ASCII letters only. String.prototype.toLowerCase() can change the
    // string's length (e.g. "İ" becomes two code units), which would shift every
    // index found in the lowered copy relative to `html` and corrupt the slices.
    const lowerHtml = html.replace(/[A-Z]+/g, (run) => run.toLowerCase());
    let result = "";
    let i = 0;
    while (i < html.length) {
        const styleStart = lowerHtml.indexOf("<style", i);
        if (styleStart === -1) {
            // No further style tags: keep the remainder verbatim.
            result += html.slice(i);
            break;
        }
        // Keep the content that precedes the style tag.
        result += html.slice(i, styleStart);
        const tagEnd = html.indexOf(">", styleStart);
        if (tagEnd === -1) {
            // Malformed opening tag: drop the rest.
            break;
        }
        const styleEnd = lowerHtml.indexOf("</style", tagEnd);
        if (styleEnd === -1) {
            // No closing tag: drop the rest.
            break;
        }
        const closeEnd = html.indexOf(">", styleEnd);
        if (closeEnd === -1) {
            break;
        }
        // Resume scanning right after the closing tag.
        i = closeEnd + 1;
    }
    return result;
}
|
|
99
|
+
/**
 * Replace every HTML tag in a string with a single space.
 *
 * A "tag" here is any `<`, followed by one or more characters that are not
 * `>`, followed by `>`. Text outside such spans is left untouched, and an
 * empty `<>` pair is not considered a tag.
 *
 * @param html - HTML string
 * @returns The input with each tag replaced by one space character
 */
export function stripHtmlTags(html) {
    const tagPattern = /<[^>]+>/g;
    const spaceForTag = () => " ";
    return html.replace(tagPattern, spaceForTag);
}
|
|
108
|
+
/**
 * Convert HTML to clean plain text.
 *
 * Runs the input through the sanitization steps in this order:
 * 1. removeScriptTags   - drop script elements and their content
 * 2. removeStyleTags    - drop style elements and their content
 * 3. stripHtmlTags      - replace every remaining tag with a space
 * 4. decodeHtmlEntities - decode HTML entities
 * 5. collapse each run of whitespace to one space and trim both ends
 *
 * Script and style elements are removed before the generic tag stripping so
 * that their content does not survive as text.
 *
 * @param html - HTML string to clean
 * @returns Clean plain text
 */
export function htmlToText(html) {
    const steps = [removeScriptTags, removeStyleTags, stripHtmlTags, decodeHtmlEntities];
    const decoded = steps.reduce((current, step) => step(current), html);
    // Normalize whitespace last, once all markup and entities are gone.
    return decoded.replace(/\s+/g, " ").trim();
}
|
|
134
|
+
/**
 * Decode HTML entities and normalize whitespace, leaving tags in place.
 *
 * The input is passed through decodeHtmlEntities, then every run of
 * whitespace is collapsed to a single space and the ends are trimmed.
 * No tags are removed.
 *
 * @param html - HTML string
 * @returns HTML with decoded entities and normalized whitespace
 */
export function cleanHtmlEntities(html) {
    return decodeHtmlEntities(html).replace(/\s+/g, " ").trim();
}
|
|
147
|
+
// Heuristic signatures of JavaScript/TypeScript source text.
// Kept at module level so the regular expressions are created only once.
const CODE_PATTERNS = [
    // Declarations, control flow and keywords
    /\bfunction\s*\(/,
    /\b(?:const|let|var)\s+\w+\s*=/,
    /\bif\s*\([^)]+\)\s*\{/,
    /\bdocument\.\w+/,
    /=>\s*[{(]/,
    /\bthis\.\w+\s*[=;]/,
    /\breturn\s+(?:this|null|true|false|undefined)\b/,
    /\bclass\s+\w+\s*\{/,
    /\b(?:async|await)\s+\w+/,
    /\b(?:try|catch|throw)\s*[{(]/,
    // Well-known globals
    /\bwindow\.\w+/,
    /\bconsole\.\w+/,
    // Array method calls, operators and indexing
    /\.(?:push|pop|shift|unshift|slice|splice|map|filter|reduce)\s*\(/,
    /\b(?:new|delete|typeof|instanceof)\s+\w+/,
    /\[\s*\d+\s*\]/,
    /===|!==|&&|\|\|/,
    // Loops, switch and block openers
    /\bfor\s*\([^)]+\)/,
    /\bwhile\s*\([^)]+\)/,
    /\bswitch\s*\([^)]+\)/,
    /\)\s*\{|\{\s*$/,
    // DOM property assignments and DOM API calls
    /\.innerHTML\s*=/,
    /\.innerText\s*=/,
    /\.textContent\s*=/,
    /\.value\s*=/,
    /\.style\.\w+\s*=/,
    /\.getElementById\s*\(/,
    /\.querySelector\s*\(/,
    /\.addEventListener\s*\(/,
];
/** Number of distinct patterns that must match before text counts as code. */
const CODE_PATTERN_THRESHOLD = 2;
/**
 * Heuristically decide whether a piece of text is programming code.
 *
 * Intended for JavaScript/TypeScript fragments that end up in text scraped
 * from client-side rendered pages. The text is tested against each entry of
 * CODE_PATTERNS in order; as soon as CODE_PATTERN_THRESHOLD different
 * patterns have matched, the text is classified as code. Each pattern counts
 * at most once, no matter how often it occurs in the text.
 *
 * Empty input and input shorter than 20 characters is never classified as
 * code.
 *
 * @param text - Text to analyze
 * @returns True if text appears to be programming code
 */
export function looksLikeCode(text) {
    const tooShortToJudge = !text || text.length < 20;
    if (tooShortToJudge) {
        return false;
    }
    let stillNeeded = CODE_PATTERN_THRESHOLD;
    // `some` stops at the first callback that returns true, i.e. as soon as
    // enough patterns have matched.
    return CODE_PATTERNS.some((pattern) => {
        if (!pattern.test(text)) {
            return false;
        }
        stillNeeded -= 1;
        return stillNeeded <= 0;
    });
}
|
|
204
|
+
/**
 * Remove code-like segments from text and keep the natural-language rest.
 *
 * Behavior, in order:
 * - Empty or missing input yields "".
 * - If looksLikeCode() classifies the text as a whole as code, "" is
 *   returned and nothing is salvaged.
 * - Otherwise the text is split at whitespace that follows a sentence
 *   terminator (. ! ?) or a code delimiter (; { }). Each segment is trimmed;
 *   segments shorter than 15 characters and segments that looksLikeCode()
 *   flags are discarded. The surviving segments are joined with single
 *   spaces.
 *
 * @param text - Text that may contain code segments
 * @returns Text with code segments removed
 */
export function stripCodeFromText(text) {
    // Nothing to do for empty input; all-code input is dropped wholesale.
    if (!text || looksLikeCode(text)) {
        return "";
    }
    const kept = [];
    for (const piece of text.split(/(?<=[.!?])\s+|(?<=[;{}])\s+/)) {
        const candidate = piece.trim();
        if (candidate.length >= 15 && !looksLikeCode(candidate)) {
            kept.push(candidate);
        }
    }
    return kept.join(" ").trim();
}
|
|
233
|
+
/**
 * Convert HTML to plain text and strip code-like content from the result.
 *
 * Equivalent to running htmlToText() on the input and passing its output
 * to stripCodeFromText().
 *
 * @param html - HTML string to clean
 * @returns Clean plain text with code segments removed
 */
export function htmlToTextClean(html) {
    return stripCodeFromText(htmlToText(html));
}
|
|
246
|
+
//# sourceMappingURL=html-utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-utils.js","sourceRoot":"","sources":["../../src/shared/html-utils.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,MAAM,IAAI,CAAA;AAEnB;;;;;;;;GAQG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;AACxB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,IAAI,MAAM,GAAG,EAAE,CAAA;IACf,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAEpC,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,CAAA;QACnD,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;YACvB,MAAK;QACP,CAAC;QAED,gCAAgC;QAChC,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAA;QAEpC,yCAAyC;QACzC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,WAAW,CAAC,CAAA;QAC7C,IAAI,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;YAClB,4CAA4C;YAC5C,MAAK;QACP,CAAC;QAED,6BAA6B;QAC7B,MAAM,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC,CAAA;QACvD,IAAI,SAAS,KAAK,CAAC,CAAC,EAAE,CAAC;YACrB,wCAAwC;YACxC,MAAK;QACP,CAAC;QAED,0BAA0B;QAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,SAAS,CAAC,CAAA;QAC7C,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,MAAK;QACP,CAAC;QAED,CAAC,GAAG,QAAQ,GAAG,CAAC,CAAA;IAClB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,IAAI,MAAM,GAAG,EAAE,CAAA;IACf,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAEpC,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,UAAU,GAAG,SAAS,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAA;QACjD,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;YACtB,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;YACvB,MAAK;QACP,CAAC;QAED,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;QAEnC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,UAAU,CAAC,CAAA;QAC5C,IAAI,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;YAClB,MAAK;QACP,CAAC;QAED,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;QACrD,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,MAAK;QACP,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC
,CAAA;QAC5C,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,MAAK;QACP,CAAC;QAED,CAAC,GAAG,QAAQ,GAAG,CAAC,CAAA;IAClB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;AACtC,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,IAAI,GAAG,IAAI,CAAA;IAEf,iEAAiE;IACjE,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;IAC7B,IAAI,GAAG,eAAe,CAAC,IAAI,CAAC,CAAA;IAE5B,6BAA6B;IAC7B,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,CAAA;IAE1B,uBAAuB;IACvB,IAAI,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;IAE/B,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IAEvC,OAAO,IAAI,CAAA;AACb,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,IAAI,IAAI,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;IACnC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACvC,OAAO,IAAI,CAAA;AACb,CAAC;AAED,8DAA8D;AAC9D,+DAA+D;AAC/D,MAAM,aAAa,GAAG;IACpB,iBAAiB;IACjB,+BAA+B;IAC/B,uBAAuB;IACvB,iBAAiB;IACjB,WAAW;IACX,oBAAoB;IACpB,iDAAiD;IACjD,oBAAoB;IACpB,yBAAyB;IACzB,8BAA8B;IAC9B,eAAe;IACf,gBAAgB;IAChB,kEAAkE;IAClE,0CAA0C;IAC1C,eAAe;IACf,iBAAiB;IACjB,mBAAmB;IACnB,qBAAqB;IACrB,sBAAsB;IACtB,gBAAgB;IAChB,iBAAiB;IACjB,iBAAiB;IACjB,mBAAmB;IACnB,aAAa;IACb,kBAAkB;IAClB,uBAAuB;IACvB,sBAAsB;IACtB,yBAAyB;CAC1B,CAAA;AAED,2EAA2E;AAC3E,MAAM,sBAAsB,GAAG,CAAC,CAAA;AAEhC;;;;;;;;GAQG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;QAAE,OAAO,KAAK,CAAA;IAE3C,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,KAAK,MAAM,OAAO,IAAI,aAAa,EAAE,CAAC;QACpC,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,UAAU,EAAE,CAAA;YACZ,IAAI,UAAU,IAAI,sBAAsB,EAAE,CAAC;gBACzC,OAAO,IAAI,CAAA;YACb,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAA;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAA;IAEpB,kDAAkD;IAClD,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,6DAA6D;IAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAA;IAE1D,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,QAAQ;SACtB,GAAG,CA
AC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;SAChC,MAAM,CAAC,CAAC,OAAO,EAAE,EAAE;QAClB,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;YAAE,OAAO,KAAK,CAAA;QACrC,OAAO,CAAC,aAAa,CAAC,OAAO,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEJ,OAAO,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;AAClC,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;IAC7B,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAA;AAChC,CAAC"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Article extraction using Mozilla Readability.
|
|
3
|
+
*
|
|
4
|
+
* Uses the same algorithm behind Firefox Reader View to extract article
|
|
5
|
+
* content from raw HTML, stripping navigation, ads, sidebars, and other
|
|
6
|
+
* non-content elements. Far more reliable than regex-based extraction.
|
|
7
|
+
*
|
|
8
|
+
* Dependencies: @mozilla/readability, jsdom
|
|
9
|
+
*/
|
|
10
|
+
/** Result of extracting article content from HTML. */
export interface ArticleExtractionResult {
    /** Plain-text content of the extracted article. */
    text: string;
    /** Article title, or null when none was extracted. */
    title: string | null;
    /** Article byline (author), or null when none was extracted. */
    author: string | null;
    /** Article excerpt, or null when none was extracted. */
    excerpt: string | null;
    /** Name of the publishing site, or null when none was extracted. */
    siteName: string | null;
}
|
|
18
|
+
/**
 * Extract article content from raw HTML using Mozilla Readability.
 *
 * Parses the HTML into a DOM, runs Mozilla's Readability algorithm to
 * identify the main article body, and returns the plain text content
 * along with metadata (title, author, excerpt, site name).
 *
 * Returns null if Readability cannot identify article content or if
 * the extracted text is shorter than 100 characters. Metadata fields
 * that Readability leaves empty are reported as null.
 *
 * @param html - Raw HTML string to extract from
 * @param url - Optional URL for resolving relative links in the HTML;
 *   an empty string is treated the same as omitting it
 * @returns Extracted article content and metadata, or null
 */
export declare function extractArticleContent(html: string, url?: string): ArticleExtractionResult | null;
|
|
33
|
+
//# sourceMappingURL=readability-extract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability-extract.d.ts","sourceRoot":"","sources":["../../src/shared/readability-extract.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAKH,sDAAsD;AACtD,MAAM,WAAW,uBAAuB;IACtC,IAAI,EAAE,MAAM,CAAA;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAA;IACpB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAA;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAA;IACtB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAA;CACxB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,uBAAuB,GAAG,IAAI,CAqBhG"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Article extraction using Mozilla Readability.
|
|
3
|
+
*
|
|
4
|
+
* Uses the same algorithm behind Firefox Reader View to extract article
|
|
5
|
+
* content from raw HTML, stripping navigation, ads, sidebars, and other
|
|
6
|
+
* non-content elements. Far more reliable than regex-based extraction.
|
|
7
|
+
*
|
|
8
|
+
* Dependencies: @mozilla/readability, jsdom
|
|
9
|
+
*/
|
|
10
|
+
import { Readability } from "@mozilla/readability";
|
|
11
|
+
import { JSDOM, VirtualConsole } from "jsdom";
|
|
12
|
+
/**
 * Extract article content from raw HTML using Mozilla Readability.
 *
 * Parses the HTML into a DOM, runs Mozilla's Readability algorithm to
 * identify the main article body, and returns the plain text content
 * along with metadata (title, author, excerpt, site name).
 *
 * Returns null if Readability cannot identify article content or if
 * the extracted text is shorter than 100 characters. Metadata fields
 * that Readability leaves empty are reported as null.
 *
 * The DOM created for the extraction is closed before returning, whether
 * extraction succeeds, yields null, or throws.
 *
 * @param html - Raw HTML string to extract from
 * @param url - Optional URL for resolving relative links in the HTML;
 *   an empty string is treated the same as omitting it
 * @returns Extracted article content and metadata, or null
 */
export function extractArticleContent(html, url) {
    // Suppress all JSDOM console output (CSS parsing warnings, resource loading errors, etc.)
    // These are expected and harmless when parsing arbitrary web pages for article extraction.
    // No event listeners are attached, so everything sent to this console is dropped.
    const virtualConsole = new VirtualConsole();
    const dom = new JSDOM(html, { url: url || undefined, virtualConsole });
    try {
        const article = new Readability(dom.window.document).parse();
        if (!article || !article.textContent || article.textContent.length < 100) {
            return null;
        }
        // Only plain strings are copied out, so the result stays valid after
        // the window is closed below.
        return {
            text: article.textContent,
            title: article.title || null,
            author: article.byline || null,
            excerpt: article.excerpt || null,
            siteName: article.siteName || null,
        };
    }
    finally {
        // Explicitly shut the jsdom window down so it does not linger when
        // this function is called for many pages in one process.
        dom.window.close();
    }
}
|
|
45
|
+
//# sourceMappingURL=readability-extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability-extract.js","sourceRoot":"","sources":["../../src/shared/readability-extract.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,OAAO,EAAE,KAAK,EAAE,cAAc,EAAE,MAAM,OAAO,CAAA;AAW7C;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,qBAAqB,CAAC,IAAY,EAAE,GAAY;IAC9D,0FAA0F;IAC1F,2FAA2F;IAC3F,MAAM,cAAc,GAAG,IAAI,cAAc,EAAE,CAAA;IAC3C,mEAAmE;IAEnE,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,GAAG,IAAI,SAAS,EAAE,cAAc,EAAE,CAAC,CAAA;IACtE,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAA;IACnD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAA;IAE9B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QACzE,OAAO,IAAI,CAAA;IACb,CAAC;IAED,OAAO;QACL,IAAI,EAAE,OAAO,CAAC,WAAW;QACzB,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,IAAI;QAC5B,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI;QAC9B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,IAAI;QAChC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,IAAI;KACnC,CAAA;AACH,CAAC"}
|