@debriefer/sources 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. package/README.md +59 -0
  2. package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
  3. package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
  4. package/dist/__tests__/archives/chronicling-america.test.js +151 -0
  5. package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
  6. package/dist/__tests__/archives/europeana.test.d.ts +8 -0
  7. package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
  8. package/dist/__tests__/archives/europeana.test.js +200 -0
  9. package/dist/__tests__/archives/europeana.test.js.map +1 -0
  10. package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
  11. package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
  12. package/dist/__tests__/archives/internet-archive.test.js +189 -0
  13. package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
  14. package/dist/__tests__/archives/trove.test.d.ts +8 -0
  15. package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
  16. package/dist/__tests__/archives/trove.test.js +202 -0
  17. package/dist/__tests__/archives/trove.test.js.map +1 -0
  18. package/dist/__tests__/books/google-books.test.d.ts +8 -0
  19. package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
  20. package/dist/__tests__/books/google-books.test.js +221 -0
  21. package/dist/__tests__/books/google-books.test.js.map +1 -0
  22. package/dist/__tests__/books/open-library.test.d.ts +8 -0
  23. package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
  24. package/dist/__tests__/books/open-library.test.js +159 -0
  25. package/dist/__tests__/books/open-library.test.js.map +1 -0
  26. package/dist/__tests__/news/guardian.test.d.ts +9 -0
  27. package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
  28. package/dist/__tests__/news/guardian.test.js +224 -0
  29. package/dist/__tests__/news/guardian.test.js.map +1 -0
  30. package/dist/__tests__/news/nytimes.test.d.ts +9 -0
  31. package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
  32. package/dist/__tests__/news/nytimes.test.js +271 -0
  33. package/dist/__tests__/news/nytimes.test.js.map +1 -0
  34. package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
  35. package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
  36. package/dist/__tests__/news/site-search-source.test.js +342 -0
  37. package/dist/__tests__/news/site-search-source.test.js.map +1 -0
  38. package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
  39. package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
  40. package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
  41. package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
  42. package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
  43. package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
  44. package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
  45. package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
  46. package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
  47. package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
  48. package/dist/__tests__/shared/fetch-page.test.js +281 -0
  49. package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
  50. package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
  51. package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
  52. package/dist/__tests__/shared/html-utils.test.js +169 -0
  53. package/dist/__tests__/shared/html-utils.test.js.map +1 -0
  54. package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
  55. package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
  56. package/dist/__tests__/shared/readability-extract.test.js +107 -0
  57. package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
  58. package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
  59. package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
  60. package/dist/__tests__/shared/sanitize-text.test.js +77 -0
  61. package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
  62. package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
  63. package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
  64. package/dist/__tests__/shared/search-utils.test.js +26 -0
  65. package/dist/__tests__/shared/search-utils.test.js.map +1 -0
  66. package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
  67. package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
  68. package/dist/__tests__/structured/wikidata.test.js +509 -0
  69. package/dist/__tests__/structured/wikidata.test.js.map +1 -0
  70. package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
  71. package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
  72. package/dist/__tests__/structured/wikipedia.test.js +643 -0
  73. package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
  74. package/dist/__tests__/web-search/base.test.d.ts +9 -0
  75. package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
  76. package/dist/__tests__/web-search/base.test.js +622 -0
  77. package/dist/__tests__/web-search/base.test.js.map +1 -0
  78. package/dist/__tests__/web-search/bing.test.d.ts +10 -0
  79. package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
  80. package/dist/__tests__/web-search/bing.test.js +277 -0
  81. package/dist/__tests__/web-search/bing.test.js.map +1 -0
  82. package/dist/__tests__/web-search/brave.test.d.ts +10 -0
  83. package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
  84. package/dist/__tests__/web-search/brave.test.js +264 -0
  85. package/dist/__tests__/web-search/brave.test.js.map +1 -0
  86. package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
  87. package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
  88. package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
  89. package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
  90. package/dist/__tests__/web-search/google.test.d.ts +9 -0
  91. package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
  92. package/dist/__tests__/web-search/google.test.js +189 -0
  93. package/dist/__tests__/web-search/google.test.js.map +1 -0
  94. package/dist/archives/chronicling-america.d.ts +33 -0
  95. package/dist/archives/chronicling-america.d.ts.map +1 -0
  96. package/dist/archives/chronicling-america.js +85 -0
  97. package/dist/archives/chronicling-america.js.map +1 -0
  98. package/dist/archives/europeana.d.ts +37 -0
  99. package/dist/archives/europeana.d.ts.map +1 -0
  100. package/dist/archives/europeana.js +92 -0
  101. package/dist/archives/europeana.js.map +1 -0
  102. package/dist/archives/internet-archive.d.ts +32 -0
  103. package/dist/archives/internet-archive.d.ts.map +1 -0
  104. package/dist/archives/internet-archive.js +90 -0
  105. package/dist/archives/internet-archive.js.map +1 -0
  106. package/dist/archives/trove.d.ts +37 -0
  107. package/dist/archives/trove.d.ts.map +1 -0
  108. package/dist/archives/trove.js +97 -0
  109. package/dist/archives/trove.js.map +1 -0
  110. package/dist/books/google-books.d.ts +48 -0
  111. package/dist/books/google-books.d.ts.map +1 -0
  112. package/dist/books/google-books.js +111 -0
  113. package/dist/books/google-books.js.map +1 -0
  114. package/dist/books/open-library.d.ts +44 -0
  115. package/dist/books/open-library.d.ts.map +1 -0
  116. package/dist/books/open-library.js +103 -0
  117. package/dist/books/open-library.js.map +1 -0
  118. package/dist/index.d.ts +45 -0
  119. package/dist/index.d.ts.map +1 -0
  120. package/dist/index.js +35 -0
  121. package/dist/index.js.map +1 -0
  122. package/dist/news/guardian.d.ts +51 -0
  123. package/dist/news/guardian.d.ts.map +1 -0
  124. package/dist/news/guardian.js +131 -0
  125. package/dist/news/guardian.js.map +1 -0
  126. package/dist/news/nytimes.d.ts +27 -0
  127. package/dist/news/nytimes.d.ts.map +1 -0
  128. package/dist/news/nytimes.js +104 -0
  129. package/dist/news/nytimes.js.map +1 -0
  130. package/dist/news/site-search-source.d.ts +89 -0
  131. package/dist/news/site-search-source.d.ts.map +1 -0
  132. package/dist/news/site-search-source.js +182 -0
  133. package/dist/news/site-search-source.js.map +1 -0
  134. package/dist/news/sources.d.ts +52 -0
  135. package/dist/news/sources.d.ts.map +1 -0
  136. package/dist/news/sources.js +276 -0
  137. package/dist/news/sources.js.map +1 -0
  138. package/dist/obituary/find-a-grave.d.ts +43 -0
  139. package/dist/obituary/find-a-grave.d.ts.map +1 -0
  140. package/dist/obituary/find-a-grave.js +173 -0
  141. package/dist/obituary/find-a-grave.js.map +1 -0
  142. package/dist/shared/duckduckgo-search.d.ts +86 -0
  143. package/dist/shared/duckduckgo-search.d.ts.map +1 -0
  144. package/dist/shared/duckduckgo-search.js +218 -0
  145. package/dist/shared/duckduckgo-search.js.map +1 -0
  146. package/dist/shared/fetch-page.d.ts +50 -0
  147. package/dist/shared/fetch-page.d.ts.map +1 -0
  148. package/dist/shared/fetch-page.js +212 -0
  149. package/dist/shared/fetch-page.js.map +1 -0
  150. package/dist/shared/html-utils.d.ts +99 -0
  151. package/dist/shared/html-utils.d.ts.map +1 -0
  152. package/dist/shared/html-utils.js +246 -0
  153. package/dist/shared/html-utils.js.map +1 -0
  154. package/dist/shared/readability-extract.d.ts +33 -0
  155. package/dist/shared/readability-extract.d.ts.map +1 -0
  156. package/dist/shared/readability-extract.js +45 -0
  157. package/dist/shared/readability-extract.js.map +1 -0
  158. package/dist/shared/sanitize-text.d.ts +24 -0
  159. package/dist/shared/sanitize-text.d.ts.map +1 -0
  160. package/dist/shared/sanitize-text.js +49 -0
  161. package/dist/shared/sanitize-text.js.map +1 -0
  162. package/dist/shared/search-utils.d.ts +18 -0
  163. package/dist/shared/search-utils.d.ts.map +1 -0
  164. package/dist/shared/search-utils.js +20 -0
  165. package/dist/shared/search-utils.js.map +1 -0
  166. package/dist/structured/wikidata.d.ts +128 -0
  167. package/dist/structured/wikidata.d.ts.map +1 -0
  168. package/dist/structured/wikidata.js +361 -0
  169. package/dist/structured/wikidata.js.map +1 -0
  170. package/dist/structured/wikipedia.d.ts +184 -0
  171. package/dist/structured/wikipedia.d.ts.map +1 -0
  172. package/dist/structured/wikipedia.js +275 -0
  173. package/dist/structured/wikipedia.js.map +1 -0
  174. package/dist/web-search/base.d.ts +128 -0
  175. package/dist/web-search/base.d.ts.map +1 -0
  176. package/dist/web-search/base.js +251 -0
  177. package/dist/web-search/base.js.map +1 -0
  178. package/dist/web-search/bing.d.ts +21 -0
  179. package/dist/web-search/bing.d.ts.map +1 -0
  180. package/dist/web-search/bing.js +53 -0
  181. package/dist/web-search/bing.js.map +1 -0
  182. package/dist/web-search/brave.d.ts +21 -0
  183. package/dist/web-search/brave.d.ts.map +1 -0
  184. package/dist/web-search/brave.js +56 -0
  185. package/dist/web-search/brave.js.map +1 -0
  186. package/dist/web-search/duckduckgo.d.ts +15 -0
  187. package/dist/web-search/duckduckgo.d.ts.map +1 -0
  188. package/dist/web-search/duckduckgo.js +21 -0
  189. package/dist/web-search/duckduckgo.js.map +1 -0
  190. package/dist/web-search/google.d.ts +24 -0
  191. package/dist/web-search/google.d.ts.map +1 -0
  192. package/dist/web-search/google.js +48 -0
  193. package/dist/web-search/google.js.map +1 -0
  194. package/package.json +58 -0
@@ -0,0 +1,212 @@
1
+ /**
2
+ * Page fetching utility with browser-like headers and archive.org fallback.
3
+ *
4
+ * Provides a resilient page fetching pipeline:
5
+ * 1. Direct fetch with browser-like headers (Chrome UA, Accept text/html)
6
+ * 2. Block detection (hard HTTP blocks + soft body pattern matching)
7
+ * 3. Automatic archive.org fallback when blocked or on network error
8
+ * 4. Non-blocking HTTP errors (404, 500) return immediately without fallback
9
+ *
10
+ * Used by WebSearchBase when following links from search results.
11
+ */
12
+ /** Default User-Agent header value (desktop Chrome 120 on Windows 10), used when the caller does not pass `userAgent`. */
13
+ const DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
14
+ /** Default total timeout budget in milliseconds; one timeout signal is built per fetchPage call and shared by the direct and archive.org attempts. */
15
+ const DEFAULT_TIMEOUT_MS = 15000;
16
+ /** HTTP status codes treated as a hard block (401, 403, 429, 451); these trigger the archive.org fallback when it is enabled. */
17
+ const HARD_BLOCK_STATUSES = new Set([401, 403, 429, 451]);
18
+ /** Maximum body length (string length, i.e. UTF-16 code units) checked for soft blocks. Longer pages are assumed to be real content. */
19
+ const SOFT_BLOCK_MAX_SIZE = 50_000;
20
+ /** Lowercase substrings searched for in the lowercased response body; any single match marks the page as soft-blocked. */
21
+ const SOFT_BLOCK_PATTERNS = [
22
+ "captcha",
23
+ "please verify you are human",
24
+ "access denied",
25
+ "bot detection",
26
+ "unusual traffic",
27
+ "automated access",
28
+ "cloudflare",
29
+ "ddos protection",
30
+ "just a moment",
31
+ "recaptcha",
32
+ "hcaptcha",
33
+ ];
34
/**
 * Combine the optional caller-provided AbortSignal with a timeout signal.
 *
 * The timeout falls back to DEFAULT_TIMEOUT_MS when `timeoutMs` is null or
 * undefined. When a caller signal is supplied, both are merged with
 * `AbortSignal.any()` so that whichever fires first aborts the request.
 *
 * @param callerSignal - Optional AbortSignal supplied by the caller
 * @param timeoutMs - Optional timeout in milliseconds
 * @returns The signal to pass to `fetch`
 */
function buildSignal(callerSignal, timeoutMs) {
    const deadline = AbortSignal.timeout(timeoutMs ?? DEFAULT_TIMEOUT_MS);
    return callerSignal ? AbortSignal.any([callerSignal, deadline]) : deadline;
}
47
/**
 * Assemble the browser-like request headers sent with every fetch.
 *
 * @param userAgent - Value for the User-Agent header
 * @returns Header map with User-Agent, Accept, Accept-Language and Cache-Control
 */
function buildHeaders(userAgent) {
    const headers = {};
    headers["User-Agent"] = userAgent;
    headers.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    headers["Accept-Language"] = "en-US,en;q=0.9";
    headers["Cache-Control"] = "no-cache";
    return headers;
}
56
/**
 * Decide whether a response body looks like a soft block (captcha page,
 * bot-detection interstitial, and similar).
 *
 * Bodies longer than SOFT_BLOCK_MAX_SIZE are never flagged, which avoids
 * false positives on large legitimate pages that merely mention one of the
 * marker phrases.
 *
 * @param body - Response body text
 * @returns True when the lowercased body contains any SOFT_BLOCK_PATTERNS entry
 */
function isSoftBlocked(body) {
    if (body.length <= SOFT_BLOCK_MAX_SIZE) {
        const haystack = body.toLowerCase();
        for (const marker of SOFT_BLOCK_PATTERNS) {
            if (haystack.includes(marker)) {
                return true;
            }
        }
    }
    return false;
}
69
/**
 * Check whether an error is an abort or timeout error.
 *
 * Node.js `AbortSignal.timeout()` rejects with a DOMException named
 * "TimeoutError", and an aborted fetch rejects with one named "AbortError".
 * Detection is by `name` rather than `instanceof DOMException` so that abort
 * errors created by another realm or by a non-DOM implementation (for example
 * Node's internal AbortError class) are recognised as well.
 *
 * @param error - Any thrown value
 * @returns True when the value is an object whose name is "AbortError" or "TimeoutError"
 */
function isAbortError(error) {
    if (error === null || typeof error !== "object") {
        return false;
    }
    return error.name === "AbortError" || error.name === "TimeoutError";
}
76
/**
 * Build the archive.org Wayback Machine lookup URL for a page.
 *
 * The fragment and any embedded credentials are removed first because they
 * are not part of the archived resource key. Input that cannot be parsed as
 * a URL is appended to the Wayback prefix unchanged.
 *
 * @param url - URL of the page to look up
 * @returns The corresponding https://web.archive.org/web/... URL
 */
function archiveUrl(url) {
    let target = url;
    try {
        const parsed = new URL(url);
        parsed.hash = "";
        parsed.username = "";
        parsed.password = "";
        target = parsed.toString();
    }
    catch {
        // Unparseable input: fall back to the raw string as given.
    }
    return `https://web.archive.org/web/${target}`;
}
92
/**
 * Attempt to fetch a page from archive.org.
 *
 * This is a deliberate best-effort lookup: any failure (network error, abort,
 * timeout, non-2xx status) yields null rather than an exception, so callers
 * can fall back to their own error result.
 *
 * @param url - Original page URL (converted to a Wayback Machine URL)
 * @param headers - Request headers to send
 * @param signal - AbortSignal governing the request
 * @returns `{ content, url, fetchMethod: "archive.org" }` on success, otherwise null
 */
async function fetchFromArchive(url, headers, signal) {
    const aUrl = archiveUrl(url);
    try {
        const response = await fetch(aUrl, { headers, signal });
        if (response.ok) {
            const content = await response.text();
            return {
                content,
                // response.url reflects redirects to the concrete snapshot when available
                url: response.url || aUrl,
                fetchMethod: "archive.org",
            };
        }
        // No usable snapshot: release the unread body so the underlying
        // connection is not held open until garbage collection.
        await response.body?.cancel();
    }
    catch {
        // Archive fetch failed — fall through to return null
    }
    return null;
}
115
/**
 * Release an HTTP response body that will not be read.
 *
 * Best-effort cleanup only: the response is being discarded anyway, so a
 * failure to cancel must never mask the result being returned.
 */
async function discardBody(response) {
    try {
        await response.body?.cancel();
    }
    catch {
        // Intentionally ignored — see function comment.
    }
}
/**
 * Fetch a page with browser-like headers and automatic archive.org fallback.
 *
 * Pipeline:
 * 1. Direct fetch with browser-like headers
 * 2. Block detection (hard HTTP status codes + soft body pattern matching)
 * 3. If blocked and archiveFallback enabled, try archive.org
 * 4. Non-blocking HTTP errors (404, 500) return "none" immediately
 * 5. Network errors on the direct fetch, or while reading its body, trigger
 *    the archive fallback
 * 6. Abort/timeout never triggers the fallback
 *
 * The returned promise always resolves: every failure is reported through
 * the `error` field of a result whose `fetchMethod` is "none".
 *
 * @param options - Fetch options: `url`, optional `signal`, `timeoutMs`,
 *   `userAgent` (defaults to DEFAULT_USER_AGENT) and `archiveFallback`
 *   (defaults to true)
 * @returns Result with `content`, final `url`, `fetchMethod`
 *   ("direct" | "archive.org" | "none") and, on failure, `error`
 */
export async function fetchPage(options) {
    const { url, signal: callerSignal, timeoutMs, userAgent = DEFAULT_USER_AGENT, archiveFallback = true, } = options;
    const signal = buildSignal(callerSignal, timeoutMs);
    const headers = buildHeaders(userAgent);
    // Uniform failure result.
    const failure = (error) => ({
        content: "",
        url,
        fetchMethod: "none",
        error,
    });
    // Try archive.org (when enabled); otherwise report `error`.
    const archiveOrFail = async (error) => {
        if (archiveFallback) {
            const archiveResult = await fetchFromArchive(url, headers, signal);
            if (archiveResult) {
                return archiveResult;
            }
        }
        return failure(error);
    };
    // Shared handling for errors thrown by fetch() or by reading the body.
    const onTransportError = async (error) => {
        // Abort/timeout errors should not trigger archive fallback. Checking
        // signal.aborted also covers callers that abort with a custom reason,
        // which fetch rethrows as-is instead of as an AbortError.
        if (isAbortError(error) || signal.aborted) {
            const timedOut = error?.name === "TimeoutError";
            return failure(timedOut ? "Request timed out" : "Request was aborted");
        }
        // Network error — try archive fallback
        const message = error instanceof Error ? error.message : String(error);
        return archiveOrFail(`Network error: ${message}`);
    };
    // --- Direct fetch attempt ---
    let response;
    try {
        response = await fetch(url, { headers, signal });
    }
    catch (error) {
        return onTransportError(error);
    }
    // --- Hard block detection ---
    if (HARD_BLOCK_STATUSES.has(response.status)) {
        await discardBody(response);
        return archiveOrFail(`HTTP ${response.status} (blocked)`);
    }
    // --- Non-blocking HTTP errors (404, 500, etc.) — return immediately ---
    if (!response.ok) {
        await discardBody(response);
        return failure(`HTTP ${response.status}`);
    }
    // --- Read body and check for soft blocks ---
    // Reading can still fail (connection reset, timeout mid-download), so it
    // gets the same error handling as the initial request.
    let body;
    try {
        body = await response.text();
    }
    catch (error) {
        return onTransportError(error);
    }
    if (isSoftBlocked(body)) {
        return archiveOrFail("Soft block detected (captcha/bot detection)");
    }
    // --- Success ---
    return {
        content: body,
        url: response.url || url,
        fetchMethod: "direct",
    };
}
212
+ //# sourceMappingURL=fetch-page.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetch-page.js","sourceRoot":"","sources":["../../src/shared/fetch-page.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AA4BH,8CAA8C;AAC9C,MAAM,kBAAkB,GACtB,iHAAiH,CAAA;AAEnH,8FAA8F;AAC9F,MAAM,kBAAkB,GAAG,KAAK,CAAA;AAEhC,sFAAsF;AACtF,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAA;AAEzD,2HAA2H;AAC3H,MAAM,mBAAmB,GAAG,MAAM,CAAA;AAElC,6EAA6E;AAC7E,MAAM,mBAAmB,GAAG;IAC1B,SAAS;IACT,6BAA6B;IAC7B,eAAe;IACf,eAAe;IACf,iBAAiB;IACjB,kBAAkB;IAClB,YAAY;IACZ,iBAAiB;IACjB,eAAe;IACf,WAAW;IACX,UAAU;CACX,CAAA;AAED;;;;GAIG;AACH,SAAS,WAAW,CAAC,YAA0B,EAAE,SAAkB;IACjE,MAAM,OAAO,GAAG,SAAS,IAAI,kBAAkB,CAAA;IAC/C,MAAM,aAAa,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;IAElD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,OAAO,aAAa,CAAA;AACtB,CAAC;AAED,0CAA0C;AAC1C,SAAS,YAAY,CAAC,SAAiB;IACrC,OAAO;QACL,YAAY,EAAE,SAAS;QACvB,MAAM,EAAE,iEAAiE;QACzE,iBAAiB,EAAE,gBAAgB;QACnC,eAAe,EAAE,UAAU;KAC5B,CAAA;AACH,CAAC;AAED;;;;;GAKG;AACH,SAAS,aAAa,CAAC,IAAY;IACjC,IAAI,IAAI,CAAC,MAAM,GAAG,mBAAmB,EAAE,CAAC;QACtC,OAAO,KAAK,CAAA;IACd,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAChC,OAAO,mBAAmB,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAA;AACvE,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,KAAc;IAClC,OAAO,CACL,KAAK,YAAY,YAAY,IAAI,CAAC,KAAK,CAAC,IAAI,KAAK,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,CAAC,CAChG,CAAA;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;QAC/B,UAAU,CAAC,IAAI,GAAG,EAAE,CAAA;QACpB,UAAU,CAAC,QAAQ,GAAG,EAAE,CAAA;QACxB,UAAU,CAAC,QAAQ,GAAG,EAAE,CAAA;QACxB,OAAO,+BAA+B,UAAU,CAAC,QAAQ,EAAE,EAAE,CAAA;IAC/D,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,+BAA+B,GAAG,EAAE,CAAA;IAC7C,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,gBAAgB,CAC7B,GAAW,EACX,OAA+B,EAC/B,MAAmB;IAEnB,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAA;IAC5B,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAA;QACvD,IAAI,QAAQ,CAAC
,EAAE,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;YACrC,OAAO;gBACL,OAAO;gBACP,GAAG,EAAE,QAAQ,CAAC,GAAG,IAAI,IAAI;gBACzB,WAAW,EAAE,aAAa;aAC3B,CAAA;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,qDAAqD;IACvD,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,OAAyB;IACvD,MAAM,EACJ,GAAG,EACH,MAAM,EAAE,YAAY,EACpB,SAAS,EACT,SAAS,GAAG,kBAAkB,EAC9B,eAAe,GAAG,IAAI,GACvB,GAAG,OAAO,CAAA;IAEX,MAAM,MAAM,GAAG,WAAW,CAAC,YAAY,EAAE,SAAS,CAAC,CAAA;IACnD,MAAM,OAAO,GAAG,YAAY,CAAC,SAAS,CAAC,CAAA;IAEvC,+BAA+B;IAC/B,IAAI,QAAkB,CAAA;IACtB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAA;IAClD,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,2DAA2D;QAC3D,IAAI,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;YACxB,MAAM,MAAM,GACV,KAAK,YAAY,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc;gBAC5D,CAAC,CAAC,mBAAmB;gBACrB,CAAC,CAAC,qBAAqB,CAAA;YAC3B,OAAO;gBACL,OAAO,EAAE,EAAE;gBACX,GAAG;gBACH,WAAW,EAAE,MAAM;gBACnB,KAAK,EAAE,MAAM;aACd,CAAA;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,eAAe,EAAE,CAAC;YACpB,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,EAAE,MAAM,CAAC,CAAA;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO,aAAa,CAAA;YACtB,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QACtE,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,kBAAkB,OAAO,EAAE;SACnC,CAAA;IACH,CAAC;IAED,+BAA+B;IAC/B,IAAI,mBAAmB,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7C,IAAI,eAAe,EAAE,CAAC;YACpB,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,EAAE,MAAM,CAAC,CAAA;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO,aAAa,CAAA;YACtB,CAAC;QACH,CAAC;QAED,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,QAAQ,QAAQ,CAAC,MAAM,YAAY;SAC3C,CAAA;IACH,CAAC;IAED,yEAAyE;IACzE,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,QAAQ,QAAQ,CAAC,MAAM,EAAE;SACjC,CAAA;IACH,CAAC;IAED,8CAA8C;IAC9C,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;IAElC,IAAI,aAAa,CAAC,IAAI
,CAAC,EAAE,CAAC;QACxB,IAAI,eAAe,EAAE,CAAC;YACpB,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,EAAE,MAAM,CAAC,CAAA;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO,aAAa,CAAA;YACtB,CAAC;QACH,CAAC;QAED,OAAO;YACL,OAAO,EAAE,EAAE;YACX,GAAG;YACH,WAAW,EAAE,MAAM;YACnB,KAAK,EAAE,6CAA6C;SACrD,CAAA;IACH,CAAC;IAED,kBAAkB;IAClB,OAAO;QACL,OAAO,EAAE,IAAI;QACb,GAAG,EAAE,QAAQ,CAAC,GAAG,IAAI,GAAG;QACxB,WAAW,EAAE,QAAQ;KACtB,CAAA;AACH,CAAC"}
@@ -0,0 +1,99 @@
1
+ /**
2
+ * HTML sanitization and text extraction utilities.
3
+ *
4
+ * Provides a complete pipeline for converting raw HTML to clean plain text:
5
+ * script/style tag removal (via state machines for robustness), HTML tag
6
+ * stripping, entity decoding, whitespace normalization, and optional
7
+ * code-fragment detection/removal.
8
+ */
9
+ /**
10
+ * Decode HTML entities in a string using the `he` library.
11
+ *
12
+ * Handles all HTML entities including named (&amp;), decimal (&#38;),
13
+ * and hexadecimal (&#x26;) numeric entities.
14
+ *
15
+ * @param text - Text containing HTML entities
16
+ * @returns Decoded text
17
+ */
18
+ export declare function decodeHtmlEntities(text: string): string;
19
+ /**
20
+ * Remove script tags and their content from HTML.
21
+ *
22
+ * Uses a state-machine approach that handles edge cases like malformed
23
+ * tags better than a single regex would.
24
+ *
25
+ * @param html - HTML string
26
+ * @returns HTML with script tags and their content removed
27
+ */
28
+ export declare function removeScriptTags(html: string): string;
29
+ /**
30
+ * Remove style tags and their content from HTML.
31
+ *
32
+ * Uses the same state-machine approach as removeScriptTags.
33
+ *
34
+ * @param html - HTML string
35
+ * @returns HTML with style tags and their content removed
36
+ */
37
+ export declare function removeStyleTags(html: string): string;
38
+ /**
39
+ * Strip all HTML tags from a string, replacing them with spaces.
40
+ *
41
+ * @param html - HTML string
42
+ * @returns Plain text with tags replaced by spaces
43
+ */
44
+ export declare function stripHtmlTags(html: string): string;
45
+ /**
46
+ * Convert HTML to clean plain text.
47
+ *
48
+ * Applies the full sanitization pipeline:
49
+ * 1. Remove script tags and content (state machine)
50
+ * 2. Remove style tags and content (state machine)
51
+ * 3. Strip remaining HTML tags
52
+ * 4. Decode HTML entities
53
+ * 5. Normalize whitespace
54
+ *
55
+ * @param html - HTML string to clean
56
+ * @returns Clean plain text
57
+ */
58
+ export declare function htmlToText(html: string): string;
59
+ /**
60
+ * Decode HTML entities and normalize whitespace without removing tags.
61
+ *
62
+ * Useful when you want to preserve HTML structure but decode entities.
63
+ *
64
+ * @param html - HTML string
65
+ * @returns HTML with decoded entities and normalized whitespace
66
+ */
67
+ export declare function cleanHtmlEntities(html: string): string;
68
+ /**
69
+ * Detect if text looks like programming code using heuristics.
70
+ *
71
+ * Designed to catch JavaScript/TypeScript code fragments that might
72
+ * appear in scraped web pages from client-side rendered sites.
73
+ *
74
+ * @param text - Text to analyze
75
+ * @returns True if text appears to be programming code
76
+ */
77
+ export declare function looksLikeCode(text: string): boolean;
78
+ /**
79
+ * Strip code segments from text, keeping natural language content.
80
+ *
81
+ * If the entire text looks like code, returns an empty string.
82
+ * Otherwise splits by sentence/code boundaries and filters out
83
+ * segments that match code heuristics.
84
+ *
85
+ * @param text - Text that may contain code segments
86
+ * @returns Text with code segments removed
87
+ */
88
+ export declare function stripCodeFromText(text: string): string;
89
+ /**
90
+ * Convert HTML to clean plain text, also stripping code-like content.
91
+ *
92
+ * Combines the full HTML cleaning pipeline with code detection for
93
+ * maximum safety when processing scraped web pages.
94
+ *
95
+ * @param html - HTML string to clean
96
+ * @returns Clean plain text with code segments removed
97
+ */
98
+ export declare function htmlToTextClean(html: string): string;
99
+ //# sourceMappingURL=html-utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-utils.d.ts","sourceRoot":"","sources":["../../src/shared/html-utils.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH;;;;;;;;GAQG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAED;;;;;;;;GAQG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAuCrD;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiCpD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAElD;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiB/C;AAED;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAItD;AAsCD;;;;;;;;GAQG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAcnD;AAED;;;;;;;;;GASG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAoBtD;AAED;;;;;;;;GAQG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAGpD"}
@@ -0,0 +1,246 @@
1
+ /**
2
+ * HTML sanitization and text extraction utilities.
3
+ *
4
+ * Provides a complete pipeline for converting raw HTML to clean plain text:
5
+ * script/style tag removal (via state machines for robustness), HTML tag
6
+ * stripping, entity decoding, whitespace normalization, and optional
7
+ * code-fragment detection/removal.
8
+ */
9
+ import he from "he";
10
/**
 * Decode the HTML entities in a string by delegating to the `he` library.
 *
 * Covers named (&amp;), decimal (&#38;) and hexadecimal (&#x26;)
 * entity forms.
 *
 * @param text - Text containing HTML entities
 * @returns The text with entities decoded
 */
export function decodeHtmlEntities(text) {
    const decoded = he.decode(text);
    return decoded;
}
22
/**
 * Remove script tags and their content from HTML.
 *
 * Uses a state-machine approach that handles edge cases like malformed
 * tags better than a single regex would. If an opening `<script` has no
 * closing `>` or no matching `</script ...>`, everything from that opening
 * tag to the end of the input is dropped.
 *
 * Note: matching is by the prefix `<script`, so a tag whose name merely
 * starts with "script" is treated as a script tag too.
 *
 * @param html - HTML string
 * @returns HTML with script tags and their content removed
 */
export function removeScriptTags(html) {
    // Lowercase ASCII letters only. String.prototype.toLowerCase() can change
    // the string length (e.g. U+0130 becomes two code units), which would
    // misalign indices found in the lowered copy with positions in `html`.
    const lowerHtml = html.replace(/[A-Z]/g, (ch) => ch.toLowerCase());
    let result = "";
    let i = 0;
    while (i < html.length) {
        const scriptStart = lowerHtml.indexOf("<script", i);
        if (scriptStart === -1) {
            result += html.slice(i);
            break;
        }
        // Add content before script tag
        result += html.slice(i, scriptStart);
        // Find the end of the opening script tag
        const tagEnd = html.indexOf(">", scriptStart);
        if (tagEnd === -1) {
            // Malformed - no closing bracket, skip rest
            break;
        }
        // Find closing </script> tag
        const scriptEnd = lowerHtml.indexOf("</script", tagEnd);
        if (scriptEnd === -1) {
            // No closing tag, skip rest of document
            break;
        }
        // Find end of closing tag
        const closeEnd = html.indexOf(">", scriptEnd);
        if (closeEnd === -1) {
            break;
        }
        i = closeEnd + 1;
    }
    return result;
}
64
/**
 * Remove style tags and their content from HTML.
 *
 * Scans for `<style` ... `</style ...>` pairs with the same state-machine
 * approach used for script tags. If an opening `<style` has no closing `>`
 * or no matching `</style ...>`, everything from that opening tag to the
 * end of the input is dropped.
 *
 * Note: matching is by the prefix `<style`, so a tag whose name merely
 * starts with "style" is treated as a style tag too.
 *
 * @param html - HTML string
 * @returns HTML with style tags and their content removed
 */
export function removeStyleTags(html) {
    // Lowercase ASCII letters only. String.prototype.toLowerCase() can change
    // the string length (e.g. U+0130 becomes two code units), which would
    // misalign indices found in the lowered copy with positions in `html`.
    const lowerHtml = html.replace(/[A-Z]/g, (ch) => ch.toLowerCase());
    let result = "";
    let i = 0;
    while (i < html.length) {
        const styleStart = lowerHtml.indexOf("<style", i);
        if (styleStart === -1) {
            result += html.slice(i);
            break;
        }
        // Keep everything before the style tag
        result += html.slice(i, styleStart);
        // End of the opening tag
        const tagEnd = html.indexOf(">", styleStart);
        if (tagEnd === -1) {
            break;
        }
        // Start of the closing tag
        const styleEnd = lowerHtml.indexOf("</style", tagEnd);
        if (styleEnd === -1) {
            break;
        }
        // End of the closing tag
        const closeEnd = html.indexOf(">", styleEnd);
        if (closeEnd === -1) {
            break;
        }
        i = closeEnd + 1;
    }
    return result;
}
99
/**
 * Replace every HTML tag in a string with a single space.
 *
 * A tag is any `<...>` run containing at least one character other than `>`.
 *
 * @param html - HTML string
 * @returns The text with each tag replaced by one space
 */
export function stripHtmlTags(html) {
    const tagPattern = /<[^>]+>/g;
    const stripped = html.replace(tagPattern, " ");
    return stripped;
}
108
/**
 * Convert HTML to clean plain text.
 *
 * Runs the sanitization steps in this order:
 * 1. Drop script tags and their content
 * 2. Drop style tags and their content
 * 3. Replace the remaining tags with spaces
 * 4. Decode HTML entities
 * 5. Collapse whitespace runs to single spaces and trim the ends
 *
 * Script and style elements are removed before generic tag stripping so
 * that their contents do not survive as text.
 *
 * @param html - HTML string to clean
 * @returns Clean plain text
 */
export function htmlToText(html) {
    const withoutScripts = removeScriptTags(html);
    const withoutStyles = removeStyleTags(withoutScripts);
    const untagged = stripHtmlTags(withoutStyles);
    const decoded = decodeHtmlEntities(untagged);
    return decoded.replace(/\s+/g, " ").trim();
}
134
/**
 * Decode HTML entities and normalize whitespace, leaving tags in place.
 *
 * Handy when the markup structure should be preserved but entities need to
 * be turned into their characters.
 *
 * @param html - HTML string
 * @returns HTML with decoded entities, whitespace runs collapsed to single
 *   spaces, and leading/trailing whitespace trimmed
 */
export function cleanHtmlEntities(html) {
  return decodeHtmlEntities(html).replace(/\s+/g, " ").trim();
}
147
// Regular expressions that are strong signals of JavaScript/TypeScript code.
// Kept at module scope so the array is built once rather than on every call.
// None of them use the `g`/`y` flags, so `.test()` carries no state between calls.
const CODE_PATTERNS = [
  // Declarations, control flow and language keywords
  /\bfunction\s*\(/,
  /\b(?:const|let|var)\s+\w+\s*=/,
  /\bif\s*\([^)]+\)\s*\{/,
  /\bdocument\.\w+/,
  /=>\s*[{(]/,
  /\bthis\.\w+\s*[=;]/,
  /\breturn\s+(?:this|null|true|false|undefined)\b/,
  /\bclass\s+\w+\s*\{/,
  /\b(?:async|await)\s+\w+/,
  /\b(?:try|catch|throw)\s*[{(]/,
  // Browser globals
  /\bwindow\.\w+/,
  /\bconsole\.\w+/,
  // Array methods, operators, indexing and loops
  /\.(?:push|pop|shift|unshift|slice|splice|map|filter|reduce)\s*\(/,
  /\b(?:new|delete|typeof|instanceof)\s+\w+/,
  /\[\s*\d+\s*\]/,
  /===|!==|&&|\|\|/,
  /\bfor\s*\([^)]+\)/,
  /\bwhile\s*\([^)]+\)/,
  /\bswitch\s*\([^)]+\)/,
  /\)\s*\{|\{\s*$/,
  // DOM property assignments and lookups
  /\.innerHTML\s*=/,
  /\.innerText\s*=/,
  /\.textContent\s*=/,
  /\.value\s*=/,
  /\.style\.\w+\s*=/,
  /\.getElementById\s*\(/,
  /\.querySelector\s*\(/,
  /\.addEventListener\s*\(/,
];

/** How many distinct patterns must match before text is treated as code. */
const CODE_PATTERN_THRESHOLD = 2;

/**
 * Heuristically decide whether a piece of text looks like programming code.
 *
 * Intended for JavaScript/TypeScript fragments that leak into text scraped
 * from client-side rendered pages. Each pattern counts at most once, and
 * scanning stops as soon as the threshold is reached.
 *
 * @param text - Text to analyze
 * @returns True if at least CODE_PATTERN_THRESHOLD patterns match; always
 *   false for empty input or input shorter than 20 characters
 */
export function looksLikeCode(text) {
  if (!text || text.length < 20) {
    return false;
  }
  let hits = 0;
  // `some` short-circuits on the first pattern that brings the tally to the threshold.
  return CODE_PATTERNS.some((re) => re.test(text) && ++hits >= CODE_PATTERN_THRESHOLD);
}
204
/**
 * Strip code segments from text, keeping natural language content.
 *
 * When the text as a whole is classified as code, the result is an empty
 * string. Otherwise the text is split after sentence punctuation (. ! ?)
 * or code delimiters (; { }) followed by whitespace, and each trimmed
 * segment is kept only if it is at least 15 characters long and is not
 * itself classified as code. Kept segments are joined with single spaces.
 *
 * @param text - Text that may contain code segments
 * @returns Text with code segments (and segments under 15 characters) removed
 */
export function stripCodeFromText(text) {
  // Empty input and wholly code-like input both yield an empty string.
  if (!text || looksLikeCode(text)) {
    return "";
  }
  const kept = [];
  for (const piece of text.split(/(?<=[.!?])\s+|(?<=[;{}])\s+/)) {
    const candidate = piece.trim();
    if (candidate.length >= 15 && !looksLikeCode(candidate)) {
      kept.push(candidate);
    }
  }
  return kept.join(" ").trim();
}
233
/**
 * Convert HTML to clean plain text and drop code-like content.
 *
 * Chains the HTML cleaning pipeline (htmlToText) with the code-segment
 * filter (stripCodeFromText); meant for text scraped from web pages.
 *
 * @param html - HTML string to clean
 * @returns Clean plain text with code segments removed
 */
export function htmlToTextClean(html) {
  return stripCodeFromText(htmlToText(html));
}
246
+ //# sourceMappingURL=html-utils.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-utils.js","sourceRoot":"","sources":["../../src/shared/html-utils.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,MAAM,IAAI,CAAA;AAEnB;;;;;;;;GAQG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;AACxB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,IAAI,MAAM,GAAG,EAAE,CAAA;IACf,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAEpC,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,CAAA;QACnD,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;YACvB,MAAK;QACP,CAAC;QAED,gCAAgC;QAChC,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAA;QAEpC,yCAAyC;QACzC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,WAAW,CAAC,CAAA;QAC7C,IAAI,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;YAClB,4CAA4C;YAC5C,MAAK;QACP,CAAC;QAED,6BAA6B;QAC7B,MAAM,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC,CAAA;QACvD,IAAI,SAAS,KAAK,CAAC,CAAC,EAAE,CAAC;YACrB,wCAAwC;YACxC,MAAK;QACP,CAAC;QAED,0BAA0B;QAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,SAAS,CAAC,CAAA;QAC7C,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,MAAK;QACP,CAAC;QAED,CAAC,GAAG,QAAQ,GAAG,CAAC,CAAA;IAClB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,IAAI,MAAM,GAAG,EAAE,CAAA;IACf,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAEpC,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,UAAU,GAAG,SAAS,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAA;QACjD,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;YACtB,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;YACvB,MAAK;QACP,CAAC;QAED,MAAM,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;QAEnC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,UAAU,CAAC,CAAA;QAC5C,IAAI,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;YAClB,MAAK;QACP,CAAC;QAED,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;QACrD,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,MAAK;QACP,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,QAAQ,CA
AC,CAAA;QAC5C,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACpB,MAAK;QACP,CAAC;QAED,CAAC,GAAG,QAAQ,GAAG,CAAC,CAAA;IAClB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;AACtC,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,IAAI,GAAG,IAAI,CAAA;IAEf,iEAAiE;IACjE,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;IAC7B,IAAI,GAAG,eAAe,CAAC,IAAI,CAAC,CAAA;IAE5B,6BAA6B;IAC7B,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,CAAA;IAE1B,uBAAuB;IACvB,IAAI,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;IAE/B,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IAEvC,OAAO,IAAI,CAAA;AACb,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,IAAI,IAAI,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;IACnC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACvC,OAAO,IAAI,CAAA;AACb,CAAC;AAED,8DAA8D;AAC9D,+DAA+D;AAC/D,MAAM,aAAa,GAAG;IACpB,iBAAiB;IACjB,+BAA+B;IAC/B,uBAAuB;IACvB,iBAAiB;IACjB,WAAW;IACX,oBAAoB;IACpB,iDAAiD;IACjD,oBAAoB;IACpB,yBAAyB;IACzB,8BAA8B;IAC9B,eAAe;IACf,gBAAgB;IAChB,kEAAkE;IAClE,0CAA0C;IAC1C,eAAe;IACf,iBAAiB;IACjB,mBAAmB;IACnB,qBAAqB;IACrB,sBAAsB;IACtB,gBAAgB;IAChB,iBAAiB;IACjB,iBAAiB;IACjB,mBAAmB;IACnB,aAAa;IACb,kBAAkB;IAClB,uBAAuB;IACvB,sBAAsB;IACtB,yBAAyB;CAC1B,CAAA;AAED,2EAA2E;AAC3E,MAAM,sBAAsB,GAAG,CAAC,CAAA;AAEhC;;;;;;;;GAQG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;QAAE,OAAO,KAAK,CAAA;IAE3C,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,KAAK,MAAM,OAAO,IAAI,aAAa,EAAE,CAAC;QACpC,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,UAAU,EAAE,CAAA;YACZ,IAAI,UAAU,IAAI,sBAAsB,EAAE,CAAC;gBACzC,OAAO,IAAI,CAAA;YACb,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAA;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAA;IAEpB,kDAAkD;IAClD,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,6DAA6D;IAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAA;IAE1D,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,QAAQ;SACtB,GAAG,
CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;SAChC,MAAM,CAAC,CAAC,OAAO,EAAE,EAAE;QAClB,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;YAAE,OAAO,KAAK,CAAA;QACrC,OAAO,CAAC,aAAa,CAAC,OAAO,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEJ,OAAO,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;AAClC,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;IAC7B,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAA;AAChC,CAAC"}
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Article extraction using Mozilla Readability.
3
+ *
4
+ * Uses the same algorithm behind Firefox Reader View to extract article
5
+ * content from raw HTML, stripping navigation, ads, sidebars, and other
6
+ * non-content elements. Far more reliable than regex-based extraction.
7
+ *
8
+ * Dependencies: @mozilla/readability, jsdom
9
+ */
10
+ /** Result of extracting article content from HTML. Every field except `text` may be null. */
11
+ export interface ArticleExtractionResult {
12
+ text: string; // plain-text body of the extracted article
13
+ title: string | null; // article title, or null when none was found
14
+ author: string | null; // article byline, or null when none was found
15
+ excerpt: string | null; // article excerpt, or null when none was found
16
+ siteName: string | null; // site name, or null when none was found
17
+ }
18
+ /**
19
+ * Extract article content from raw HTML using Mozilla Readability.
20
+ *
21
+ * Parses the HTML into a DOM, runs Mozilla's Readability algorithm to
22
+ * identify the main article body, and returns the plain text content
23
+ * along with metadata (title, author, excerpt, site name).
24
+ *
25
+ * Returns null if Readability cannot identify article content or if
26
+ * the extracted text is shorter than 100 characters (string length, untrimmed).
27
+ *
28
+ * @param html - Raw HTML string to extract from
29
+ * @param url - Optional document URL, used for resolving relative links in the HTML
30
+ * @returns Extracted article content and metadata, or null when extraction fails or the text is too short
31
+ */
32
+ export declare function extractArticleContent(html: string, url?: string): ArticleExtractionResult | null;
33
+ //# sourceMappingURL=readability-extract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"readability-extract.d.ts","sourceRoot":"","sources":["../../src/shared/readability-extract.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAKH,sDAAsD;AACtD,MAAM,WAAW,uBAAuB;IACtC,IAAI,EAAE,MAAM,CAAA;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAA;IACpB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAA;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAA;IACtB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAA;CACxB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,uBAAuB,GAAG,IAAI,CAqBhG"}
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Article extraction using Mozilla Readability.
3
+ *
4
+ * Uses the same algorithm behind Firefox Reader View to extract article
5
+ * content from raw HTML, stripping navigation, ads, sidebars, and other
6
+ * non-content elements. Far more reliable than regex-based extraction.
7
+ *
8
+ * Dependencies: @mozilla/readability, jsdom
9
+ */
10
+ import { Readability } from "@mozilla/readability";
11
+ import { JSDOM, VirtualConsole } from "jsdom";
12
/**
 * Extract article content from raw HTML using Mozilla Readability.
 *
 * Parses the HTML into a JSDOM document, runs Mozilla's Readability
 * algorithm to identify the main article body, and returns the plain text
 * content along with metadata (title, author, excerpt, site name).
 *
 * Returns null if Readability cannot identify article content or if the
 * extracted text is shorter than 100 characters. The JSDOM window is always
 * closed before returning so its resources are released.
 *
 * NOTE(review): `url` is handed to JSDOM unchanged; an empty string is
 * treated as absent, but an unparsable URL presumably makes the JSDOM
 * constructor throw — confirm callers only pass absolute URLs.
 *
 * @param html - Raw HTML string to extract from
 * @param url - Optional URL for resolving relative links in the HTML
 * @returns Extracted article content and metadata, or null
 */
export function extractArticleContent(html, url) {
  // Suppress all JSDOM console output (CSS parsing warnings, resource loading
  // errors, etc.). These are expected and harmless when parsing arbitrary web
  // pages. No listeners are attached, so everything is silently discarded.
  const virtualConsole = new VirtualConsole();
  const dom = new JSDOM(html, { url: url || undefined, virtualConsole });
  try {
    const article = new Readability(dom.window.document).parse();
    if (!article || !article.textContent || article.textContent.length < 100) {
      return null;
    }
    // Only plain strings are copied out, so the result stays valid after
    // the window is closed.
    return {
      text: article.textContent,
      title: article.title || null,
      author: article.byline || null,
      excerpt: article.excerpt || null,
      siteName: article.siteName || null,
    };
  } finally {
    // Shut the window down so the parsed document can be released.
    dom.window.close();
  }
}
45
+ //# sourceMappingURL=readability-extract.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"readability-extract.js","sourceRoot":"","sources":["../../src/shared/readability-extract.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,OAAO,EAAE,KAAK,EAAE,cAAc,EAAE,MAAM,OAAO,CAAA;AAW7C;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,qBAAqB,CAAC,IAAY,EAAE,GAAY;IAC9D,0FAA0F;IAC1F,2FAA2F;IAC3F,MAAM,cAAc,GAAG,IAAI,cAAc,EAAE,CAAA;IAC3C,mEAAmE;IAEnE,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,GAAG,IAAI,SAAS,EAAE,cAAc,EAAE,CAAC,CAAA;IACtE,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAA;IACnD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAA;IAE9B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QACzE,OAAO,IAAI,CAAA;IACb,CAAC;IAED,OAAO;QACL,IAAI,EAAE,OAAO,CAAC,WAAW;QACzB,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,IAAI;QAC5B,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI;QAC9B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,IAAI;QAChC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,IAAI;KACnC,CAAA;AACH,CAAC"}