@debriefer/sources 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. package/README.md +59 -0
  2. package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
  3. package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
  4. package/dist/__tests__/archives/chronicling-america.test.js +151 -0
  5. package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
  6. package/dist/__tests__/archives/europeana.test.d.ts +8 -0
  7. package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
  8. package/dist/__tests__/archives/europeana.test.js +200 -0
  9. package/dist/__tests__/archives/europeana.test.js.map +1 -0
  10. package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
  11. package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
  12. package/dist/__tests__/archives/internet-archive.test.js +189 -0
  13. package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
  14. package/dist/__tests__/archives/trove.test.d.ts +8 -0
  15. package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
  16. package/dist/__tests__/archives/trove.test.js +202 -0
  17. package/dist/__tests__/archives/trove.test.js.map +1 -0
  18. package/dist/__tests__/books/google-books.test.d.ts +8 -0
  19. package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
  20. package/dist/__tests__/books/google-books.test.js +221 -0
  21. package/dist/__tests__/books/google-books.test.js.map +1 -0
  22. package/dist/__tests__/books/open-library.test.d.ts +8 -0
  23. package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
  24. package/dist/__tests__/books/open-library.test.js +159 -0
  25. package/dist/__tests__/books/open-library.test.js.map +1 -0
  26. package/dist/__tests__/news/guardian.test.d.ts +9 -0
  27. package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
  28. package/dist/__tests__/news/guardian.test.js +224 -0
  29. package/dist/__tests__/news/guardian.test.js.map +1 -0
  30. package/dist/__tests__/news/nytimes.test.d.ts +9 -0
  31. package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
  32. package/dist/__tests__/news/nytimes.test.js +271 -0
  33. package/dist/__tests__/news/nytimes.test.js.map +1 -0
  34. package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
  35. package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
  36. package/dist/__tests__/news/site-search-source.test.js +342 -0
  37. package/dist/__tests__/news/site-search-source.test.js.map +1 -0
  38. package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
  39. package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
  40. package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
  41. package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
  42. package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
  43. package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
  44. package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
  45. package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
  46. package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
  47. package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
  48. package/dist/__tests__/shared/fetch-page.test.js +281 -0
  49. package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
  50. package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
  51. package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
  52. package/dist/__tests__/shared/html-utils.test.js +169 -0
  53. package/dist/__tests__/shared/html-utils.test.js.map +1 -0
  54. package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
  55. package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
  56. package/dist/__tests__/shared/readability-extract.test.js +107 -0
  57. package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
  58. package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
  59. package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
  60. package/dist/__tests__/shared/sanitize-text.test.js +77 -0
  61. package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
  62. package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
  63. package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
  64. package/dist/__tests__/shared/search-utils.test.js +26 -0
  65. package/dist/__tests__/shared/search-utils.test.js.map +1 -0
  66. package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
  67. package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
  68. package/dist/__tests__/structured/wikidata.test.js +509 -0
  69. package/dist/__tests__/structured/wikidata.test.js.map +1 -0
  70. package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
  71. package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
  72. package/dist/__tests__/structured/wikipedia.test.js +643 -0
  73. package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
  74. package/dist/__tests__/web-search/base.test.d.ts +9 -0
  75. package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
  76. package/dist/__tests__/web-search/base.test.js +622 -0
  77. package/dist/__tests__/web-search/base.test.js.map +1 -0
  78. package/dist/__tests__/web-search/bing.test.d.ts +10 -0
  79. package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
  80. package/dist/__tests__/web-search/bing.test.js +277 -0
  81. package/dist/__tests__/web-search/bing.test.js.map +1 -0
  82. package/dist/__tests__/web-search/brave.test.d.ts +10 -0
  83. package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
  84. package/dist/__tests__/web-search/brave.test.js +264 -0
  85. package/dist/__tests__/web-search/brave.test.js.map +1 -0
  86. package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
  87. package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
  88. package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
  89. package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
  90. package/dist/__tests__/web-search/google.test.d.ts +9 -0
  91. package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
  92. package/dist/__tests__/web-search/google.test.js +189 -0
  93. package/dist/__tests__/web-search/google.test.js.map +1 -0
  94. package/dist/archives/chronicling-america.d.ts +33 -0
  95. package/dist/archives/chronicling-america.d.ts.map +1 -0
  96. package/dist/archives/chronicling-america.js +85 -0
  97. package/dist/archives/chronicling-america.js.map +1 -0
  98. package/dist/archives/europeana.d.ts +37 -0
  99. package/dist/archives/europeana.d.ts.map +1 -0
  100. package/dist/archives/europeana.js +92 -0
  101. package/dist/archives/europeana.js.map +1 -0
  102. package/dist/archives/internet-archive.d.ts +32 -0
  103. package/dist/archives/internet-archive.d.ts.map +1 -0
  104. package/dist/archives/internet-archive.js +90 -0
  105. package/dist/archives/internet-archive.js.map +1 -0
  106. package/dist/archives/trove.d.ts +37 -0
  107. package/dist/archives/trove.d.ts.map +1 -0
  108. package/dist/archives/trove.js +97 -0
  109. package/dist/archives/trove.js.map +1 -0
  110. package/dist/books/google-books.d.ts +48 -0
  111. package/dist/books/google-books.d.ts.map +1 -0
  112. package/dist/books/google-books.js +111 -0
  113. package/dist/books/google-books.js.map +1 -0
  114. package/dist/books/open-library.d.ts +44 -0
  115. package/dist/books/open-library.d.ts.map +1 -0
  116. package/dist/books/open-library.js +103 -0
  117. package/dist/books/open-library.js.map +1 -0
  118. package/dist/index.d.ts +45 -0
  119. package/dist/index.d.ts.map +1 -0
  120. package/dist/index.js +35 -0
  121. package/dist/index.js.map +1 -0
  122. package/dist/news/guardian.d.ts +51 -0
  123. package/dist/news/guardian.d.ts.map +1 -0
  124. package/dist/news/guardian.js +131 -0
  125. package/dist/news/guardian.js.map +1 -0
  126. package/dist/news/nytimes.d.ts +27 -0
  127. package/dist/news/nytimes.d.ts.map +1 -0
  128. package/dist/news/nytimes.js +104 -0
  129. package/dist/news/nytimes.js.map +1 -0
  130. package/dist/news/site-search-source.d.ts +89 -0
  131. package/dist/news/site-search-source.d.ts.map +1 -0
  132. package/dist/news/site-search-source.js +182 -0
  133. package/dist/news/site-search-source.js.map +1 -0
  134. package/dist/news/sources.d.ts +52 -0
  135. package/dist/news/sources.d.ts.map +1 -0
  136. package/dist/news/sources.js +276 -0
  137. package/dist/news/sources.js.map +1 -0
  138. package/dist/obituary/find-a-grave.d.ts +43 -0
  139. package/dist/obituary/find-a-grave.d.ts.map +1 -0
  140. package/dist/obituary/find-a-grave.js +173 -0
  141. package/dist/obituary/find-a-grave.js.map +1 -0
  142. package/dist/shared/duckduckgo-search.d.ts +86 -0
  143. package/dist/shared/duckduckgo-search.d.ts.map +1 -0
  144. package/dist/shared/duckduckgo-search.js +218 -0
  145. package/dist/shared/duckduckgo-search.js.map +1 -0
  146. package/dist/shared/fetch-page.d.ts +50 -0
  147. package/dist/shared/fetch-page.d.ts.map +1 -0
  148. package/dist/shared/fetch-page.js +212 -0
  149. package/dist/shared/fetch-page.js.map +1 -0
  150. package/dist/shared/html-utils.d.ts +99 -0
  151. package/dist/shared/html-utils.d.ts.map +1 -0
  152. package/dist/shared/html-utils.js +246 -0
  153. package/dist/shared/html-utils.js.map +1 -0
  154. package/dist/shared/readability-extract.d.ts +33 -0
  155. package/dist/shared/readability-extract.d.ts.map +1 -0
  156. package/dist/shared/readability-extract.js +45 -0
  157. package/dist/shared/readability-extract.js.map +1 -0
  158. package/dist/shared/sanitize-text.d.ts +24 -0
  159. package/dist/shared/sanitize-text.d.ts.map +1 -0
  160. package/dist/shared/sanitize-text.js +49 -0
  161. package/dist/shared/sanitize-text.js.map +1 -0
  162. package/dist/shared/search-utils.d.ts +18 -0
  163. package/dist/shared/search-utils.d.ts.map +1 -0
  164. package/dist/shared/search-utils.js +20 -0
  165. package/dist/shared/search-utils.js.map +1 -0
  166. package/dist/structured/wikidata.d.ts +128 -0
  167. package/dist/structured/wikidata.d.ts.map +1 -0
  168. package/dist/structured/wikidata.js +361 -0
  169. package/dist/structured/wikidata.js.map +1 -0
  170. package/dist/structured/wikipedia.d.ts +184 -0
  171. package/dist/structured/wikipedia.d.ts.map +1 -0
  172. package/dist/structured/wikipedia.js +275 -0
  173. package/dist/structured/wikipedia.js.map +1 -0
  174. package/dist/web-search/base.d.ts +128 -0
  175. package/dist/web-search/base.d.ts.map +1 -0
  176. package/dist/web-search/base.js +251 -0
  177. package/dist/web-search/base.js.map +1 -0
  178. package/dist/web-search/bing.d.ts +21 -0
  179. package/dist/web-search/bing.d.ts.map +1 -0
  180. package/dist/web-search/bing.js +53 -0
  181. package/dist/web-search/bing.js.map +1 -0
  182. package/dist/web-search/brave.d.ts +21 -0
  183. package/dist/web-search/brave.d.ts.map +1 -0
  184. package/dist/web-search/brave.js +56 -0
  185. package/dist/web-search/brave.js.map +1 -0
  186. package/dist/web-search/duckduckgo.d.ts +15 -0
  187. package/dist/web-search/duckduckgo.d.ts.map +1 -0
  188. package/dist/web-search/duckduckgo.js +21 -0
  189. package/dist/web-search/duckduckgo.js.map +1 -0
  190. package/dist/web-search/google.d.ts +24 -0
  191. package/dist/web-search/google.d.ts.map +1 -0
  192. package/dist/web-search/google.js +48 -0
  193. package/dist/web-search/google.js.map +1 -0
  194. package/package.json +58 -0
@@ -0,0 +1,173 @@
1
+ /**
2
+ * Find a Grave obituary source.
3
+ *
4
+ * Searches findagrave.com for memorial pages by name, extracts biography
5
+ * content via Readability (with regex fallback), and returns sanitized text.
6
+ *
7
+ * Find a Grave is user-generated content — anyone can create or edit memorials.
8
+ * Reliability tier is UNRELIABLE_UGC (0.35) per Wikipedia RSP guidelines.
9
+ */
10
+ import { BaseResearchSource, ReliabilityTier, } from "@debriefer/core";
11
+ import { fetchPage } from "../shared/fetch-page.js";
12
+ import { extractArticleContent } from "../shared/readability-extract.js";
13
+ import { sanitizeSourceText } from "../shared/sanitize-text.js";
14
+ // ============================================================================
15
+ // Constants
16
+ // ============================================================================
17
+ const SEARCH_BASE_URL = "https://www.findagrave.com/memorial/search";
18
+ const MEMORIAL_URL_PATTERN = /\/memorial\/(\d+)\//g;
19
+ const MIN_BIO_LENGTH = 100;
20
+ const BIO_REGEX = /<div[^>]*id="bio"[^>]*>([\s\S]*?)<\/div>/i;
21
+ // ============================================================================
22
+ // Source Implementation
23
+ // ============================================================================
24
/**
 * Find a Grave source for obituary / memorial content.
 *
 * Pipeline:
 * 1. Tokenize the subject's name (blank names short-circuit to `null`)
 * 2. Search findagrave.com by first/last name
 * 3. Parse memorial URLs from the search results HTML
 * 4. Keep URLs whose slug matches the subject's normalized name
 * 5. Fetch the memorial page via fetchPage
 * 6. Extract the bio via Readability, falling back to a regex
 * 7. Sanitize the text and return it if it is long enough
 */
export class FindAGraveSource extends BaseResearchSource {
    name = "Find a Grave";
    type = "find-a-grave";
    reliabilityTier = ReliabilityTier.UNRELIABLE_UGC;
    domain = "www.findagrave.com";
    isFree = true;
    estimatedCostPerQuery = 0;
    /**
     * @param options - Source options; `rateLimitMs` defaults to 2000 unless overridden.
     */
    constructor(options = {}) {
        super({ rateLimitMs: 2000, ...options });
    }
    /**
     * Look up a memorial page for `subject.name` and return its biography text.
     *
     * @param subject - Research subject; only `subject.name` is read here.
     * @param signal - Abort signal forwarded to both page fetches.
     * @returns A result object with the sanitized bio text and memorial URL, or
     *   `null` when the name is blank, no matching memorial is found, a fetch
     *   yields no content, or the bio is shorter than MIN_BIO_LENGTH.
     */
    async fetchResult(subject, signal) {
        // Step 1: Tokenize the name once so the search query and the slug
        // comparison below are built from the same whitespace-normalized parts.
        const nameParts = subject.name
            .trim()
            .split(/\s+/)
            .filter((part) => part.length > 0);
        if (nameParts.length === 0) {
            // A blank name would produce an empty slug, and "".includes-style
            // matching would then accept every memorial on the page.
            return null;
        }
        const firstName = nameParts[0];
        const lastName = nameParts.slice(1).join(" ");
        // Step 2: Search for memorials
        const searchUrl = `${SEARCH_BASE_URL}?firstname=${encodeURIComponent(firstName)}&lastname=${encodeURIComponent(lastName)}&orderby=r`;
        const searchPage = await fetchPage({
            url: searchUrl,
            signal,
            archiveFallback: false,
        });
        if (searchPage.fetchMethod === "none" || !searchPage.content) {
            return null;
        }
        const searchHtml = searchPage.content;
        // Step 3: Extract memorial URLs.
        // Search result links look like: /memorial/12345/john-wayne
        // A Set de-duplicates while preserving first-seen order.
        const uniqueMemorialUrls = new Set();
        let match;
        // The pattern is a shared global regex; reset lastIndex before use.
        MEMORIAL_URL_PATTERN.lastIndex = 0;
        while ((match = MEMORIAL_URL_PATTERN.exec(searchHtml)) !== null) {
            // Expand the match outward to the enclosing double quotes to
            // recover the whole attribute value (including the slug).
            const startQuoteIdx = searchHtml.lastIndexOf('"', match.index);
            const endQuoteIdx = searchHtml.indexOf('"', match.index + match[0].length);
            if (startQuoteIdx === -1 || endQuoteIdx === -1) {
                continue;
            }
            const path = searchHtml.slice(startQuoteIdx + 1, endQuoteIdx);
            // Only site-relative memorial links are accepted.
            if (path.startsWith("/memorial/")) {
                uniqueMemorialUrls.add(`https://www.findagrave.com${path}`);
            }
        }
        const memorialUrls = [...uniqueMemorialUrls];
        if (memorialUrls.length === 0) {
            return null;
        }
        // Step 4: Prefer exact slug matches for the subject's normalized name.
        // Built from the trimmed tokens so stray leading/trailing whitespace in
        // subject.name cannot produce a slug like "-john-wayne-".
        const normalizedName = nameParts.join("-").toLowerCase();
        const exactSlugMatches = memorialUrls.filter((url) => {
            try {
                const { pathname } = new URL(url);
                const segments = pathname.split("/").filter((s) => s.length > 0);
                // Find a Grave memorial path: /memorial/{id}/{slug}
                const slug = segments[2]?.toLowerCase() ?? "";
                return slug === normalizedName;
            }
            catch {
                // Unparseable URL: it cannot be an exact match.
                return false;
            }
        });
        // Otherwise fall back to any URL that merely contains the normalized name.
        const matchingUrls = exactSlugMatches.length > 0
            ? exactSlugMatches
            : memorialUrls.filter((url) => url.toLowerCase().includes(normalizedName));
        if (matchingUrls.length === 0) {
            return null;
        }
        const memorialUrl = matchingUrls[0];
        // Step 5: Fetch the memorial page (rate-limited when a limiter is present)
        await this.rateLimiter?.acquire(this.domain, this.options.rateLimitMs);
        const page = await fetchPage({ url: memorialUrl, signal });
        if (page.fetchMethod === "none" || !page.content) {
            return null;
        }
        // Step 6: Extract bio content
        const actualUrl = page.url || memorialUrl;
        let bioText = null;
        // Try Readability first
        const extracted = extractArticleContent(page.content, actualUrl);
        if (extracted && extracted.text.length >= MIN_BIO_LENGTH) {
            bioText = extracted.text;
        }
        // Fall back to regex if Readability didn't get enough content
        if (!bioText) {
            const bioMatch = BIO_REGEX.exec(page.content);
            if (bioMatch && bioMatch[1]) {
                // Strip HTML tags from the regex-extracted content
                const stripped = bioMatch[1]
                    .replace(/<[^>]*>/g, " ")
                    .replace(/\s+/g, " ")
                    .trim();
                if (stripped.length >= MIN_BIO_LENGTH) {
                    bioText = stripped;
                }
            }
        }
        if (!bioText) {
            return null;
        }
        // Step 7: Sanitize and return; sanitizing may shorten the text, so the
        // minimum-length check is applied again.
        const text = sanitizeSourceText(bioText);
        if (text.length < MIN_BIO_LENGTH) {
            return null;
        }
        return {
            text,
            // NOTE(review): -1 looks like a "not scored here" sentinel — confirm
            // against the @debriefer/core result contract.
            confidence: -1,
            costUsd: 0,
            url: actualUrl,
            publication: "Find a Grave",
            metadata: {
                memorialUrl: actualUrl,
            },
        };
    }
}
159
+ // ============================================================================
160
+ // Factory Function
161
+ // ============================================================================
162
/**
 * Factory for {@link FindAGraveSource}.
 *
 * @param options - Passed through unchanged to the FindAGraveSource constructor.
 * @returns A new FindAGraveSource instance.
 *
 * @example
 * ```typescript
 * const source = findAGrave()
 * ```
 */
export function findAGrave(options) {
    const source = new FindAGraveSource(options);
    return source;
}
173
+ //# sourceMappingURL=find-a-grave.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"find-a-grave.js","sourceRoot":"","sources":["../../src/obituary/find-a-grave.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,kBAAkB,EAClB,eAAe,GAIhB,MAAM,iBAAiB,CAAA;AACxB,OAAO,EAAE,SAAS,EAAE,MAAM,yBAAyB,CAAA;AACnD,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAA;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAE/D,+EAA+E;AAC/E,YAAY;AACZ,+EAA+E;AAE/E,MAAM,eAAe,GAAG,4CAA4C,CAAA;AACpE,MAAM,oBAAoB,GAAG,sBAAsB,CAAA;AACnD,MAAM,cAAc,GAAG,GAAG,CAAA;AAC1B,MAAM,SAAS,GAAG,2CAA2C,CAAA;AAS7D,+EAA+E;AAC/E,wBAAwB;AACxB,+EAA+E;AAE/E;;;;;;;;;;GAUG;AACH,MAAM,OAAO,gBAAiB,SAAQ,kBAAmC;IAC9D,IAAI,GAAG,cAAc,CAAA;IACrB,IAAI,GAAG,cAAc,CAAA;IACrB,eAAe,GAAG,eAAe,CAAC,cAAc,CAAA;IAChD,MAAM,GAAG,oBAAoB,CAAA;IAC7B,MAAM,GAAG,IAAI,CAAA;IACb,qBAAqB,GAAG,CAAC,CAAA;IAElC,YAAY,UAA6B,EAAE;QACzC,KAAK,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAA;IAC1C,CAAC;IAES,KAAK,CAAC,WAAW,CACzB,OAAwB,EACxB,MAAmB;QAEnB,qCAAqC;QACrC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;QAClD,MAAM,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAA;QAEnD,+BAA+B;QAC/B,MAAM,SAAS,GAAG,GAAG,eAAe,cAAc,kBAAkB,CAAC,SAAS,CAAC,aAAa,kBAAkB,CAAC,QAAQ,CAAC,YAAY,CAAA;QAEpI,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC;YACjC,GAAG,EAAE,SAAS;YACd,MAAM;YACN,eAAe,EAAE,KAAK;SACvB,CAAC,CAAA;QAEF,IAAI,UAAU,CAAC,WAAW,KAAK,MAAM,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;YAC7D,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,UAAU,GAAG,UAAU,CAAC,OAAO,CAAA;QAErC,gCAAgC;QAChC,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,IAAI,KAA6B,CAAA;QACjC,6BAA6B;QAC7B,oBAAoB,CAAC,SAAS,GAAG,CAAC,CAAA;QAClC,OAAO,CAAC,KAAK,GAAG,oBAAoB,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAChE,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAA;YAC/B,8DAA8D;YAC9D,4DAA4D;YAC5D,MAAM,aAAa,GAAG,UAAU,CAAC,WAAW,CAAC,GAAG,EAAE,WAAW,CAAC,CAAA;YAC9D,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,EAAE,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC,CAAA;YAC3E,IAAI,aAAa,KAAK,CAAC,CAAC,IAAI
,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;gBAC/C,SAAQ;YACV,CAAC;YACD,MAAM,QAAQ,GAAG,aAAa,GAAG,CAAC,CAAA;YAClC,MAAM,MAAM,GAAG,WAAW,CAAA;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;YAE/C,IAAI,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;gBAClC,MAAM,GAAG,GAAG,6BAA6B,IAAI,EAAE,CAAA;gBAC/C,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;oBAChC,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;gBACxB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAA;QACb,CAAC;QAED,sEAAsE;QACtE,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;QAEtE,8EAA8E;QAC9E,MAAM,gBAAgB,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;YACnD,IAAI,CAAC;gBACH,MAAM,EAAE,QAAQ,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;gBACjC,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;gBAChE,oDAAoD;gBACpD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAA;gBAC7C,OAAO,IAAI,KAAK,cAAc,CAAA;YAChC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAA;YACd,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,MAAM,YAAY,GAChB,gBAAgB,CAAC,MAAM,GAAG,CAAC;YACzB,CAAC,CAAC,gBAAgB;YAClB,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAA;QAE9E,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,WAAW,GAAG,YAAY,CAAC,CAAC,CAAC,CAAA;QAEnC,kCAAkC;QAClC,MAAM,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,WAAY,CAAC,CAAA;QACvE,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QAE1D,IAAI,IAAI,CAAC,WAAW,KAAK,MAAM,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YACjD,OAAO,IAAI,CAAA;QACb,CAAC;QAED,8BAA8B;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,IAAI,WAAW,CAAA;QACzC,IAAI,OAAO,GAAkB,IAAI,CAAA;QAEjC,wBAAwB;QACxB,MAAM,SAAS,GAAG,qBAAqB,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,CAAC,CAAA;QAChE,IAAI,SAAS,IAAI,SAAS,CAAC,IAAI,CAAC,MAAM,IAAI,cAAc,EAAE,CAAC;YACzD,OAAO,GAAG,SAAS,CAAC,IAAI,CAAA;QAC1B,CAAC;QAED,8DAA8D;QAC9D,IAAI,CAAC,OAAO,EAAE,CA
AC;YACb,MAAM,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YAC7C,IAAI,QAAQ,IAAI,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC5B,mDAAmD;gBACnD,MAAM,QAAQ,GAAG,QAAQ,CAAC,CAAC,CAAC;qBACzB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;qBACxB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;qBACpB,IAAI,EAAE,CAAA;gBACT,IAAI,QAAQ,CAAC,MAAM,IAAI,cAAc,EAAE,CAAC;oBACtC,OAAO,GAAG,QAAQ,CAAA;gBACpB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,IAAI,CAAA;QACb,CAAC;QAED,8BAA8B;QAC9B,MAAM,IAAI,GAAG,kBAAkB,CAAC,OAAO,CAAC,CAAA;QAExC,IAAI,IAAI,CAAC,MAAM,GAAG,cAAc,EAAE,CAAC;YACjC,OAAO,IAAI,CAAA;QACb,CAAC;QAED,OAAO;YACL,IAAI;YACJ,UAAU,EAAE,CAAC,CAAC;YACd,OAAO,EAAE,CAAC;YACV,GAAG,EAAE,SAAS;YACd,WAAW,EAAE,cAAc;YAC3B,QAAQ,EAAE;gBACR,WAAW,EAAE,SAAS;aACvB;SACF,CAAA;IACH,CAAC;CACF;AAED,+EAA+E;AAC/E,mBAAmB;AACnB,+EAA+E;AAE/E;;;;;;;GAOG;AACH,MAAM,UAAU,UAAU,CAAC,OAA2B;IACpD,OAAO,IAAI,gBAAgB,CAAC,OAAO,CAAC,CAAA;AACtC,CAAC"}
@@ -0,0 +1,86 @@
1
+ /**
2
+ * DuckDuckGo HTML search utility.
3
+ *
4
+ * Scrapes DDG's HTML endpoint (no API key required) to extract
5
+ * search result URLs, titles, and snippets. Includes CAPTCHA detection,
6
+ * DDG redirect URL cleaning, and domain-based filtering.
7
+ *
8
+ * Used by DuckDuckGoSearchSource and future news sources for
9
+ * `site:domain.com` style queries.
10
+ */
11
/**
 * Input accepted by a DuckDuckGo HTML search.
 */
export interface DuckDuckGoSearchOptions {
    /**
     * The search query string.
     */
    query: string;
    /**
     * Domain to restrict results to (prepended as `site:` to the query).
     */
    domainFilter?: string;
    /**
     * Upper bound on the number of results returned. Default: 10.
     */
    maxResults?: number;
    /**
     * Caller-supplied AbortSignal (combined with `timeoutMs`).
     */
    signal?: AbortSignal;
    /**
     * Fetch timeout in milliseconds. Default: 15000.
     */
    timeoutMs?: number;
}
24
/**
 * One search hit extracted from DuckDuckGo's HTML results page.
 */
export interface SearchResult {
    /**
     * Cleaned URL of the search result.
     */
    url: string;
    /**
     * Title of the search result.
     */
    title: string;
    /**
     * Snippet/description of the search result.
     */
    snippet: string;
}
33
+ /**
34
+ * Detect whether DDG returned a CAPTCHA/bot-detection page.
35
+ *
36
+ * Checks for "anomaly-modal" (DDG's CAPTCHA container) and the
37
+ * "bots use DuckDuckGo too" message.
38
+ *
39
+ * @param html - Raw HTML response body from DDG
40
+ * @returns True if the page is a CAPTCHA challenge
41
+ */
42
+ export declare function isDuckDuckGoCaptcha(html: string): boolean;
43
+ /**
44
+ * Clean a DuckDuckGo result URL.
45
+ *
46
+ * DDG wraps result URLs in redirect links like:
47
+ * `//duckduckgo.com/l/?uddg=ENCODED_URL&rut=...`
48
+ *
49
+ * This function:
50
+ * 1. Extracts the real URL from the `uddg` query parameter
51
+ * 2. Handles protocol-relative `//` URLs by prepending `https:`
52
+ * 3. Passes normal URLs through unchanged
53
+ *
54
+ * @param url - URL from a DDG search result
55
+ * @returns Cleaned URL pointing to the actual destination
56
+ */
57
+ export declare function cleanDuckDuckGoUrl(url: string): string;
58
+ /**
59
+ * Extract search results from DuckDuckGo HTML response.
60
+ *
61
+ * Parses DDG's HTML structure:
62
+ * - `class="result__url"` href for URLs (primary)
63
+ * - `class="result__a"` for titles (and fallback URLs)
64
+ * - `class="result__snippet"` for snippets
65
+ *
66
+ * If no `result__url` matches are found, falls back to `result__a` hrefs.
67
+ * Filters by domain using URL hostname parsing to prevent substring spoofing
68
+ * (e.g., "nytimes.com.evil.com" won't match "nytimes.com").
69
+ *
70
+ * @param html - Raw HTML response from DDG
71
+ * @param domainFilter - Optional domain to filter results by
72
+ * @returns Array of extracted search results
73
+ */
74
+ export declare function extractUrlsFromDuckDuckGoHtml(html: string, domainFilter?: string): SearchResult[];
75
+ /**
76
+ * Search DuckDuckGo via its HTML endpoint.
77
+ *
78
+ * Fetches `https://html.duckduckgo.com/html/?q=QUERY`, optionally
79
+ * prepending `site:domain` when domainFilter is set. Returns an empty
80
+ * array on CAPTCHA, error, or non-OK response.
81
+ *
82
+ * @param options - Search options including query, domain filter, and limits
83
+ * @returns Array of search results (empty on failure)
84
+ */
85
+ export declare function searchDuckDuckGo(options: DuckDuckGoSearchOptions): Promise<SearchResult[]>;
86
+ //# sourceMappingURL=duckduckgo-search.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"duckduckgo-search.d.ts","sourceRoot":"","sources":["../../src/shared/duckduckgo-search.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAQH,4CAA4C;AAC5C,MAAM,WAAW,uBAAuB;IACtC,2BAA2B;IAC3B,KAAK,EAAE,MAAM,CAAA;IACb,mEAAmE;IACnE,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,wDAAwD;IACxD,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,6DAA6D;IAC7D,MAAM,CAAC,EAAE,WAAW,CAAA;IACpB,6DAA6D;IAC7D,SAAS,CAAC,EAAE,MAAM,CAAA;CACnB;AAED,sDAAsD;AACtD,MAAM,WAAW,YAAY;IAC3B,wCAAwC;IACxC,GAAG,EAAE,MAAM,CAAA;IACX,kCAAkC;IAClC,KAAK,EAAE,MAAM,CAAA;IACb,gDAAgD;IAChD,OAAO,EAAE,MAAM,CAAA;CAChB;AAmBD;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAGzD;AAMD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAyBtD;AAMD;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,6BAA6B,CAAC,IAAI,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,YAAY,EAAE,CAoEjG;AAqBD;;;;;;;;;GASG;AACH,wBAAsB,gBAAgB,CAAC,OAAO,EAAE,uBAAuB,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAmDhG"}
@@ -0,0 +1,218 @@
1
+ /**
2
+ * DuckDuckGo HTML search utility.
3
+ *
4
+ * Scrapes DDG's HTML endpoint (no API key required) to extract
5
+ * search result URLs, titles, and snippets. Includes CAPTCHA detection,
6
+ * DDG redirect URL cleaning, and domain-based filtering.
7
+ *
8
+ * Used by DuckDuckGoSearchSource and future news sources for
9
+ * `site:domain.com` style queries.
10
+ */
11
+ import { decodeHtmlEntities } from "./html-utils.js";
12
+ // ============================================================================
13
+ // Constants
14
+ // ============================================================================
15
+ /** DDG HTML search endpoint. */
16
+ const DDG_HTML_URL = "https://html.duckduckgo.com/html/";
17
+ /** Default timeout for the search fetch in milliseconds. */
18
+ const DEFAULT_TIMEOUT_MS = 15000;
19
+ /** Default maximum number of results to return. */
20
+ const DEFAULT_MAX_RESULTS = 10;
21
+ // ============================================================================
22
+ // CAPTCHA Detection
23
+ // ============================================================================
24
/**
 * Report whether a DuckDuckGo response body is a CAPTCHA/bot-detection page.
 *
 * The check is a case-insensitive substring search for either marker:
 * "anomaly-modal" (DDG's CAPTCHA container) or the
 * "bots use DuckDuckGo too" message.
 *
 * @param html - Raw HTML response body from DDG
 * @returns True if the page is a CAPTCHA challenge
 */
export function isDuckDuckGoCaptcha(html) {
    const haystack = html.toLowerCase();
    const captchaMarkers = ["anomaly-modal", "bots use duckduckgo too"];
    return captchaMarkers.some((marker) => haystack.includes(marker));
}
37
+ // ============================================================================
38
+ // URL Cleaning
39
+ // ============================================================================
40
/**
 * Clean a DuckDuckGo result URL.
 *
 * DDG wraps result URLs in redirect links like:
 * `//duckduckgo.com/l/?uddg=ENCODED_URL&rut=...`
 *
 * This function:
 * 1. Extracts the real URL from the `uddg` query parameter when the link is a
 *    DDG redirect (`/l/` path on duckduckgo.com or one of its subdomains).
 *    Protocol-relative (`//duckduckgo.com/l/...`) and root-relative
 *    (`/l/?uddg=...`) hrefs are both recognized.
 * 2. Handles other protocol-relative `//` URLs by prepending `https:`
 * 3. Passes everything else through unchanged
 *
 * @param url - URL from a DDG search result
 * @returns Cleaned URL pointing to the actual destination
 */
export function cleanDuckDuckGoUrl(url) {
    // Handle DDG redirect URLs — only extract uddg from known DDG redirect paths
    if (url.includes("uddg=")) {
        try {
            // Resolving against the DDG origin lets protocol-relative and
            // root-relative hrefs parse; absolute URLs ignore the base entirely.
            const parsed = new URL(url, "https://duckduckgo.com");
            const hostname = parsed.hostname.toLowerCase();
            // Suffix match includes the leading dot, so look-alike hosts such as
            // "duckduckgo.com.evil.com" or "notduckduckgo.com" are rejected.
            const isDDG = hostname === "duckduckgo.com" || hostname.endsWith(".duckduckgo.com");
            if (isDDG && parsed.pathname.startsWith("/l/")) {
                // searchParams.get() already percent-decodes the value.
                const uddg = parsed.searchParams.get("uddg");
                if (uddg) {
                    return uddg;
                }
            }
        }
        catch {
            // Unparseable input: fall through to the plain handling below.
        }
    }
    // Handle protocol-relative URLs
    if (url.startsWith("//")) {
        return `https:${url}`;
    }
    return url;
}
79
+ // ============================================================================
80
+ // HTML Extraction
81
+ // ============================================================================
82
/**
 * Pull search results out of a DuckDuckGo HTML results page.
 *
 * Three kinds of anchors are scanned with regular expressions:
 * - `class="result__url"` anchors supply the destination URLs (preferred)
 * - `class="result__a"` anchors supply titles, plus URLs as a fallback
 * - `class="result__snippet"` anchors supply the snippet text
 *
 * The three lists are paired up by position. When the page has no
 * `result__url` anchors, the `result__a` hrefs are used as the URLs instead.
 *
 * With `domainFilter` set, a result is kept only if its parsed hostname equals
 * the filter or is a subdomain of it. Comparing hostnames rather than
 * substrings prevents spoofing (e.g., "nytimes.com.evil.com" does not match
 * "nytimes.com"). URLs that cannot be parsed are dropped while filtering.
 *
 * @param html - Raw HTML response from DDG
 * @param domainFilter - Optional domain to filter results by
 * @returns Array of `{ url, title, snippet }` results in page order
 */
export function extractUrlsFromDuckDuckGoHtml(html, domainFilter) {
    // Run a global pattern over the page and convert every match to a value.
    const collectMatches = (pattern, toValue) => {
        const found = [];
        for (let m = pattern.exec(html); m !== null; m = pattern.exec(html)) {
            found.push(toValue(m));
        }
        return found;
    };
    // Destination URLs from the result__url anchors.
    const urlHrefs = collectMatches(/class="result__url"\s+href="([^"]+)"/g, (m) => cleanDuckDuckGoUrl(decodeHtmlEntities(m[1])));
    // Titles (and fallback hrefs) from the result__a anchors.
    const titleLinks = collectMatches(/class="result__a"\s+href="([^"]+)"[^>]*>([^<]*)</g, (m) => ({
        href: cleanDuckDuckGoUrl(decodeHtmlEntities(m[1])),
        title: decodeHtmlEntities(m[2]).trim(),
    }));
    // Snippet text, with any inline markup stripped before entity decoding.
    const snippetTexts = collectMatches(/class="result__snippet"[^>]*>([\s\S]*?)<\/a>/g, (m) => decodeHtmlEntities(m[1].replace(/<[^>]+>/g, "")).trim());
    // Prefer result__url hrefs; otherwise fall back to the result__a hrefs.
    const candidates = urlHrefs.length > 0 ? urlHrefs : titleLinks.map((entry) => entry.href);
    // True when the candidate's hostname is the filter domain or a subdomain.
    // The filter is lowercased because URL.hostname is always lowercase.
    const hostAllowed = (candidate) => {
        const wanted = domainFilter.toLowerCase().trim();
        try {
            const host = new URL(candidate).hostname;
            return host === wanted || host.endsWith("." + wanted);
        }
        catch {
            // Unparseable URLs never pass an active filter.
            return false;
        }
    };
    const results = [];
    candidates.forEach((url, index) => {
        if (domainFilter && !hostAllowed(url)) {
            return;
        }
        results.push({
            url,
            title: titleLinks[index]?.title ?? "",
            snippet: snippetTexts[index] ?? "",
        });
    });
    return results;
}
157
+ // ============================================================================
158
+ // Search Function
159
+ // ============================================================================
160
/**
 * Produce the AbortSignal used for a search request.
 *
 * A timeout signal is always created, using `timeoutMs` or, when that is
 * null/undefined, `DEFAULT_TIMEOUT_MS`. If the caller supplied its own signal
 * the two are merged with `AbortSignal.any()`, so whichever fires first aborts
 * the request and neither one masks the other.
 */
function buildSignal(callerSignal, timeoutMs) {
    const deadline = AbortSignal.timeout(timeoutMs ?? DEFAULT_TIMEOUT_MS);
    return callerSignal ? AbortSignal.any([callerSignal, deadline]) : deadline;
}
173
/**
 * Search DuckDuckGo via its HTML endpoint.
 *
 * Fetches `https://html.duckduckgo.com/html/?q=QUERY`, optionally
 * prepending `site:domain` when domainFilter is set. Returns an empty
 * array on CAPTCHA, error, or non-OK response. Errors raised while reading
 * the response body are handled the same way as errors raised by the
 * request itself.
 *
 * @param options - Search options including query, domain filter, and limits
 * @returns Array of search results (empty on failure), at most `maxResults` long
 * @throws DOMException named "AbortError" or "TimeoutError" when the caller's
 *   signal or the timeout aborts the request or the body read
 */
export async function searchDuckDuckGo(options) {
    const { query, domainFilter, maxResults = DEFAULT_MAX_RESULTS, signal: callerSignal, timeoutMs, } = options;
    // Build the search query, prepending site: if domainFilter is set
    const fullQuery = domainFilter ? `site:${domainFilter} ${query}` : query;
    // Build the search URL
    const searchUrl = `${DDG_HTML_URL}?q=${encodeURIComponent(fullQuery)}`;
    // Build the abort signal combining caller signal with timeout
    const signal = buildSignal(callerSignal, timeoutMs);
    let html;
    try {
        const response = await fetch(searchUrl, {
            headers: {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            },
            signal,
        });
        if (!response.ok) {
            return [];
        }
        // The body is streamed, so reading it can still fail or be aborted after
        // the headers arrive. Keep it inside the try so those failures follow the
        // same policy as a failed request.
        html = await response.text();
    }
    catch (error) {
        // Re-throw abort/timeout so BaseResearchSource.lookup() can record telemetry
        if (error instanceof DOMException &&
            (error.name === "AbortError" || error.name === "TimeoutError")) {
            throw error;
        }
        // Any other failure (network error, broken body stream) is best-effort.
        return [];
    }
    if (isDuckDuckGoCaptcha(html)) {
        return [];
    }
    const results = extractUrlsFromDuckDuckGoHtml(html, domainFilter);
    return results.slice(0, maxResults);
}
218
+ //# sourceMappingURL=duckduckgo-search.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"duckduckgo-search.js","sourceRoot":"","sources":["../../src/shared/duckduckgo-search.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AA8BpD,+EAA+E;AAC/E,YAAY;AACZ,+EAA+E;AAE/E,gCAAgC;AAChC,MAAM,YAAY,GAAG,mCAAmC,CAAA;AAExD,4DAA4D;AAC5D,MAAM,kBAAkB,GAAG,KAAK,CAAA;AAEhC,mDAAmD;AACnD,MAAM,mBAAmB,GAAG,EAAE,CAAA;AAE9B,+EAA+E;AAC/E,oBAAoB;AACpB,+EAA+E;AAE/E;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAChC,OAAO,KAAK,CAAC,QAAQ,CAAC,eAAe,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,yBAAyB,CAAC,CAAA;AACrF,CAAC;AAED,+EAA+E;AAC/E,eAAe;AACf,+EAA+E;AAE/E;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,kBAAkB,CAAC,GAAW;IAC5C,6EAA6E;IAC7E,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1B,MAAM,aAAa,GAAG,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,EAAE,CAAC,CAAC,CAAC,GAAG,CAAA;QACjE,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,aAAa,CAAC,CAAA;YACrC,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAA;YAC9C,MAAM,KAAK,GAAG,QAAQ,KAAK,gBAAgB,IAAI,QAAQ,KAAK,oBAAoB,CAAA;YAChF,IAAI,KAAK,IAAI,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC/C,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;gBAC5C,IAAI,IAAI,EAAE,CAAC;oBACT,OAAO,IAAI,CAAA;gBACb,CAAC;YACH,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,+BAA+B;QACjC,CAAC;IACH,CAAC;IAED,gCAAgC;IAChC,IAAI,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACzB,OAAO,SAAS,GAAG,EAAE,CAAA;IACvB,CAAC;IAED,OAAO,GAAG,CAAA;AACZ,CAAC;AAED,+EAA+E;AAC/E,kBAAkB;AAClB,+EAA+E;AAE/E;;;;;;;;;;;;;;;GAeG;AACH,MAAM,UAAU,6BAA6B,CAAC,IAAY,EAAE,YAAqB;IAC/E,4BAA4B;IAC5B,MAAM,UAAU,GAAG,uCAAuC,CAAA;IAC1D,MAAM,UAAU,GAAa,EAAE,CAAA;IAC/B,IAAI,KAA6B,CAAA;IAEjC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC7B,OAAO,KAAK,KAAK,IAAI,EAAE,CAAC;QACtB,UAAU,CAAC,IAAI,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACjE,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC/B,CAAC;IAED,qCAAqC;IACrC,MAAM,YAAY,GAAG,mDAAmD,CAAA;IACxE,MAAM,MAAM,GAA2C,EAAE,CAAA;IAEzD,KAAK,GAAG,YAAY,CAAC,IAAI
,CAAC,IAAI,CAAC,CAAA;IAC/B,OAAO,KAAK,KAAK,IAAI,EAAE,CAAC;QACtB,MAAM,CAAC,IAAI,CAAC;YACV,IAAI,EAAE,kBAAkB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACtD,KAAK,EAAE,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;SAC3C,CAAC,CAAA;QACF,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACjC,CAAC;IAED,+BAA+B;IAC/B,MAAM,cAAc,GAAG,+CAA+C,CAAA;IACtE,MAAM,QAAQ,GAAa,EAAE,CAAA;IAE7B,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACjC,OAAO,KAAK,KAAK,IAAI,EAAE,CAAC;QACtB,kDAAkD;QAClD,MAAM,UAAU,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;QACnD,QAAQ,CAAC,IAAI,CAAC,kBAAkB,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QACpD,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACnC,CAAC;IAED,kEAAkE;IAClE,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,CAAA;IACrC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;IACpE,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAA;IAEhC,MAAM,OAAO,GAAmB,EAAE,CAAA;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,CAAA;QAC1B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,IAAI,EAAE,CAAA;QACpC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QAEjC,yEAAyE;QACzE,iEAAiE;QACjE,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,gBAAgB,GAAG,YAAY,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAA;YAC1D,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAA;gBACtC,IAAI,QAAQ,KAAK,gBAAgB,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,GAAG,GAAG,gBAAgB,CAAC,EAAE,CAAC;oBAChF,SAAQ;gBACV,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,iDAAiD;gBACjD,SAAQ;YACV,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAA;IACvC,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC;AAED,+EAA+E;AAC/E,kBAAkB;AAClB,+EAA+E;AAE/E;;;;GAIG;AACH,SAAS,WAAW,CAAC,YAA0B,EAAE,SAAkB;IACjE,MAAM,OAAO,GAAG,SAAS,IAAI,kBAAkB,CAAA;IAC/C,MAAM,aAAa,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;IAElD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,OAAO,aAAa,CAAA;A
ACtB,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,OAAgC;IACrE,MAAM,EACJ,KAAK,EACL,YAAY,EACZ,UAAU,GAAG,mBAAmB,EAChC,MAAM,EAAE,YAAY,EACpB,SAAS,GACV,GAAG,OAAO,CAAA;IAEX,kEAAkE;IAClE,MAAM,SAAS,GAAG,YAAY,CAAC,CAAC,CAAC,QAAQ,YAAY,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAA;IAExE,uBAAuB;IACvB,MAAM,SAAS,GAAG,GAAG,YAAY,MAAM,kBAAkB,CAAC,SAAS,CAAC,EAAE,CAAA;IAEtE,8DAA8D;IAC9D,MAAM,MAAM,GAAG,WAAW,CAAC,YAAY,EAAE,SAAS,CAAC,CAAA;IAEnD,IAAI,QAAkB,CAAA;IACtB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;YAChC,OAAO,EAAE;gBACP,YAAY,EACV,iHAAiH;aACpH;YACD,MAAM;SACP,CAAC,CAAA;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,6EAA6E;QAC7E,IACE,KAAK,YAAY,YAAY;YAC7B,CAAC,KAAK,CAAC,IAAI,KAAK,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,CAAC,EAC9D,CAAC;YACD,MAAM,KAAK,CAAA;QACb,CAAC;QACD,OAAO,EAAE,CAAA;IACX,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;IAElC,IAAI,mBAAmB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,OAAO,GAAG,6BAA6B,CAAC,IAAI,EAAE,YAAY,CAAC,CAAA;IAEjE,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;AACrC,CAAC"}
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Page fetching utility with browser-like headers and archive.org fallback.
3
+ *
4
+ * Provides a resilient page fetching pipeline:
5
+ * 1. Direct fetch with browser-like headers (Chrome UA, Accept text/html)
6
+ * 2. Block detection (hard HTTP blocks + soft body pattern matching)
7
+ * 3. Automatic archive.org fallback when blocked or on network error
8
+ * 4. Non-blocking HTTP errors (404, 500) return immediately without fallback
9
+ *
10
+ * Used by WebSearchBase when following links from search results.
11
+ */
12
/**
 * Options for fetching a page.
 *
 * Only `url` is required; every other field is optional and has the default
 * noted on it.
 */
export interface FetchPageOptions {
    /** URL to fetch. */
    url: string;
    /** AbortSignal from the caller (combined with timeoutMs). */
    signal?: AbortSignal;
    /**
     * Total timeout budget in milliseconds. The budget is shared across the
     * direct and archive attempts rather than applied to each. Default: 15000.
     */
    timeoutMs?: number;
    /** User-Agent header to send. Default: browser-like Chrome UA. */
    userAgent?: string;
    /** Whether to try archive.org when direct fetch is blocked. Default: true. */
    archiveFallback?: boolean;
}
25
/**
 * Result of a page fetch attempt.
 *
 * `fetchMethod` records how the content was obtained; `"none"` means the
 * fetch failed, in which case `content` is empty and `error` describes why.
 */
export interface FetchPageResult {
    /** Raw HTML content (empty string if fetch failed). */
    content: string;
    /** Final URL (may differ from input if archive.org was used). */
    url: string;
    /** How the content was obtained. */
    fetchMethod: "direct" | "archive.org" | "none";
    /** Error description when fetchMethod is "none". */
    error?: string;
}
36
/**
 * Fetch a page with browser-like headers and automatic archive.org fallback.
 *
 * Pipeline:
 * 1. Direct fetch with browser-like headers
 * 2. Block detection (hard HTTP status codes + soft body pattern matching)
 * 3. If blocked and archiveFallback enabled, try archive.org
 * 4. Non-blocking HTTP errors (404, 500) return "none" immediately
 * 5. Network errors on direct fetch trigger archive fallback
 *
 * @param options - Fetch options including URL, signal, timeout, etc.
 * @returns Result with content, final URL, and fetch method; `error` carries
 *   a description when the fetch method is "none"
 */
export declare function fetchPage(options: FetchPageOptions): Promise<FetchPageResult>;
50
+ //# sourceMappingURL=fetch-page.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetch-page.d.ts","sourceRoot":"","sources":["../../src/shared/fetch-page.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,oBAAoB;IACpB,GAAG,EAAE,MAAM,CAAA;IACX,6DAA6D;IAC7D,MAAM,CAAC,EAAE,WAAW,CAAA;IACpB,sGAAsG;IACtG,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,kEAAkE;IAClE,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,8EAA8E;IAC9E,eAAe,CAAC,EAAE,OAAO,CAAA;CAC1B;AAED,sCAAsC;AACtC,MAAM,WAAW,eAAe;IAC9B,uDAAuD;IACvD,OAAO,EAAE,MAAM,CAAA;IACf,iEAAiE;IACjE,GAAG,EAAE,MAAM,CAAA;IACX,oCAAoC;IACpC,WAAW,EAAE,QAAQ,GAAG,aAAa,GAAG,MAAM,CAAA;IAC9C,oDAAoD;IACpD,KAAK,CAAC,EAAE,MAAM,CAAA;CACf;AA2HD;;;;;;;;;;;;GAYG;AACH,wBAAsB,SAAS,CAAC,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,CAAC,CAoGnF"}