@debriefer/sources 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
- package/dist/__tests__/archives/chronicling-america.test.js +151 -0
- package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
- package/dist/__tests__/archives/europeana.test.d.ts +8 -0
- package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
- package/dist/__tests__/archives/europeana.test.js +200 -0
- package/dist/__tests__/archives/europeana.test.js.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.js +189 -0
- package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
- package/dist/__tests__/archives/trove.test.d.ts +8 -0
- package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
- package/dist/__tests__/archives/trove.test.js +202 -0
- package/dist/__tests__/archives/trove.test.js.map +1 -0
- package/dist/__tests__/books/google-books.test.d.ts +8 -0
- package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
- package/dist/__tests__/books/google-books.test.js +221 -0
- package/dist/__tests__/books/google-books.test.js.map +1 -0
- package/dist/__tests__/books/open-library.test.d.ts +8 -0
- package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
- package/dist/__tests__/books/open-library.test.js +159 -0
- package/dist/__tests__/books/open-library.test.js.map +1 -0
- package/dist/__tests__/news/guardian.test.d.ts +9 -0
- package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
- package/dist/__tests__/news/guardian.test.js +224 -0
- package/dist/__tests__/news/guardian.test.js.map +1 -0
- package/dist/__tests__/news/nytimes.test.d.ts +9 -0
- package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
- package/dist/__tests__/news/nytimes.test.js +271 -0
- package/dist/__tests__/news/nytimes.test.js.map +1 -0
- package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
- package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
- package/dist/__tests__/news/site-search-source.test.js +342 -0
- package/dist/__tests__/news/site-search-source.test.js.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
- package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.js +281 -0
- package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
- package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/html-utils.test.js +169 -0
- package/dist/__tests__/shared/html-utils.test.js.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.js +107 -0
- package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.js +77 -0
- package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
- package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/search-utils.test.js +26 -0
- package/dist/__tests__/shared/search-utils.test.js.map +1 -0
- package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
- package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikidata.test.js +509 -0
- package/dist/__tests__/structured/wikidata.test.js.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.js +643 -0
- package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
- package/dist/__tests__/web-search/base.test.d.ts +9 -0
- package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/base.test.js +622 -0
- package/dist/__tests__/web-search/base.test.js.map +1 -0
- package/dist/__tests__/web-search/bing.test.d.ts +10 -0
- package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/bing.test.js +277 -0
- package/dist/__tests__/web-search/bing.test.js.map +1 -0
- package/dist/__tests__/web-search/brave.test.d.ts +10 -0
- package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/brave.test.js +264 -0
- package/dist/__tests__/web-search/brave.test.js.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
- package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
- package/dist/__tests__/web-search/google.test.d.ts +9 -0
- package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/google.test.js +189 -0
- package/dist/__tests__/web-search/google.test.js.map +1 -0
- package/dist/archives/chronicling-america.d.ts +33 -0
- package/dist/archives/chronicling-america.d.ts.map +1 -0
- package/dist/archives/chronicling-america.js +85 -0
- package/dist/archives/chronicling-america.js.map +1 -0
- package/dist/archives/europeana.d.ts +37 -0
- package/dist/archives/europeana.d.ts.map +1 -0
- package/dist/archives/europeana.js +92 -0
- package/dist/archives/europeana.js.map +1 -0
- package/dist/archives/internet-archive.d.ts +32 -0
- package/dist/archives/internet-archive.d.ts.map +1 -0
- package/dist/archives/internet-archive.js +90 -0
- package/dist/archives/internet-archive.js.map +1 -0
- package/dist/archives/trove.d.ts +37 -0
- package/dist/archives/trove.d.ts.map +1 -0
- package/dist/archives/trove.js +97 -0
- package/dist/archives/trove.js.map +1 -0
- package/dist/books/google-books.d.ts +48 -0
- package/dist/books/google-books.d.ts.map +1 -0
- package/dist/books/google-books.js +111 -0
- package/dist/books/google-books.js.map +1 -0
- package/dist/books/open-library.d.ts +44 -0
- package/dist/books/open-library.d.ts.map +1 -0
- package/dist/books/open-library.js +103 -0
- package/dist/books/open-library.js.map +1 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/index.js.map +1 -0
- package/dist/news/guardian.d.ts +51 -0
- package/dist/news/guardian.d.ts.map +1 -0
- package/dist/news/guardian.js +131 -0
- package/dist/news/guardian.js.map +1 -0
- package/dist/news/nytimes.d.ts +27 -0
- package/dist/news/nytimes.d.ts.map +1 -0
- package/dist/news/nytimes.js +104 -0
- package/dist/news/nytimes.js.map +1 -0
- package/dist/news/site-search-source.d.ts +89 -0
- package/dist/news/site-search-source.d.ts.map +1 -0
- package/dist/news/site-search-source.js +182 -0
- package/dist/news/site-search-source.js.map +1 -0
- package/dist/news/sources.d.ts +52 -0
- package/dist/news/sources.d.ts.map +1 -0
- package/dist/news/sources.js +276 -0
- package/dist/news/sources.js.map +1 -0
- package/dist/obituary/find-a-grave.d.ts +43 -0
- package/dist/obituary/find-a-grave.d.ts.map +1 -0
- package/dist/obituary/find-a-grave.js +173 -0
- package/dist/obituary/find-a-grave.js.map +1 -0
- package/dist/shared/duckduckgo-search.d.ts +86 -0
- package/dist/shared/duckduckgo-search.d.ts.map +1 -0
- package/dist/shared/duckduckgo-search.js +218 -0
- package/dist/shared/duckduckgo-search.js.map +1 -0
- package/dist/shared/fetch-page.d.ts +50 -0
- package/dist/shared/fetch-page.d.ts.map +1 -0
- package/dist/shared/fetch-page.js +212 -0
- package/dist/shared/fetch-page.js.map +1 -0
- package/dist/shared/html-utils.d.ts +99 -0
- package/dist/shared/html-utils.d.ts.map +1 -0
- package/dist/shared/html-utils.js +246 -0
- package/dist/shared/html-utils.js.map +1 -0
- package/dist/shared/readability-extract.d.ts +33 -0
- package/dist/shared/readability-extract.d.ts.map +1 -0
- package/dist/shared/readability-extract.js +45 -0
- package/dist/shared/readability-extract.js.map +1 -0
- package/dist/shared/sanitize-text.d.ts +24 -0
- package/dist/shared/sanitize-text.d.ts.map +1 -0
- package/dist/shared/sanitize-text.js +49 -0
- package/dist/shared/sanitize-text.js.map +1 -0
- package/dist/shared/search-utils.d.ts +18 -0
- package/dist/shared/search-utils.d.ts.map +1 -0
- package/dist/shared/search-utils.js +20 -0
- package/dist/shared/search-utils.js.map +1 -0
- package/dist/structured/wikidata.d.ts +128 -0
- package/dist/structured/wikidata.d.ts.map +1 -0
- package/dist/structured/wikidata.js +361 -0
- package/dist/structured/wikidata.js.map +1 -0
- package/dist/structured/wikipedia.d.ts +184 -0
- package/dist/structured/wikipedia.d.ts.map +1 -0
- package/dist/structured/wikipedia.js +275 -0
- package/dist/structured/wikipedia.js.map +1 -0
- package/dist/web-search/base.d.ts +128 -0
- package/dist/web-search/base.d.ts.map +1 -0
- package/dist/web-search/base.js +251 -0
- package/dist/web-search/base.js.map +1 -0
- package/dist/web-search/bing.d.ts +21 -0
- package/dist/web-search/bing.d.ts.map +1 -0
- package/dist/web-search/bing.js +53 -0
- package/dist/web-search/bing.js.map +1 -0
- package/dist/web-search/brave.d.ts +21 -0
- package/dist/web-search/brave.d.ts.map +1 -0
- package/dist/web-search/brave.js +56 -0
- package/dist/web-search/brave.js.map +1 -0
- package/dist/web-search/duckduckgo.d.ts +15 -0
- package/dist/web-search/duckduckgo.d.ts.map +1 -0
- package/dist/web-search/duckduckgo.js +21 -0
- package/dist/web-search/duckduckgo.js.map +1 -0
- package/dist/web-search/google.d.ts +24 -0
- package/dist/web-search/google.d.ts.map +1 -0
- package/dist/web-search/google.js +48 -0
- package/dist/web-search/google.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Find a Grave obituary source.
|
|
3
|
+
*
|
|
4
|
+
* Searches findagrave.com for memorial pages by name, extracts biography
|
|
5
|
+
* content via Readability (with regex fallback), and returns sanitized text.
|
|
6
|
+
*
|
|
7
|
+
* Find a Grave is user-generated content — anyone can create or edit memorials.
|
|
8
|
+
* Reliability tier is UNRELIABLE_UGC (0.35) per Wikipedia RSP guidelines.
|
|
9
|
+
*/
|
|
10
|
+
import { BaseResearchSource, ReliabilityTier, } from "@debriefer/core";
|
|
11
|
+
import { fetchPage } from "../shared/fetch-page.js";
|
|
12
|
+
import { extractArticleContent } from "../shared/readability-extract.js";
|
|
13
|
+
import { sanitizeSourceText } from "../shared/sanitize-text.js";
|
|
14
|
+
// ============================================================================
|
|
15
|
+
// Constants
|
|
16
|
+
// ============================================================================
|
|
17
|
+
const SEARCH_BASE_URL = "https://www.findagrave.com/memorial/search"; // Find a Grave memorial search endpoint; fetchResult appends the firstname/lastname/orderby query parameters.
|
|
18
|
+
const MEMORIAL_URL_PATTERN = /\/memorial\/(\d+)\//g; // Matches "/memorial/{numeric id}/" path prefixes. The global flag makes the regex stateful, which is why fetchResult resets lastIndex before scanning.
|
|
19
|
+
const MIN_BIO_LENGTH = 100; // Minimum number of characters a bio must have (both before and after sanitizing) to be returned.
|
|
20
|
+
const BIO_REGEX = /<div[^>]*id="bio"[^>]*>([\s\S]*?)<\/div>/i; // Regex fallback for bio extraction: lazily captures the contents of the div with id="bio" up to the first closing </div>, so a nested div would cut the capture short.
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// Source Implementation
|
|
23
|
+
// ============================================================================
|
|
24
|
+
/**
 * Find a Grave source for obituary / memorial content.
 *
 * Pipeline:
 * 1. Search findagrave.com by first/last name
 * 2. Parse memorial URLs from the search results HTML
 * 3. Keep memorials whose slug equals the subject's normalized name,
 *    falling back to any memorial URL that contains it
 * 4. Fetch the first matching memorial page via fetchPage
 * 5. Extract the bio via Readability, falling back to BIO_REGEX
 * 6. Sanitize the text and return it if it is long enough
 */
export class FindAGraveSource extends BaseResearchSource {
    name = "Find a Grave";
    type = "find-a-grave";
    reliabilityTier = ReliabilityTier.UNRELIABLE_UGC;
    domain = "www.findagrave.com";
    isFree = true;
    estimatedCostPerQuery = 0;
    /**
     * @param options - Source options forwarded to BaseResearchSource;
     *   `rateLimitMs` defaults to 2000 unless the caller overrides it.
     */
    constructor(options = {}) {
        super({ rateLimitMs: 2000, ...options });
    }
    /**
     * Look up a memorial for the subject and return its biography text.
     *
     * @param subject - Research subject; only `subject.name` is read here
     * @param signal - AbortSignal forwarded to both fetchPage calls
     * @returns An object with the sanitized bio text and the memorial URL, or
     *   null when the name is blank, the search page cannot be fetched, no
     *   memorial URL matches the name, or the bio is shorter than
     *   MIN_BIO_LENGTH
     */
    async fetchResult(subject, signal) {
        // Step 1: Split name into first/last.
        // A blank name would normalize to "" / "-" below and match an arbitrary
        // memorial through the substring fallback, so bail out early instead.
        const trimmedName = subject.name.trim();
        if (trimmedName === "") {
            return null;
        }
        const nameParts = trimmedName.split(/\s+/);
        const firstName = nameParts[0] ?? "";
        const lastName = nameParts.slice(1).join(" ");
        // Step 2: Search for memorials (archive fallback is explicitly disabled
        // for the search page).
        const searchUrl = `${SEARCH_BASE_URL}?firstname=${encodeURIComponent(firstName)}&lastname=${encodeURIComponent(lastName)}&orderby=r`;
        const searchPage = await fetchPage({
            url: searchUrl,
            signal,
            archiveFallback: false,
        });
        if (searchPage.fetchMethod === "none" || !searchPage.content) {
            return null;
        }
        const searchHtml = searchPage.content;
        // Step 3: Extract memorial URLs.
        // Search result links look like: /memorial/12345/john-wayne
        // The pattern only matches the "/memorial/{id}/" prefix, so the full
        // attribute value is recovered from the surrounding double quotes.
        const memorialUrls = [];
        const seenUrls = new Set();
        // matchAll starts from the pattern's current lastIndex; keep it at 0.
        MEMORIAL_URL_PATTERN.lastIndex = 0;
        for (const match of searchHtml.matchAll(MEMORIAL_URL_PATTERN)) {
            const anchorIndex = match.index;
            const startQuoteIdx = searchHtml.lastIndexOf('"', anchorIndex);
            const endQuoteIdx = searchHtml.indexOf('"', anchorIndex + match[0].length);
            if (startQuoteIdx === -1 || endQuoteIdx === -1) {
                continue;
            }
            const path = searchHtml.slice(startQuoteIdx + 1, endQuoteIdx);
            // Only site-relative memorial links are accepted.
            if (!path.startsWith("/memorial/")) {
                continue;
            }
            const url = `https://www.findagrave.com${path}`;
            if (!seenUrls.has(url)) {
                seenUrls.add(url);
                memorialUrls.push(url);
            }
        }
        if (memorialUrls.length === 0) {
            return null;
        }
        // Step 4: Prefer exact slug matches for the subject's normalized name.
        // Normalize from the trimmed name so padding around the name does not
        // turn into leading/trailing hyphens that can never match a slug.
        const normalizedName = trimmedName.toLowerCase().replace(/\s+/g, "-");
        // First: look for URLs where the slug segment exactly matches normalizedName.
        const exactSlugMatches = memorialUrls.filter((url) => {
            try {
                const { pathname } = new URL(url);
                const segments = pathname.split("/").filter((s) => s.length > 0);
                // Find a Grave memorial path: /memorial/{id}/{slug}
                const slug = segments[2]?.toLowerCase() ?? "";
                return slug === normalizedName;
            }
            catch {
                // Unparseable URL: treat as a non-match.
                return false;
            }
        });
        // Otherwise: accept any memorial URL that contains the normalized name.
        const matchingUrls = exactSlugMatches.length > 0
            ? exactSlugMatches
            : memorialUrls.filter((url) => url.toLowerCase().includes(normalizedName));
        if (matchingUrls.length === 0) {
            return null;
        }
        const memorialUrl = matchingUrls[0];
        // Step 5: Fetch the memorial page, waiting on the rate limiter first
        // when one is configured (this is the second request to the domain).
        await this.rateLimiter?.acquire(this.domain, this.options.rateLimitMs);
        const page = await fetchPage({ url: memorialUrl, signal });
        if (page.fetchMethod === "none" || !page.content) {
            return null;
        }
        // Step 6: Extract bio content.
        // Prefer the URL reported by fetchPage when it provides one.
        const actualUrl = page.url || memorialUrl;
        let bioText = null;
        // Try Readability first
        const extracted = extractArticleContent(page.content, actualUrl);
        if (extracted && extracted.text.length >= MIN_BIO_LENGTH) {
            bioText = extracted.text;
        }
        // Fall back to regex if Readability didn't get enough content
        if (!bioText) {
            const bioMatch = BIO_REGEX.exec(page.content);
            if (bioMatch && bioMatch[1]) {
                // Strip HTML tags from the regex-extracted content
                const stripped = bioMatch[1]
                    .replace(/<[^>]*>/g, " ")
                    .replace(/\s+/g, " ")
                    .trim();
                if (stripped.length >= MIN_BIO_LENGTH) {
                    bioText = stripped;
                }
            }
        }
        if (!bioText) {
            return null;
        }
        // Step 7: Sanitize and return (sanitizing may shorten the text, so the
        // length threshold is checked again).
        const text = sanitizeSourceText(bioText);
        if (text.length < MIN_BIO_LENGTH) {
            return null;
        }
        return {
            text,
            // NOTE(review): -1 looks like a sentinel value; confirm how
            // BaseResearchSource interprets it.
            confidence: -1,
            costUsd: 0,
            url: actualUrl,
            publication: "Find a Grave",
            metadata: {
                memorialUrl: actualUrl,
            },
        };
    }
}
|
|
159
|
+
// ============================================================================
|
|
160
|
+
// Factory Function
|
|
161
|
+
// ============================================================================
|
|
162
|
+
/**
 * Factory for the Find a Grave source.
 *
 * @param options - Passed through unchanged to the FindAGraveSource constructor
 * @returns A new FindAGraveSource instance
 *
 * @example
 * ```typescript
 * const source = findAGrave()
 * ```
 */
export function findAGrave(options) {
    const source = new FindAGraveSource(options);
    return source;
}
|
|
173
|
+
//# sourceMappingURL=find-a-grave.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"find-a-grave.js","sourceRoot":"","sources":["../../src/obituary/find-a-grave.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,kBAAkB,EAClB,eAAe,GAIhB,MAAM,iBAAiB,CAAA;AACxB,OAAO,EAAE,SAAS,EAAE,MAAM,yBAAyB,CAAA;AACnD,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAA;AACxE,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAE/D,+EAA+E;AAC/E,YAAY;AACZ,+EAA+E;AAE/E,MAAM,eAAe,GAAG,4CAA4C,CAAA;AACpE,MAAM,oBAAoB,GAAG,sBAAsB,CAAA;AACnD,MAAM,cAAc,GAAG,GAAG,CAAA;AAC1B,MAAM,SAAS,GAAG,2CAA2C,CAAA;AAS7D,+EAA+E;AAC/E,wBAAwB;AACxB,+EAA+E;AAE/E;;;;;;;;;;GAUG;AACH,MAAM,OAAO,gBAAiB,SAAQ,kBAAmC;IAC9D,IAAI,GAAG,cAAc,CAAA;IACrB,IAAI,GAAG,cAAc,CAAA;IACrB,eAAe,GAAG,eAAe,CAAC,cAAc,CAAA;IAChD,MAAM,GAAG,oBAAoB,CAAA;IAC7B,MAAM,GAAG,IAAI,CAAA;IACb,qBAAqB,GAAG,CAAC,CAAA;IAElC,YAAY,UAA6B,EAAE;QACzC,KAAK,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAA;IAC1C,CAAC;IAES,KAAK,CAAC,WAAW,CACzB,OAAwB,EACxB,MAAmB;QAEnB,qCAAqC;QACrC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;QAClD,MAAM,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAA;QAEnD,+BAA+B;QAC/B,MAAM,SAAS,GAAG,GAAG,eAAe,cAAc,kBAAkB,CAAC,SAAS,CAAC,aAAa,kBAAkB,CAAC,QAAQ,CAAC,YAAY,CAAA;QAEpI,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC;YACjC,GAAG,EAAE,SAAS;YACd,MAAM;YACN,eAAe,EAAE,KAAK;SACvB,CAAC,CAAA;QAEF,IAAI,UAAU,CAAC,WAAW,KAAK,MAAM,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;YAC7D,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,UAAU,GAAG,UAAU,CAAC,OAAO,CAAA;QAErC,gCAAgC;QAChC,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,IAAI,KAA6B,CAAA;QACjC,6BAA6B;QAC7B,oBAAoB,CAAC,SAAS,GAAG,CAAC,CAAA;QAClC,OAAO,CAAC,KAAK,GAAG,oBAAoB,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAChE,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAA;YAC/B,8DAA8D;YAC9D,4DAA4D;YAC5D,MAAM,aAAa,GAAG,UAAU,CAAC,WAAW,CAAC,GAAG,EAAE,WAAW,CAAC,CAAA;YAC9D,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,EAAE,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC,CAAA;YAC3E,IAAI,aAAa,KAAK,CAAC,CAAC,IAAI,W
AAW,KAAK,CAAC,CAAC,EAAE,CAAC;gBAC/C,SAAQ;YACV,CAAC;YACD,MAAM,QAAQ,GAAG,aAAa,GAAG,CAAC,CAAA;YAClC,MAAM,MAAM,GAAG,WAAW,CAAA;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;YAE/C,IAAI,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;gBAClC,MAAM,GAAG,GAAG,6BAA6B,IAAI,EAAE,CAAA;gBAC/C,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;oBAChC,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;gBACxB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAA;QACb,CAAC;QAED,sEAAsE;QACtE,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;QAEtE,8EAA8E;QAC9E,MAAM,gBAAgB,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;YACnD,IAAI,CAAC;gBACH,MAAM,EAAE,QAAQ,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;gBACjC,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;gBAChE,oDAAoD;gBACpD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAA;gBAC7C,OAAO,IAAI,KAAK,cAAc,CAAA;YAChC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAA;YACd,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,MAAM,YAAY,GAChB,gBAAgB,CAAC,MAAM,GAAG,CAAC;YACzB,CAAC,CAAC,gBAAgB;YAClB,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAA;QAE9E,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,WAAW,GAAG,YAAY,CAAC,CAAC,CAAC,CAAA;QAEnC,kCAAkC;QAClC,MAAM,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,WAAY,CAAC,CAAA;QACvE,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QAE1D,IAAI,IAAI,CAAC,WAAW,KAAK,MAAM,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YACjD,OAAO,IAAI,CAAA;QACb,CAAC;QAED,8BAA8B;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,IAAI,WAAW,CAAA;QACzC,IAAI,OAAO,GAAkB,IAAI,CAAA;QAEjC,wBAAwB;QACxB,MAAM,SAAS,GAAG,qBAAqB,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,CAAC,CAAA;QAChE,IAAI,SAAS,IAAI,SAAS,CAAC,IAAI,CAAC,MAAM,IAAI,cAAc,EAAE,CAAC;YACzD,OAAO,GAAG,SAAS,CAAC,IAAI,CAAA;QAC1B,CAAC;QAED,8DAA8D;QAC9D,IAAI,CAAC,OAAO,EAAE,CAAC
;YACb,MAAM,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YAC7C,IAAI,QAAQ,IAAI,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC5B,mDAAmD;gBACnD,MAAM,QAAQ,GAAG,QAAQ,CAAC,CAAC,CAAC;qBACzB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;qBACxB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;qBACpB,IAAI,EAAE,CAAA;gBACT,IAAI,QAAQ,CAAC,MAAM,IAAI,cAAc,EAAE,CAAC;oBACtC,OAAO,GAAG,QAAQ,CAAA;gBACpB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,IAAI,CAAA;QACb,CAAC;QAED,8BAA8B;QAC9B,MAAM,IAAI,GAAG,kBAAkB,CAAC,OAAO,CAAC,CAAA;QAExC,IAAI,IAAI,CAAC,MAAM,GAAG,cAAc,EAAE,CAAC;YACjC,OAAO,IAAI,CAAA;QACb,CAAC;QAED,OAAO;YACL,IAAI;YACJ,UAAU,EAAE,CAAC,CAAC;YACd,OAAO,EAAE,CAAC;YACV,GAAG,EAAE,SAAS;YACd,WAAW,EAAE,cAAc;YAC3B,QAAQ,EAAE;gBACR,WAAW,EAAE,SAAS;aACvB;SACF,CAAA;IACH,CAAC;CACF;AAED,+EAA+E;AAC/E,mBAAmB;AACnB,+EAA+E;AAE/E;;;;;;;GAOG;AACH,MAAM,UAAU,UAAU,CAAC,OAA2B;IACpD,OAAO,IAAI,gBAAgB,CAAC,OAAO,CAAC,CAAA;AACtC,CAAC"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DuckDuckGo HTML search utility.
|
|
3
|
+
*
|
|
4
|
+
* Scrapes DDG's HTML endpoint (no API key required) to extract
|
|
5
|
+
* search result URLs, titles, and snippets. Includes CAPTCHA detection,
|
|
6
|
+
* DDG redirect URL cleaning, and domain-based filtering.
|
|
7
|
+
*
|
|
8
|
+
* Used by DuckDuckGoSearchSource and future news sources for
|
|
9
|
+
* `site:domain.com` style queries.
|
|
10
|
+
*/
|
|
11
|
+
/** Options accepted by searchDuckDuckGo for a DuckDuckGo HTML search. */
|
|
12
|
+
export interface DuckDuckGoSearchOptions {
|
|
13
|
+
/** Search query string. */
|
|
14
|
+
query: string;
|
|
15
|
+
/** Domain to restrict results to; prepended to the query as `site:domain`. */
|
|
16
|
+
domainFilter?: string;
|
|
17
|
+
/** Maximum number of results to return. Default: 10. */
|
|
18
|
+
maxResults?: number;
|
|
19
|
+
/** AbortSignal from the caller (combined with timeoutMs). */
|
|
20
|
+
signal?: AbortSignal;
|
|
21
|
+
/** Timeout in milliseconds for the fetch. Default: 15000 (15 seconds). */
|
|
22
|
+
timeoutMs?: number;
|
|
23
|
+
}
|
|
24
|
+
/** A single search result extracted from DDG HTML. */
|
|
25
|
+
export interface SearchResult {
|
|
26
|
+
/** URL of the search result with DDG redirect wrapping removed (see cleanDuckDuckGoUrl). */
|
|
27
|
+
url: string;
|
|
28
|
+
/** Title of the search result. */
|
|
29
|
+
title: string;
|
|
30
|
+
/** Snippet/description of the search result. */
|
|
31
|
+
snippet: string;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Detect whether DDG returned a CAPTCHA/bot-detection page.
|
|
35
|
+
*
|
|
36
|
+
* Checks for "anomaly-modal" (DDG's CAPTCHA container) and the
|
|
37
|
+
* "bots use DuckDuckGo too" message; both are matched case-insensitively.
|
|
38
|
+
*
|
|
39
|
+
* @param html - Raw HTML response body from DDG
|
|
40
|
+
* @returns True if either marker is present, i.e. the page is a CAPTCHA challenge
|
|
41
|
+
*/
|
|
42
|
+
export declare function isDuckDuckGoCaptcha(html: string): boolean;
|
|
43
|
+
/**
|
|
44
|
+
* Clean a DuckDuckGo result URL.
|
|
45
|
+
*
|
|
46
|
+
* DDG wraps result URLs in redirect links like:
|
|
47
|
+
* `//duckduckgo.com/l/?uddg=ENCODED_URL&rut=...`
|
|
48
|
+
*
|
|
49
|
+
* This function:
|
|
50
|
+
* 1. Extracts the real URL from the `uddg` query parameter, but only for duckduckgo.com / www.duckduckgo.com links whose path starts with `/l/`
|
|
51
|
+
* 2. Handles protocol-relative `//` URLs by prepending `https:`
|
|
52
|
+
* 3. Passes all other URLs through unchanged
|
|
53
|
+
*
|
|
54
|
+
* @param url - URL from a DDG search result
|
|
55
|
+
* @returns Cleaned URL pointing to the actual destination
|
|
56
|
+
*/
|
|
57
|
+
export declare function cleanDuckDuckGoUrl(url: string): string;
|
|
58
|
+
/**
|
|
59
|
+
* Extract search results from DuckDuckGo HTML response.
|
|
60
|
+
*
|
|
61
|
+
* Parses DDG's HTML structure:
|
|
62
|
+
* - `class="result__url"` href for URLs (primary); hrefs are entity-decoded and passed through cleanDuckDuckGoUrl
|
|
63
|
+
* - `class="result__a"` for titles (and fallback URLs); titles are entity-decoded and trimmed
|
|
64
|
+
* - `class="result__snippet"` for snippets
|
|
65
|
+
*
|
|
66
|
+
* If no `result__url` matches are found, falls back to `result__a` hrefs.
|
|
67
|
+
* Filters by domain using URL hostname parsing to prevent substring spoofing
|
|
68
|
+
* (e.g., "nytimes.com.evil.com" won't match "nytimes.com").
|
|
69
|
+
*
|
|
70
|
+
* @param html - Raw HTML response from DDG
|
|
71
|
+
* @param domainFilter - Optional domain to filter results by
|
|
72
|
+
* @returns Array of extracted search results
|
|
73
|
+
*/
|
|
74
|
+
export declare function extractUrlsFromDuckDuckGoHtml(html: string, domainFilter?: string): SearchResult[];
|
|
75
|
+
/**
|
|
76
|
+
* Search DuckDuckGo via its HTML endpoint.
|
|
77
|
+
*
|
|
78
|
+
* Fetches `https://html.duckduckgo.com/html/?q=QUERY`, optionally
|
|
79
|
+
* prepending `site:domain` when domainFilter is set. Returns an empty
|
|
80
|
+
* array on CAPTCHA, error, or non-OK response.
|
|
81
|
+
*
|
|
82
|
+
* @param options - Search options: query, optional domain filter, result limit, abort signal, and timeout
|
|
83
|
+
* @returns Promise resolving to the search results (empty array on failure)
|
|
84
|
+
*/
|
|
85
|
+
export declare function searchDuckDuckGo(options: DuckDuckGoSearchOptions): Promise<SearchResult[]>;
|
|
86
|
+
//# sourceMappingURL=duckduckgo-search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"duckduckgo-search.d.ts","sourceRoot":"","sources":["../../src/shared/duckduckgo-search.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAQH,4CAA4C;AAC5C,MAAM,WAAW,uBAAuB;IACtC,2BAA2B;IAC3B,KAAK,EAAE,MAAM,CAAA;IACb,mEAAmE;IACnE,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,wDAAwD;IACxD,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,6DAA6D;IAC7D,MAAM,CAAC,EAAE,WAAW,CAAA;IACpB,6DAA6D;IAC7D,SAAS,CAAC,EAAE,MAAM,CAAA;CACnB;AAED,sDAAsD;AACtD,MAAM,WAAW,YAAY;IAC3B,wCAAwC;IACxC,GAAG,EAAE,MAAM,CAAA;IACX,kCAAkC;IAClC,KAAK,EAAE,MAAM,CAAA;IACb,gDAAgD;IAChD,OAAO,EAAE,MAAM,CAAA;CAChB;AAmBD;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAGzD;AAMD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAyBtD;AAMD;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,6BAA6B,CAAC,IAAI,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,YAAY,EAAE,CAoEjG;AAqBD;;;;;;;;;GASG;AACH,wBAAsB,gBAAgB,CAAC,OAAO,EAAE,uBAAuB,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAmDhG"}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DuckDuckGo HTML search utility.
|
|
3
|
+
*
|
|
4
|
+
* Scrapes DDG's HTML endpoint (no API key required) to extract
|
|
5
|
+
* search result URLs, titles, and snippets. Includes CAPTCHA detection,
|
|
6
|
+
* DDG redirect URL cleaning, and domain-based filtering.
|
|
7
|
+
*
|
|
8
|
+
* Used by DuckDuckGoSearchSource and future news sources for
|
|
9
|
+
* `site:domain.com` style queries.
|
|
10
|
+
*/
|
|
11
|
+
import { decodeHtmlEntities } from "./html-utils.js";
|
|
12
|
+
// ============================================================================
|
|
13
|
+
// Constants
|
|
14
|
+
// ============================================================================
|
|
15
|
+
/** DDG HTML search endpoint (scraped directly; no API key required). */
|
|
16
|
+
const DDG_HTML_URL = "https://html.duckduckgo.com/html/";
|
|
17
|
+
/** Default timeout for the search fetch in milliseconds (15 seconds). */
|
|
18
|
+
const DEFAULT_TIMEOUT_MS = 15000;
|
|
19
|
+
/** Default maximum number of results to return. */
|
|
20
|
+
const DEFAULT_MAX_RESULTS = 10;
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// CAPTCHA Detection
|
|
23
|
+
// ============================================================================
|
|
24
|
+
/**
 * Report whether a DDG response body is a CAPTCHA / bot-detection page
 * rather than real search results.
 *
 * The HTML is lower-cased and searched for two markers: "anomaly-modal"
 * (DDG's CAPTCHA container) and the "bots use DuckDuckGo too" message.
 *
 * @param html - Raw HTML response body from DDG
 * @returns True if either marker is present in the page
 */
export function isDuckDuckGoCaptcha(html) {
    const haystack = html.toLowerCase();
    const markers = ["anomaly-modal", "bots use duckduckgo too"];
    return markers.some((marker) => haystack.includes(marker));
}
|
|
37
|
+
// ============================================================================
|
|
38
|
+
// URL Cleaning
|
|
39
|
+
// ============================================================================
|
|
40
|
+
/**
 * Resolve a DuckDuckGo result link to its real destination.
 *
 * DDG wraps result URLs in redirect links such as:
 * `//duckduckgo.com/l/?uddg=ENCODED_URL&rut=...`
 *
 * Behaviour:
 * 1. For links on duckduckgo.com / www.duckduckgo.com whose path starts
 *    with `/l/`, the decoded `uddg` query parameter is returned
 * 2. Protocol-relative `//` URLs get `https:` prepended
 * 3. Every other URL is returned unchanged
 *
 * @param url - URL from a DDG search result
 * @returns Cleaned URL pointing to the actual destination
 */
export function cleanDuckDuckGoUrl(url) {
    // Protocol-relative links need a scheme before they can be parsed, and
    // this is also the value returned when no redirect target is extracted.
    const absoluteUrl = url.startsWith("//") ? `https:${url}` : url;
    // Cheap pre-check: without a uddg parameter there is nothing to unwrap.
    if (!url.includes("uddg=")) {
        return absoluteUrl;
    }
    let parsed;
    try {
        parsed = new URL(absoluteUrl);
    }
    catch {
        // Not parseable as an absolute URL (e.g. a relative link).
        return absoluteUrl;
    }
    // Only trust uddg on known DDG redirect hosts and the /l/ redirect path.
    const host = parsed.hostname.toLowerCase();
    const isRedirectLink = ["duckduckgo.com", "www.duckduckgo.com"].includes(host) &&
        parsed.pathname.startsWith("/l/");
    const target = isRedirectLink ? parsed.searchParams.get("uddg") : null;
    return target ? target : absoluteUrl;
}
|
|
79
|
+
// ============================================================================
|
|
80
|
+
// HTML Extraction
|
|
81
|
+
// ============================================================================
|
|
82
|
+
/**
 * Extract search results from DuckDuckGo HTML response.
 *
 * Parses DDG's HTML structure:
 * - `class="result__url"` href for URLs (primary)
 * - `class="result__a"` for titles (and fallback URLs)
 * - `class="result__snippet"` for snippets
 *
 * If no `result__url` matches are found, falls back to `result__a` hrefs.
 * Titles and snippets are paired with URLs purely by position: the i-th URL
 * gets the i-th title and the i-th snippet, or an empty string when missing.
 *
 * Filters by domain using URL hostname parsing to prevent substring spoofing
 * (e.g., "nytimes.com.evil.com" won't match "nytimes.com"). A hostname passes
 * when it equals the filter or is a subdomain of it. A filter that is empty or
 * whitespace-only is treated as "no filter".
 *
 * @param html - Raw HTML response from DDG
 * @param domainFilter - Optional domain to filter results by
 * @returns Array of extracted search results (`{ url, title, snippet }`)
 */
export function extractUrlsFromDuckDuckGoHtml(html, domainFilter) {
    // Extract result__url hrefs
    const resultUrls = Array.from(html.matchAll(/class="result__url"\s+href="([^"]+)"/g), (m) => cleanDuckDuckGoUrl(decodeHtmlEntities(m[1])));
    // Extract result__a titles and hrefs
    const titles = Array.from(html.matchAll(/class="result__a"\s+href="([^"]+)"[^>]*>([^<]*)</g), (m) => ({
        href: cleanDuckDuckGoUrl(decodeHtmlEntities(m[1])),
        title: decodeHtmlEntities(m[2]).trim(),
    }));
    // Extract result__snippet text, stripping any inline HTML tags first
    const snippets = Array.from(html.matchAll(/class="result__snippet"[^>]*>([\s\S]*?)<\/a>/g), (m) => decodeHtmlEntities(m[1].replace(/<[^>]+>/g, "")).trim());
    // Normalize the filter once (URL.hostname is always lowercase). A blank
    // filter would otherwise match no hostname and discard every result.
    const normalizedFilter = domainFilter ? domainFilter.toLowerCase().trim() : "";
    const subdomainSuffix = `.${normalizedFilter}`;
    const passesFilter = (candidate) => {
        try {
            const hostname = new URL(candidate).hostname;
            return hostname === normalizedFilter || hostname.endsWith(subdomainSuffix);
        }
        catch {
            // If URL can't be parsed, skip it when filtering
            return false;
        }
    };
    // Build results: prefer result__url, fall back to result__a hrefs
    const primaryUrls = resultUrls.length > 0 ? resultUrls : titles.map((t) => t.href);
    const results = [];
    for (let i = 0; i < primaryUrls.length; i++) {
        const url = primaryUrls[i];
        if (normalizedFilter && !passesFilter(url)) {
            continue;
        }
        results.push({
            url,
            title: titles[i]?.title ?? "",
            snippet: snippets[i] ?? "",
        });
    }
    return results;
}
|
|
157
|
+
// ============================================================================
|
|
158
|
+
// Search Function
|
|
159
|
+
// ============================================================================
|
|
160
|
+
/**
 * Produce the AbortSignal used for a search request.
 *
 * A timeout signal is always created (falling back to the module default
 * when `timeoutMs` is null/undefined). When the caller supplied a signal as
 * well, both are merged with `AbortSignal.any()` so that whichever fires
 * first aborts the request.
 */
function buildSignal(callerSignal, timeoutMs) {
    const deadlineSignal = AbortSignal.timeout(timeoutMs ?? DEFAULT_TIMEOUT_MS);
    return callerSignal
        ? AbortSignal.any([callerSignal, deadlineSignal])
        : deadlineSignal;
}
|
|
173
|
+
/**
 * Search DuckDuckGo via its HTML endpoint.
 *
 * Fetches `https://html.duckduckgo.com/html/?q=QUERY`, optionally
 * prepending `site:domain` when domainFilter is set. Returns an empty
 * array on CAPTCHA, error (including a failure while reading the response
 * body), or non-OK response.
 *
 * @param options - Search options including query, domain filter, and limits
 * @returns Array of search results (empty on failure), capped at `maxResults`
 * @throws DOMException named "AbortError" or "TimeoutError" when the caller
 *   signal or the timeout aborts the request; these are deliberately not
 *   swallowed.
 */
export async function searchDuckDuckGo(options) {
    const { query, domainFilter, maxResults = DEFAULT_MAX_RESULTS, signal: callerSignal, timeoutMs, } = options;
    // Build the search query, prepending site: if domainFilter is set
    const fullQuery = domainFilter ? `site:${domainFilter} ${query}` : query;
    // Build the search URL
    const searchUrl = `${DDG_HTML_URL}?q=${encodeURIComponent(fullQuery)}`;
    // Build the abort signal combining caller signal with timeout
    const signal = buildSignal(callerSignal, timeoutMs);
    let html;
    try {
        const response = await fetch(searchUrl, {
            headers: {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            },
            signal,
        });
        if (!response.ok) {
            return [];
        }
        // Reading the body can fail too (connection reset, abort mid-stream), so it
        // stays inside the try block and gets the same error policy as the request.
        html = await response.text();
    }
    catch (error) {
        // Re-throw abort/timeout so BaseResearchSource.lookup() can record telemetry
        if (error instanceof DOMException &&
            (error.name === "AbortError" || error.name === "TimeoutError")) {
            throw error;
        }
        return [];
    }
    if (isDuckDuckGoCaptcha(html)) {
        return [];
    }
    const results = extractUrlsFromDuckDuckGoHtml(html, domainFilter);
    return results.slice(0, maxResults);
}
|
|
218
|
+
//# sourceMappingURL=duckduckgo-search.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"duckduckgo-search.js","sourceRoot":"","sources":["../../src/shared/duckduckgo-search.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AA8BpD,+EAA+E;AAC/E,YAAY;AACZ,+EAA+E;AAE/E,gCAAgC;AAChC,MAAM,YAAY,GAAG,mCAAmC,CAAA;AAExD,4DAA4D;AAC5D,MAAM,kBAAkB,GAAG,KAAK,CAAA;AAEhC,mDAAmD;AACnD,MAAM,mBAAmB,GAAG,EAAE,CAAA;AAE9B,+EAA+E;AAC/E,oBAAoB;AACpB,+EAA+E;AAE/E;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAChC,OAAO,KAAK,CAAC,QAAQ,CAAC,eAAe,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,yBAAyB,CAAC,CAAA;AACrF,CAAC;AAED,+EAA+E;AAC/E,eAAe;AACf,+EAA+E;AAE/E;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,kBAAkB,CAAC,GAAW;IAC5C,6EAA6E;IAC7E,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1B,MAAM,aAAa,GAAG,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,EAAE,CAAC,CAAC,CAAC,GAAG,CAAA;QACjE,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,aAAa,CAAC,CAAA;YACrC,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAA;YAC9C,MAAM,KAAK,GAAG,QAAQ,KAAK,gBAAgB,IAAI,QAAQ,KAAK,oBAAoB,CAAA;YAChF,IAAI,KAAK,IAAI,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC/C,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;gBAC5C,IAAI,IAAI,EAAE,CAAC;oBACT,OAAO,IAAI,CAAA;gBACb,CAAC;YACH,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,+BAA+B;QACjC,CAAC;IACH,CAAC;IAED,gCAAgC;IAChC,IAAI,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACzB,OAAO,SAAS,GAAG,EAAE,CAAA;IACvB,CAAC;IAED,OAAO,GAAG,CAAA;AACZ,CAAC;AAED,+EAA+E;AAC/E,kBAAkB;AAClB,+EAA+E;AAE/E;;;;;;;;;;;;;;;GAeG;AACH,MAAM,UAAU,6BAA6B,CAAC,IAAY,EAAE,YAAqB;IAC/E,4BAA4B;IAC5B,MAAM,UAAU,GAAG,uCAAuC,CAAA;IAC1D,MAAM,UAAU,GAAa,EAAE,CAAA;IAC/B,IAAI,KAA6B,CAAA;IAEjC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC7B,OAAO,KAAK,KAAK,IAAI,EAAE,CAAC;QACtB,UAAU,CAAC,IAAI,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACjE,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC/B,CAAC;IAED,qCAAqC;IACrC,MAAM,YAAY,GAAG,mDAAmD,CAAA;IACxE,MAAM,MAAM,GAA2C,EAAE,CAAA;IAEzD,KAAK,GAAG,YAAY,CAAC,IAAI,C
AAC,IAAI,CAAC,CAAA;IAC/B,OAAO,KAAK,KAAK,IAAI,EAAE,CAAC;QACtB,MAAM,CAAC,IAAI,CAAC;YACV,IAAI,EAAE,kBAAkB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACtD,KAAK,EAAE,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;SAC3C,CAAC,CAAA;QACF,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACjC,CAAC;IAED,+BAA+B;IAC/B,MAAM,cAAc,GAAG,+CAA+C,CAAA;IACtE,MAAM,QAAQ,GAAa,EAAE,CAAA;IAE7B,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACjC,OAAO,KAAK,KAAK,IAAI,EAAE,CAAC;QACtB,kDAAkD;QAClD,MAAM,UAAU,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;QACnD,QAAQ,CAAC,IAAI,CAAC,kBAAkB,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QACpD,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACnC,CAAC;IAED,kEAAkE;IAClE,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,CAAA;IACrC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;IACpE,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAA;IAEhC,MAAM,OAAO,GAAmB,EAAE,CAAA;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,CAAA;QAC1B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,IAAI,EAAE,CAAA;QACpC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QAEjC,yEAAyE;QACzE,iEAAiE;QACjE,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,gBAAgB,GAAG,YAAY,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAA;YAC1D,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAA;gBACtC,IAAI,QAAQ,KAAK,gBAAgB,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,GAAG,GAAG,gBAAgB,CAAC,EAAE,CAAC;oBAChF,SAAQ;gBACV,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,iDAAiD;gBACjD,SAAQ;YACV,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAA;IACvC,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC;AAED,+EAA+E;AAC/E,kBAAkB;AAClB,+EAA+E;AAE/E;;;;GAIG;AACH,SAAS,WAAW,CAAC,YAA0B,EAAE,SAAkB;IACjE,MAAM,OAAO,GAAG,SAAS,IAAI,kBAAkB,CAAA;IAC/C,MAAM,aAAa,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;IAElD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,OAAO,aAAa,CAAA;AAC
tB,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,OAAgC;IACrE,MAAM,EACJ,KAAK,EACL,YAAY,EACZ,UAAU,GAAG,mBAAmB,EAChC,MAAM,EAAE,YAAY,EACpB,SAAS,GACV,GAAG,OAAO,CAAA;IAEX,kEAAkE;IAClE,MAAM,SAAS,GAAG,YAAY,CAAC,CAAC,CAAC,QAAQ,YAAY,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAA;IAExE,uBAAuB;IACvB,MAAM,SAAS,GAAG,GAAG,YAAY,MAAM,kBAAkB,CAAC,SAAS,CAAC,EAAE,CAAA;IAEtE,8DAA8D;IAC9D,MAAM,MAAM,GAAG,WAAW,CAAC,YAAY,EAAE,SAAS,CAAC,CAAA;IAEnD,IAAI,QAAkB,CAAA;IACtB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;YAChC,OAAO,EAAE;gBACP,YAAY,EACV,iHAAiH;aACpH;YACD,MAAM;SACP,CAAC,CAAA;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,6EAA6E;QAC7E,IACE,KAAK,YAAY,YAAY;YAC7B,CAAC,KAAK,CAAC,IAAI,KAAK,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,CAAC,EAC9D,CAAC;YACD,MAAM,KAAK,CAAA;QACb,CAAC;QACD,OAAO,EAAE,CAAA;IACX,CAAC;IAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;IAElC,IAAI,mBAAmB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,OAAO,GAAG,6BAA6B,CAAC,IAAI,EAAE,YAAY,CAAC,CAAA;IAEjE,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAA;AACrC,CAAC"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Page fetching utility with browser-like headers and archive.org fallback.
|
|
3
|
+
*
|
|
4
|
+
* Provides a resilient page fetching pipeline:
|
|
5
|
+
* 1. Direct fetch with browser-like headers (Chrome UA, Accept text/html)
|
|
6
|
+
* 2. Block detection (hard HTTP blocks + soft body pattern matching)
|
|
7
|
+
* 3. Automatic archive.org fallback when blocked or on network error
|
|
8
|
+
* 4. Non-blocking HTTP errors (404, 500) return immediately without fallback
|
|
9
|
+
*
|
|
10
|
+
* Used by WebSearchBase when following links from search results.
|
|
11
|
+
*/
|
|
12
|
+
/** Input accepted by `fetchPage`. */
export interface FetchPageOptions {
    /** Address of the page to retrieve. */
    url: string;
    /** Caller-supplied AbortSignal; it is combined with `timeoutMs`. */
    signal?: AbortSignal;
    /** Overall time budget in milliseconds, shared by the direct and archive attempts. Default: 15000. */
    timeoutMs?: number;
    /** Value sent as the User-Agent header. Default: browser-like Chrome UA. */
    userAgent?: string;
    /** Whether archive.org is tried when the direct fetch is blocked. Default: true. */
    archiveFallback?: boolean;
}
|
|
25
|
+
/** Outcome returned by `fetchPage`. */
export interface FetchPageResult {
    /** Raw HTML of the page; an empty string when the fetch failed. */
    content: string;
    /** URL the content came from (may differ from the input if archive.org was used). */
    url: string;
    /** Which route produced the content; "none" means nothing was obtained. */
    fetchMethod: "direct" | "archive.org" | "none";
    /** Description of the failure, provided when fetchMethod is "none". */
    error?: string;
}
|
|
36
|
+
/**
 * Retrieve a page using browser-like headers, falling back to archive.org
 * automatically.
 *
 * Steps:
 * 1. Fetch the URL directly with browser-like headers
 * 2. Detect blocking (hard HTTP status codes + soft body pattern matching)
 * 3. When blocked and `archiveFallback` is enabled, try archive.org
 * 4. HTTP errors that are not blocks (404, 500) yield "none" right away
 * 5. A network error on the direct fetch also triggers the archive fallback
 *
 * @param options - Fetch options including URL, signal, timeout, etc.
 * @returns Result with content, final URL, and fetch method
 */
export declare function fetchPage(options: FetchPageOptions): Promise<FetchPageResult>;
|
|
50
|
+
//# sourceMappingURL=fetch-page.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-page.d.ts","sourceRoot":"","sources":["../../src/shared/fetch-page.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,oBAAoB;IACpB,GAAG,EAAE,MAAM,CAAA;IACX,6DAA6D;IAC7D,MAAM,CAAC,EAAE,WAAW,CAAA;IACpB,sGAAsG;IACtG,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,kEAAkE;IAClE,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,8EAA8E;IAC9E,eAAe,CAAC,EAAE,OAAO,CAAA;CAC1B;AAED,sCAAsC;AACtC,MAAM,WAAW,eAAe;IAC9B,uDAAuD;IACvD,OAAO,EAAE,MAAM,CAAA;IACf,iEAAiE;IACjE,GAAG,EAAE,MAAM,CAAA;IACX,oCAAoC;IACpC,WAAW,EAAE,QAAQ,GAAG,aAAa,GAAG,MAAM,CAAA;IAC9C,oDAAoD;IACpD,KAAK,CAAC,EAAE,MAAM,CAAA;CACf;AA2HD;;;;;;;;;;;;GAYG;AACH,wBAAsB,SAAS,CAAC,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,CAAC,CAoGnF"}
|