@debriefer/sources 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
- package/dist/__tests__/archives/chronicling-america.test.js +151 -0
- package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
- package/dist/__tests__/archives/europeana.test.d.ts +8 -0
- package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
- package/dist/__tests__/archives/europeana.test.js +200 -0
- package/dist/__tests__/archives/europeana.test.js.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.js +189 -0
- package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
- package/dist/__tests__/archives/trove.test.d.ts +8 -0
- package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
- package/dist/__tests__/archives/trove.test.js +202 -0
- package/dist/__tests__/archives/trove.test.js.map +1 -0
- package/dist/__tests__/books/google-books.test.d.ts +8 -0
- package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
- package/dist/__tests__/books/google-books.test.js +221 -0
- package/dist/__tests__/books/google-books.test.js.map +1 -0
- package/dist/__tests__/books/open-library.test.d.ts +8 -0
- package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
- package/dist/__tests__/books/open-library.test.js +159 -0
- package/dist/__tests__/books/open-library.test.js.map +1 -0
- package/dist/__tests__/news/guardian.test.d.ts +9 -0
- package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
- package/dist/__tests__/news/guardian.test.js +224 -0
- package/dist/__tests__/news/guardian.test.js.map +1 -0
- package/dist/__tests__/news/nytimes.test.d.ts +9 -0
- package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
- package/dist/__tests__/news/nytimes.test.js +271 -0
- package/dist/__tests__/news/nytimes.test.js.map +1 -0
- package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
- package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
- package/dist/__tests__/news/site-search-source.test.js +342 -0
- package/dist/__tests__/news/site-search-source.test.js.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
- package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.js +281 -0
- package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
- package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/html-utils.test.js +169 -0
- package/dist/__tests__/shared/html-utils.test.js.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.js +107 -0
- package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.js +77 -0
- package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
- package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/search-utils.test.js +26 -0
- package/dist/__tests__/shared/search-utils.test.js.map +1 -0
- package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
- package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikidata.test.js +509 -0
- package/dist/__tests__/structured/wikidata.test.js.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.js +643 -0
- package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
- package/dist/__tests__/web-search/base.test.d.ts +9 -0
- package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/base.test.js +622 -0
- package/dist/__tests__/web-search/base.test.js.map +1 -0
- package/dist/__tests__/web-search/bing.test.d.ts +10 -0
- package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/bing.test.js +277 -0
- package/dist/__tests__/web-search/bing.test.js.map +1 -0
- package/dist/__tests__/web-search/brave.test.d.ts +10 -0
- package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/brave.test.js +264 -0
- package/dist/__tests__/web-search/brave.test.js.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
- package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
- package/dist/__tests__/web-search/google.test.d.ts +9 -0
- package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/google.test.js +189 -0
- package/dist/__tests__/web-search/google.test.js.map +1 -0
- package/dist/archives/chronicling-america.d.ts +33 -0
- package/dist/archives/chronicling-america.d.ts.map +1 -0
- package/dist/archives/chronicling-america.js +85 -0
- package/dist/archives/chronicling-america.js.map +1 -0
- package/dist/archives/europeana.d.ts +37 -0
- package/dist/archives/europeana.d.ts.map +1 -0
- package/dist/archives/europeana.js +92 -0
- package/dist/archives/europeana.js.map +1 -0
- package/dist/archives/internet-archive.d.ts +32 -0
- package/dist/archives/internet-archive.d.ts.map +1 -0
- package/dist/archives/internet-archive.js +90 -0
- package/dist/archives/internet-archive.js.map +1 -0
- package/dist/archives/trove.d.ts +37 -0
- package/dist/archives/trove.d.ts.map +1 -0
- package/dist/archives/trove.js +97 -0
- package/dist/archives/trove.js.map +1 -0
- package/dist/books/google-books.d.ts +48 -0
- package/dist/books/google-books.d.ts.map +1 -0
- package/dist/books/google-books.js +111 -0
- package/dist/books/google-books.js.map +1 -0
- package/dist/books/open-library.d.ts +44 -0
- package/dist/books/open-library.d.ts.map +1 -0
- package/dist/books/open-library.js +103 -0
- package/dist/books/open-library.js.map +1 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/index.js.map +1 -0
- package/dist/news/guardian.d.ts +51 -0
- package/dist/news/guardian.d.ts.map +1 -0
- package/dist/news/guardian.js +131 -0
- package/dist/news/guardian.js.map +1 -0
- package/dist/news/nytimes.d.ts +27 -0
- package/dist/news/nytimes.d.ts.map +1 -0
- package/dist/news/nytimes.js +104 -0
- package/dist/news/nytimes.js.map +1 -0
- package/dist/news/site-search-source.d.ts +89 -0
- package/dist/news/site-search-source.d.ts.map +1 -0
- package/dist/news/site-search-source.js +182 -0
- package/dist/news/site-search-source.js.map +1 -0
- package/dist/news/sources.d.ts +52 -0
- package/dist/news/sources.d.ts.map +1 -0
- package/dist/news/sources.js +276 -0
- package/dist/news/sources.js.map +1 -0
- package/dist/obituary/find-a-grave.d.ts +43 -0
- package/dist/obituary/find-a-grave.d.ts.map +1 -0
- package/dist/obituary/find-a-grave.js +173 -0
- package/dist/obituary/find-a-grave.js.map +1 -0
- package/dist/shared/duckduckgo-search.d.ts +86 -0
- package/dist/shared/duckduckgo-search.d.ts.map +1 -0
- package/dist/shared/duckduckgo-search.js +218 -0
- package/dist/shared/duckduckgo-search.js.map +1 -0
- package/dist/shared/fetch-page.d.ts +50 -0
- package/dist/shared/fetch-page.d.ts.map +1 -0
- package/dist/shared/fetch-page.js +212 -0
- package/dist/shared/fetch-page.js.map +1 -0
- package/dist/shared/html-utils.d.ts +99 -0
- package/dist/shared/html-utils.d.ts.map +1 -0
- package/dist/shared/html-utils.js +246 -0
- package/dist/shared/html-utils.js.map +1 -0
- package/dist/shared/readability-extract.d.ts +33 -0
- package/dist/shared/readability-extract.d.ts.map +1 -0
- package/dist/shared/readability-extract.js +45 -0
- package/dist/shared/readability-extract.js.map +1 -0
- package/dist/shared/sanitize-text.d.ts +24 -0
- package/dist/shared/sanitize-text.d.ts.map +1 -0
- package/dist/shared/sanitize-text.js +49 -0
- package/dist/shared/sanitize-text.js.map +1 -0
- package/dist/shared/search-utils.d.ts +18 -0
- package/dist/shared/search-utils.d.ts.map +1 -0
- package/dist/shared/search-utils.js +20 -0
- package/dist/shared/search-utils.js.map +1 -0
- package/dist/structured/wikidata.d.ts +128 -0
- package/dist/structured/wikidata.d.ts.map +1 -0
- package/dist/structured/wikidata.js +361 -0
- package/dist/structured/wikidata.js.map +1 -0
- package/dist/structured/wikipedia.d.ts +184 -0
- package/dist/structured/wikipedia.d.ts.map +1 -0
- package/dist/structured/wikipedia.js +275 -0
- package/dist/structured/wikipedia.js.map +1 -0
- package/dist/web-search/base.d.ts +128 -0
- package/dist/web-search/base.d.ts.map +1 -0
- package/dist/web-search/base.js +251 -0
- package/dist/web-search/base.js.map +1 -0
- package/dist/web-search/bing.d.ts +21 -0
- package/dist/web-search/bing.d.ts.map +1 -0
- package/dist/web-search/bing.js +53 -0
- package/dist/web-search/bing.js.map +1 -0
- package/dist/web-search/brave.d.ts +21 -0
- package/dist/web-search/brave.d.ts.map +1 -0
- package/dist/web-search/brave.js +56 -0
- package/dist/web-search/brave.js.map +1 -0
- package/dist/web-search/duckduckgo.d.ts +15 -0
- package/dist/web-search/duckduckgo.d.ts.map +1 -0
- package/dist/web-search/duckduckgo.js +21 -0
- package/dist/web-search/duckduckgo.js.map +1 -0
- package/dist/web-search/google.d.ts +24 -0
- package/dist/web-search/google.d.ts.map +1 -0
- package/dist/web-search/google.js +48 -0
- package/dist/web-search/google.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
 * Generic Wikipedia source for encyclopedia content.
 *
 * Uses `wtf_wikipedia` to fetch and parse Wikipedia articles, producing clean
 * plaintext with no citation markers, footnotes, or HTML artifacts.
 *
 * Domain-agnostic: consumers customize which sections to extract via the
 * `sectionFilter` option. Default returns all sections. Common use cases:
 * - Death research: filter for "Death", "Health", "Illness" sections
 * - Biography research: filter for "Early life", "Personal life" sections
 * - General research: return all sections (default)
 *
 * Handles disambiguation pages by trying alternate titles with common suffixes.
 */
import { BaseResearchSource, ReliabilityTier, type BaseSourceOptions, type ResearchSubject, type RawFinding } from "@debriefer/core";
/** Metadata about a Wikipedia article section */
export interface WikipediaSection {
    /** Section index within the article */
    index: number;
    /** Section title (e.g., "Early life", "Death") */
    title: string;
    /** Depth level (0 = top-level, 1 = subsection, etc.) */
    depth: number;
}
/**
 * Function that filters Wikipedia sections to determine which to include.
 * Receives all sections and returns the ones that should be extracted.
 */
export type SectionFilter = (sections: WikipediaSection[]) => WikipediaSection[];
/**
 * Async section filter that receives all sections and the full article text.
 * Returns a promise resolving to the sections to include.
 * Takes precedence over the sync `sectionFilter` when both are provided.
 */
export type AsyncSectionFilter = (sections: WikipediaSection[], articleText: string) => Promise<WikipediaSection[]>;
/** Options for the Wikipedia source */
export interface WikipediaOptions extends BaseSourceOptions {
    /**
     * Custom section filter. Receives all article sections, returns the ones
     * to extract. Default: return all sections.
     *
     * @example
     * ```typescript
     * // Only extract death-related sections
     * sectionFilter: (sections) => sections.filter(s =>
     *   /death|illness|health|assassination/i.test(s.title)
     * )
     * ```
     */
    sectionFilter?: SectionFilter;
    /**
     * Async section filter. Receives all sections and the full article text,
     * returns a promise of which sections to include. Takes precedence over
     * the sync `sectionFilter`. Useful for AI-based section selection.
     *
     * @example
     * ```typescript
     * asyncSectionFilter: async (sections, articleText) => {
     *   const selected = await geminiSelectSections(sections, articleText)
     *   return selected
     * }
     * ```
     */
    asyncSectionFilter?: AsyncSectionFilter;
    /**
     * Whether to include the article introduction (section 0).
     * Default: true.
     */
    includeIntro?: boolean;
    /**
     * Whether to handle disambiguation pages by trying alternate titles.
     * Default: true.
     */
    handleDisambiguation?: boolean;
    /**
     * Alternate title suffixes to try if the article is a disambiguation page
     * or not found. Default: ["_(actor)", "_(actress)"].
     * Set to an empty array to disable alternate title attempts.
     */
    disambiguationSuffixes?: string[];
    /**
     * Validate that the fetched article matches the intended person.
     * Receives the full article text and the subject. When provided and
     * returns false, the source tries disambiguation suffixes before giving up.
     *
     * Supports both sync and async callbacks. An async callback (returning
     * `Promise<boolean>`) is useful for AI-based validation (e.g., Gemini Flash
     * date extraction) without blocking the event loop.
     *
     * @example
     * ```typescript
     * // Sync — simple birth-year check
     * validatePerson: (articleText, subject) => {
     *   const birthYear = subject.context?.birthYear as string
     *   return birthYear ? articleText.includes(birthYear) : true
     * }
     *
     * // Async — AI-based validation
     * validatePerson: async (articleText, subject) => {
     *   const dates = await extractDatesWithAI(articleText)
     *   return dates.birthYear === subject.context?.birthYear
     * }
     * ```
     */
    validatePerson?: (articleText: string, subject: ResearchSubject) => boolean | Promise<boolean>;
}
/**
 * Wikipedia source for encyclopedia article content.
 *
 * Fetches Wikipedia articles via `wtf_wikipedia`, extracts sections based
 * on a configurable filter, and returns clean plaintext content as a RawFinding.
 */
export declare class WikipediaSource extends BaseResearchSource<ResearchSubject> {
    readonly name = "Wikipedia";
    readonly type = "wikipedia";
    readonly reliabilityTier = ReliabilityTier.SECONDARY_COMPILATION;
    readonly domain = "en.wikipedia.org";
    readonly isFree = true;
    readonly estimatedCostPerQuery = 0;
    private sectionFilter;
    private asyncSectionFilter?;
    private includeIntro;
    private handleDisambiguation;
    private disambiguationSuffixes;
    private validatePerson?;
    constructor(options?: WikipediaOptions);
    // NOTE(review): per the implementation, wtf_wikipedia.fetch() does not
    // accept an AbortSignal — `_signal` is received but the underlying HTTP
    // request cannot be cancelled mid-flight; only the overall lookup() timeout applies.
    protected fetchResult(subject: ResearchSubject, _signal: AbortSignal): Promise<RawFinding | null>;
    /**
     * Build the search query for cache key generation.
     * Includes option-derived key material so different WikipediaSource instances
     * with different sectionFilter/includeIntro options don't collide in cache.
     */
    buildQuery(subject: ResearchSubject): string;
    /**
     * Fetch a Wikipedia document using wtf_wikipedia.
     * Returns null if the article doesn't exist. Lets other errors propagate
     * so BaseResearchSource.lookup() can record them via telemetry.
     */
    private fetchDocument;
    /**
     * Check if a document is a disambiguation page.
     */
    private isDisambig;
    /**
     * Try disambiguation suffixes to find a valid (non-disambiguation) article.
     * Returns the first valid document found, or the provided fallback if none match.
     */
    private tryDisambiguationSuffixes;
    /**
     * Extract full plaintext from a document for validation and async filtering.
     */
    private getFullText;
    /**
     * Calculate content confidence based on text length and subject name presence.
     * Returns a score between 0.3 and 0.9.
     */
    private calculateContentConfidence;
}
/**
 * Create a Wikipedia source instance.
 *
 * @example
 * ```typescript
 * // Default: all sections
 * const source = wikipedia()
 *
 * // Death research: only death-related sections
 * const deathSource = wikipedia({
 *   sectionFilter: (sections) => sections.filter(s =>
 *     /death|illness|health|assassination|final years/i.test(s.title)
 *   ),
 * })
 *
 * // Biography research: personal life sections
 * const bioSource = wikipedia({
 *   sectionFilter: (sections) => sections.filter(s =>
 *     /early life|personal|childhood|education|family/i.test(s.title)
 *   ),
 *   includeIntro: true,
 * })
 * ```
 */
export declare function wikipedia(options?: WikipediaOptions): WikipediaSource;
//# sourceMappingURL=wikipedia.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wikipedia.d.ts","sourceRoot":"","sources":["../../src/structured/wikipedia.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAIH,OAAO,EACL,kBAAkB,EAClB,eAAe,EACf,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,UAAU,EAChB,MAAM,iBAAiB,CAAA;AAYxB,iDAAiD;AACjD,MAAM,WAAW,gBAAgB;IAC/B,uCAAuC;IACvC,KAAK,EAAE,MAAM,CAAA;IACb,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAA;IACb,wDAAwD;IACxD,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,QAAQ,EAAE,gBAAgB,EAAE,KAAK,gBAAgB,EAAE,CAAA;AAEhF;;;;GAIG;AACH,MAAM,MAAM,kBAAkB,GAAG,CAC/B,QAAQ,EAAE,gBAAgB,EAAE,EAC5B,WAAW,EAAE,MAAM,KAChB,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAAA;AAEhC,uCAAuC;AACvC,MAAM,WAAW,gBAAiB,SAAQ,iBAAiB;IACzD;;;;;;;;;;;OAWG;IACH,aAAa,CAAC,EAAE,aAAa,CAAA;IAE7B;;;;;;;;;;;;OAYG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAA;IAEvC;;;OAGG;IACH,YAAY,CAAC,EAAE,OAAO,CAAA;IAEtB;;;OAGG;IACH,oBAAoB,CAAC,EAAE,OAAO,CAAA;IAE9B;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,EAAE,CAAA;IAEjC;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACH,cAAc,CAAC,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,KAAK,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,CAAA;CAC/F;AAiBD;;;;;GAKG;AACH,qBAAa,eAAgB,SAAQ,kBAAkB,CAAC,eAAe,CAAC;IACtE,QAAQ,CAAC,IAAI,eAAc;IAC3B,QAAQ,CAAC,IAAI,eAAc;IAC3B,QAAQ,CAAC,eAAe,yCAAwC;IAChE,QAAQ,CAAC,MAAM,sBAAqB;IACpC,QAAQ,CAAC,MAAM,QAAO;IACtB,QAAQ,CAAC,qBAAqB,KAAI;IAElC,OAAO,CAAC,aAAa,CAAe;IACpC,OAAO,CAAC,kBAAkB,CAAC,CAAoB;IAC/C,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,oBAAoB,CAAS;IACrC,OAAO,CAAC,sBAAsB,CAAU;IACxC,OAAO,CAAC,cAAc,CAAC,CAGQ;gBAEnB,OAAO,GAAE,gBAAqB;cAU1B,WAAW,CACzB,OAAO,EAAE,eAAe,EAIxB,OAAO,EAAE,WAAW,GACnB,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IAyG7B;;;;OAIG;IACM,UAAU,CAAC,OAAO,EAAE,eAAe,GAAG,MAAM;IAarD;;;;OAIG;YACW,aAAa;IAK3B;;OAEG;IACH,OAAO,CAAC,UAAU;IAIlB;;;OAGG;YACW,yBAAyB;IAcvC;;OAEG;IACH,OAAO,CAAC,WAAW;IAKnB;;;OAGG;IACH,OAAO,CAAC,0BAA0B;CAiCnC;AAMD;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,eAAe,CAErE"}
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generic Wikipedia source for encyclopedia content.
|
|
3
|
+
*
|
|
4
|
+
* Uses `wtf_wikipedia` to fetch and parse Wikipedia articles, producing clean
|
|
5
|
+
* plaintext with no citation markers, footnotes, or HTML artifacts.
|
|
6
|
+
*
|
|
7
|
+
* Domain-agnostic: consumers customize which sections to extract via the
|
|
8
|
+
* `sectionFilter` option. Default returns all sections. Common use cases:
|
|
9
|
+
* - Death research: filter for "Death", "Health", "Illness" sections
|
|
10
|
+
* - Biography research: filter for "Early life", "Personal life" sections
|
|
11
|
+
* - General research: return all sections (default)
|
|
12
|
+
*
|
|
13
|
+
* Handles disambiguation pages by trying alternate titles with common suffixes.
|
|
14
|
+
*/
|
|
15
|
+
import wtf from "wtf_wikipedia";
|
|
16
|
+
import { BaseResearchSource, ReliabilityTier, } from "@debriefer/core";
|
|
17
|
+
// ============================================================================
// Constants
// ============================================================================
/** Sections shorter than this many characters are dropped as too thin to use. */
const MIN_SECTION_LENGTH = 50;
// ============================================================================
// Default Section Filter
// ============================================================================
/**
 * Default section filter: keeps every section (identity — returns the
 * input array unchanged, so `buildQuery` can detect "no custom filter"
 * by reference comparison).
 */
const defaultSectionFilter = (sections) => sections;
|
|
30
|
+
// ============================================================================
|
|
31
|
+
// Source Implementation
|
|
32
|
+
// ============================================================================
|
|
33
|
+
/**
 * Wikipedia source for encyclopedia article content.
 *
 * Fetches Wikipedia articles via `wtf_wikipedia`, extracts sections based
 * on a configurable filter, and returns clean plaintext content as a RawFinding.
 */
export class WikipediaSource extends BaseResearchSource {
    name = "Wikipedia";
    type = "wikipedia";
    reliabilityTier = ReliabilityTier.SECONDARY_COMPILATION;
    domain = "en.wikipedia.org";
    isFree = true;
    estimatedCostPerQuery = 0;
    sectionFilter;
    asyncSectionFilter;
    includeIntro;
    handleDisambiguation;
    disambiguationSuffixes;
    validatePerson;
    /**
     * @param {object} [options] - WikipediaOptions; see the declaration file.
     *   Defaults: all sections, intro included, disambiguation handling on
     *   with "_(actor)" / "_(actress)" suffixes, 500ms rate limit.
     */
    constructor(options = {}) {
        super({ rateLimitMs: 500, ...options });
        this.sectionFilter = options.sectionFilter ?? defaultSectionFilter;
        this.asyncSectionFilter = options.asyncSectionFilter;
        this.includeIntro = options.includeIntro ?? true;
        this.handleDisambiguation = options.handleDisambiguation ?? true;
        this.disambiguationSuffixes = options.disambiguationSuffixes ?? ["_(actor)", "_(actress)"];
        this.validatePerson = options.validatePerson;
    }
    /**
     * Fetch the subject's article, resolve disambiguation, optionally validate
     * the person, filter sections, and build a RawFinding.
     *
     * Note: wtf_wikipedia.fetch() does not accept an AbortSignal.
     * The base class timeout/abort still applies to the overall lookup()
     * call, but the underlying HTTP request cannot be cancelled mid-flight.
     *
     * @returns {Promise<object|null>} RawFinding, or null when no usable content.
     */
    async fetchResult(subject, _signal) {
        const baseTitle = subject.name.replace(/ /g, "_");
        // Try the base title first
        let doc = await this.fetchDocument(baseTitle);
        // Handle disambiguation pages
        if (this.handleDisambiguation && (!doc || this.isDisambig(doc))) {
            doc = await this.tryDisambiguationSuffixes(baseTitle, doc);
        }
        // If we still have no valid document, return null
        if (!doc || this.isDisambig(doc))
            return null;
        // Validate person if callback is provided.
        // Track fullText so we can reuse it for asyncSectionFilter without recomputing.
        let cachedFullText;
        if (this.validatePerson) {
            cachedFullText = this.getFullText(doc);
            if (!(await this.validatePerson(cachedFullText, subject))) {
                // Validation failed — try disambiguation suffixes if enabled
                if (!this.handleDisambiguation)
                    return null;
                const altDoc = await this.tryDisambiguationSuffixes(baseTitle, null);
                if (!altDoc || this.isDisambig(altDoc))
                    return null;
                // Validate the alternate document too
                const altText = this.getFullText(altDoc);
                if (!(await this.validatePerson(altText, subject)))
                    return null;
                doc = altDoc;
                cachedFullText = altText;
            }
        }
        const sections = doc.sections();
        if (sections.length === 0)
            return null;
        // Reuse cached full text from validation, or compute once for the async filter
        const fullText = this.asyncSectionFilter ? (cachedFullText ?? this.getFullText(doc)) : undefined;
        // Map wtf sections to the WikipediaSection shape (index/title/depth)
        const wikiSections = sections.map((s, i) => ({
            index: i,
            title: s.title() || "Introduction",
            depth: s.depth(),
        }));
        // Apply section filter — async takes precedence over sync
        const selectedSections = this.asyncSectionFilter
            ? await this.asyncSectionFilter(wikiSections, fullText)
            : this.sectionFilter(wikiSections);
        // Build the set of section indices to extract
        const indicesToExtract = new Set(selectedSections.map((s) => s.index));
        // Always include intro if configured and not already in the filter result
        if (this.includeIntro && !indicesToExtract.has(0)) {
            indicesToExtract.add(0);
        }
        // If nothing to extract (filter returned empty and intro disabled), return null
        if (indicesToExtract.size === 0)
            return null;
        // Extract text from selected sections, skipping ones below MIN_SECTION_LENGTH
        const sectionTexts = [];
        const extractedTitles = [];
        // Sort indices for consistent output order (article order)
        const sortedIndices = [...indicesToExtract].sort((a, b) => a - b);
        for (const idx of sortedIndices) {
            const section = sections[idx];
            if (!section)
                continue;
            const title = section.title() || "Introduction";
            const text = section.text({});
            if (text && text.length >= MIN_SECTION_LENGTH) {
                sectionTexts.push(`[${title}] ${text}`);
                extractedTitles.push(title);
            }
        }
        if (sectionTexts.length === 0)
            return null;
        const combinedText = sectionTexts.join("\n\n");
        const resolvedTitle = doc.title() || baseTitle.replace(/_/g, " ");
        const resolvedUrl = `https://en.wikipedia.org/wiki/${encodeURIComponent(resolvedTitle.replace(/ /g, "_"))}`;
        // Calculate confidence based on content quality
        const confidence = this.calculateContentConfidence(combinedText, subject, sectionTexts.length);
        return {
            text: combinedText,
            confidence,
            costUsd: 0,
            url: resolvedUrl,
            publication: "Wikipedia",
            articleTitle: resolvedTitle,
            metadata: {
                sectionCount: sectionTexts.length,
                sectionTitles: extractedTitles,
                textLength: combinedText.length,
            },
        };
    }
    /**
     * Build the search query for cache key generation.
     * Includes option-derived key material so different WikipediaSource instances
     * with different sectionFilter/includeIntro options don't collide in cache.
     */
    buildQuery(subject) {
        const parts = [subject.name];
        if (this.asyncSectionFilter)
            parts.push("sections:async");
        else if (this.sectionFilter !== defaultSectionFilter)
            parts.push("sections:custom");
        if (this.includeIntro === false)
            parts.push("no-intro");
        if (this.validatePerson)
            parts.push("validate:person");
        if (!this.handleDisambiguation)
            parts.push("disambig:off");
        if (this.handleDisambiguation && this.disambiguationSuffixes.length > 0) {
            parts.push(`suffixes:${this.disambiguationSuffixes.join(",")}`);
        }
        return parts.join("|");
    }
    /**
     * Fetch a Wikipedia document using wtf_wikipedia.
     * Returns null if the article doesn't exist. Lets other errors propagate
     * so BaseResearchSource.lookup() can record them via telemetry.
     */
    async fetchDocument(title) {
        const doc = await wtf.fetch(title);
        return doc ?? null;
    }
    /**
     * Check if a document is a disambiguation page.
     */
    isDisambig(doc) {
        return doc.isDisambiguation();
    }
    /**
     * Try disambiguation suffixes in order to find a valid (non-disambiguation)
     * article. Returns the first valid document found, or the provided fallback
     * if none match.
     */
    async tryDisambiguationSuffixes(baseTitle, fallback) {
        for (const suffix of this.disambiguationSuffixes) {
            const altTitle = baseTitle + suffix;
            const altDoc = await this.fetchDocument(altTitle);
            if (altDoc && !this.isDisambig(altDoc)) {
                return altDoc;
            }
        }
        return fallback;
    }
    /**
     * Extract full plaintext from a document for validation and async filtering.
     */
    getFullText(doc) {
        const sections = doc.sections();
        return sections.map((s) => s.text({})).join("\n\n");
    }
    /**
     * Calculate content confidence based on text length and subject name presence.
     * Returns a score between 0.3 and 0.9.
     *
     * When requiredKeywords are configured, the base class keyword-based
     * confidence calculation takes over entirely; the base class checks for
     * confidence === -1 as the delegation signal. That check is performed
     * FIRST so the content heuristics below are not computed and discarded.
     */
    calculateContentConfidence(text, subject, sectionCount) {
        // Delegate to the base class keyword-based confidence calculation when
        // keywords are configured — skip the heuristics entirely in that case.
        if (this.options.requiredKeywords && this.options.requiredKeywords.length > 0) {
            return -1; // DELEGATE_TO_BASE_CLASS: base-source.ts:150 replaces with keyword confidence
        }
        let confidence = 0.4;
        // Name presence
        if (text.toLowerCase().includes(subject.name.toLowerCase())) {
            confidence += 0.1;
        }
        // Content length
        if (text.length > 500) {
            confidence += 0.2;
        }
        else if (text.length > 200) {
            confidence += 0.1;
        }
        // Section count bonus — use actual extracted section count, not regex on text
        if (sectionCount > 1) {
            confidence += Math.min(0.2, sectionCount * 0.05);
        }
        return Math.min(0.9, confidence);
    }
}
|
|
245
|
+
// ============================================================================
|
|
246
|
+
// Factory Function
|
|
247
|
+
// ============================================================================
|
|
248
|
+
/**
|
|
249
|
+
* Create a Wikipedia source instance.
|
|
250
|
+
*
|
|
251
|
+
* @example
|
|
252
|
+
* ```typescript
|
|
253
|
+
* // Default: all sections
|
|
254
|
+
* const source = wikipedia()
|
|
255
|
+
*
|
|
256
|
+
* // Death research: only death-related sections
|
|
257
|
+
* const deathSource = wikipedia({
|
|
258
|
+
* sectionFilter: (sections) => sections.filter(s =>
|
|
259
|
+
* /death|illness|health|assassination|final years/i.test(s.title)
|
|
260
|
+
* ),
|
|
261
|
+
* })
|
|
262
|
+
*
|
|
263
|
+
* // Biography research: personal life sections
|
|
264
|
+
* const bioSource = wikipedia({
|
|
265
|
+
* sectionFilter: (sections) => sections.filter(s =>
|
|
266
|
+
* /early life|personal|childhood|education|family/i.test(s.title)
|
|
267
|
+
* ),
|
|
268
|
+
* includeIntro: true,
|
|
269
|
+
* })
|
|
270
|
+
* ```
|
|
271
|
+
*/
|
|
272
|
+
export function wikipedia(options) {
    const source = new WikipediaSource(options);
    return source;
}
|
|
275
|
+
//# sourceMappingURL=wikipedia.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wikipedia.js","sourceRoot":"","sources":["../../src/structured/wikipedia.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,GAAG,MAAM,eAAe,CAAA;AAE/B,OAAO,EACL,kBAAkB,EAClB,eAAe,GAIhB,MAAM,iBAAiB,CAAA;AAExB,+EAA+E;AAC/E,YAAY;AACZ,+EAA+E;AAE/E,MAAM,kBAAkB,GAAG,EAAE,CAAA;AA6G7B,+EAA+E;AAC/E,yBAAyB;AACzB,+EAA+E;AAE/E;;GAEG;AACH,SAAS,oBAAoB,CAAC,QAA4B;IACxD,OAAO,QAAQ,CAAA;AACjB,CAAC;AAED,+EAA+E;AAC/E,wBAAwB;AACxB,+EAA+E;AAE/E;;;;;GAKG;AACH,MAAM,OAAO,eAAgB,SAAQ,kBAAmC;IAC7D,IAAI,GAAG,WAAW,CAAA;IAClB,IAAI,GAAG,WAAW,CAAA;IAClB,eAAe,GAAG,eAAe,CAAC,qBAAqB,CAAA;IACvD,MAAM,GAAG,kBAAkB,CAAA;IAC3B,MAAM,GAAG,IAAI,CAAA;IACb,qBAAqB,GAAG,CAAC,CAAA;IAE1B,aAAa,CAAe;IAC5B,kBAAkB,CAAqB;IACvC,YAAY,CAAS;IACrB,oBAAoB,CAAS;IAC7B,sBAAsB,CAAU;IAChC,cAAc,CAGS;IAE/B,YAAY,UAA4B,EAAE;QACxC,KAAK,CAAC,EAAE,WAAW,EAAE,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC,CAAA;QACvC,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,aAAa,IAAI,oBAAoB,CAAA;QAClE,IAAI,CAAC,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,CAAA;QACpD,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,IAAI,CAAA;QAChD,IAAI,CAAC,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,IAAI,IAAI,CAAA;QAChE,IAAI,CAAC,sBAAsB,GAAG,OAAO,CAAC,sBAAsB,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC,CAAA;QAC1F,IAAI,CAAC,cAAc,GAAG,OAAO,CAAC,cAAc,CAAA;IAC9C,CAAC;IAES,KAAK,CAAC,WAAW,CACzB,OAAwB;IACxB,8DAA8D;IAC9D,qEAAqE;IACrE,wEAAwE;IACxE,OAAoB;QAEpB,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;QAEjD,2BAA2B;QAC3B,IAAI,GAAG,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,CAAA;QAE7C,8BAA8B;QAC9B,IAAI,IAAI,CAAC,oBAAoB,IAAI,CAAC,CAAC,GAAG,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChE,GAAG,GAAG,MAAM,IAAI,CAAC,yBAAyB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;QAC5D,CAAC;QAED,kDAAkD;QAClD,IAAI,CAAC,GAAG,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAA;QAE7C,2CAA2C;QAC3C,gFAAgF;QAChF,IAAI,cAAkC,CAAA;QACtC,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACxB,cAAc,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAA;YACtC,IAAI,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,EAAE,CAAC;gBAC1D,6DAA6D;gBAC7D,IAAI,CAAC,IAAI,CAAC,oBAAoB;oBAA
E,OAAO,IAAI,CAAA;gBAC3C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,yBAAyB,CAAC,SAAS,EAAE,IAAI,CAAC,CAAA;gBACpE,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;oBAAE,OAAO,IAAI,CAAA;gBACnD,sCAAsC;gBACtC,MAAM,OAAO,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;gBACxC,IAAI,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;oBAAE,OAAO,IAAI,CAAA;gBAC/D,GAAG,GAAG,MAAM,CAAA;gBACZ,cAAc,GAAG,OAAO,CAAA;YAC1B,CAAC;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAmB,CAAA;QAChD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAEtC,2EAA2E;QAC3E,MAAM,QAAQ,GAAG,IAAI,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,cAAc,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;QAEhG,iDAAiD;QACjD,MAAM,YAAY,GAAuB,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAc,EAAE,CAAS,EAAE,EAAE,CAAC,CAAC;YACpF,KAAK,EAAE,CAAC;YACR,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,IAAI,cAAc;YAClC,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE;SACjB,CAAC,CAAC,CAAA;QAEH,0DAA0D;QAC1D,MAAM,gBAAgB,GAAG,IAAI,CAAC,kBAAkB;YAC9C,CAAC,CAAC,MAAM,IAAI,CAAC,kBAAkB,CAAC,YAAY,EAAE,QAAS,CAAC;YACxD,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,YAAY,CAAC,CAAA;QAEpC,8CAA8C;QAC9C,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAA;QAEtE,0EAA0E;QAC1E,IAAI,IAAI,CAAC,YAAY,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAClD,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QACzB,CAAC;QAED,gFAAgF;QAChF,IAAI,gBAAgB,CAAC,IAAI,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAE5C,sCAAsC;QACtC,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,MAAM,eAAe,GAAa,EAAE,CAAA;QAEpC,2DAA2D;QAC3D,MAAM,aAAa,GAAG,CAAC,GAAG,gBAAgB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;QAEjE,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;YAChC,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAA4B,CAAA;YACxD,IAAI,CAAC,OAAO;gBAAE,SAAQ;YAEtB,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,EAAE,IAAI,cAAc,CAAA;YAC/C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YAE7B,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,IAAI,kBAAkB,EAAE,CAAC;gBAC9C,YAAY,CAAC,IAAI,CAAC,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC,CAAA;gBACvC,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;YAC7B,CAAC;QACH,CAAC;QA
ED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAE1C,MAAM,YAAY,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9C,MAAM,aAAa,GAAG,GAAG,CAAC,KAAK,EAAE,IAAI,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;QACjE,MAAM,WAAW,GAAG,iCAAiC,kBAAkB,CAAC,aAAa,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,EAAE,CAAA;QAE3G,gDAAgD;QAChD,MAAM,UAAU,GAAG,IAAI,CAAC,0BAA0B,CAAC,YAAY,EAAE,OAAO,EAAE,YAAY,CAAC,MAAM,CAAC,CAAA;QAE9F,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,UAAU;YACV,OAAO,EAAE,CAAC;YACV,GAAG,EAAE,WAAW;YAChB,WAAW,EAAE,WAAW;YACxB,YAAY,EAAE,aAAa;YAC3B,QAAQ,EAAE;gBACR,YAAY,EAAE,YAAY,CAAC,MAAM;gBACjC,aAAa,EAAE,eAAe;gBAC9B,UAAU,EAAE,YAAY,CAAC,MAAM;aAChC;SACF,CAAA;IACH,CAAC;IAED;;;;OAIG;IACM,UAAU,CAAC,OAAwB;QAC1C,MAAM,KAAK,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QAC5B,IAAI,IAAI,CAAC,kBAAkB;YAAE,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAA;aACpD,IAAI,IAAI,CAAC,aAAa,KAAK,oBAAoB;YAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;QACnF,IAAI,IAAI,CAAC,YAAY,KAAK,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACvD,IAAI,IAAI,CAAC,cAAc;YAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;QACtD,IAAI,CAAC,IAAI,CAAC,oBAAoB;YAAE,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAA;QAC1D,IAAI,IAAI,CAAC,oBAAoB,IAAI,IAAI,CAAC,sBAAsB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxE,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,sBAAsB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;QACjE,CAAC;QACD,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACxB,CAAC;IAED;;;;OAIG;IACK,KAAK,CAAC,aAAa,CAAC,KAAa;QACvC,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;QAClC,OAAQ,GAAgD,IAAI,IAAI,CAAA;IAClE,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,GAAsC;QACvD,OAAO,GAAG,CAAC,gBAAgB,EAAE,CAAA;IAC/B,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,yBAAyB,CACrC,SAAiB,EACjB,QAAkD;QAElD,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,sBAAsB,EAAE,CAAC;YACjD,MAAM,QAAQ,GAAG,SAAS,GAAG,MAAM,CAAA;YACnC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAA;YACjD,IAAI,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;gBACvC,OAAO,MAAM,CAAA;YACf,CAAC;QACH,CAAC;QACD,OAAO,QAAQ,CAAA;IACjB,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,GAAsC;QACxD,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAmB,CAAA;QAChD,O
AAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAc,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClE,CAAC;IAED;;;OAGG;IACK,0BAA0B,CAChC,IAAY,EACZ,OAAwB,EACxB,YAAoB;QAEpB,IAAI,UAAU,GAAG,GAAG,CAAA;QAEpB,gBAAgB;QAChB,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YAC5D,UAAU,IAAI,GAAG,CAAA;QACnB,CAAC;QAED,iBAAiB;QACjB,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACtB,UAAU,IAAI,GAAG,CAAA;QACnB,CAAC;aAAM,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC7B,UAAU,IAAI,GAAG,CAAA;QACnB,CAAC;QAED,uEAAuE;QACvE,kEAAkE;QAClE,wEAAwE;QACxE,IAAI,IAAI,CAAC,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9E,OAAO,CAAC,CAAC,CAAA,CAAC,8EAA8E;QAC1F,CAAC;QAED,8EAA8E;QAC9E,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;YACrB,UAAU,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,YAAY,GAAG,IAAI,CAAC,CAAA;QAClD,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,UAAU,CAAC,CAAA;IAClC,CAAC;CACF;AAED,+EAA+E;AAC/E,mBAAmB;AACnB,+EAA+E;AAE/E;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,UAAU,SAAS,CAAC,OAA0B;IAClD,OAAO,IAAI,eAAe,CAAC,OAAO,CAAC,CAAA;AACrC,CAAC"}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Abstract base class for web search sources using the template method pattern.
|
|
3
|
+
*
|
|
4
|
+
* Subclasses only implement `performSearch()` — the base class handles the
|
|
5
|
+
* full pipeline: search → score/rank links → fetch pages → extract content →
|
|
6
|
+
* sanitize → combine with attribution.
|
|
7
|
+
*
|
|
8
|
+
* Used by Google, Bing, Brave, and DuckDuckGo search sources.
|
|
9
|
+
*/
|
|
10
|
+
import { BaseResearchSource, type BaseSourceOptions, type ResearchSubject, type RawFinding } from "@debriefer/core";
|
|
11
|
+
/** A single result from a web search engine. */
export interface WebSearchResult {
    /** Result URL. Scored/filtered by domain before the page is fetched. */
    url: string;
    /** Result title; searched for boost/penalty keywords during scoring. */
    title: string;
    /** Result snippet; searched for boost/penalty keywords during scoring. */
    snippet: string;
}
|
|
17
|
+
/**
 * Options for scoring and filtering search result links.
 * All fields are optional; scoring starts from the engine's own ordering
 * (see WebSearchBase.scoreAndRank) and these adjust or exclude results.
 */
export interface LinkSelectionOptions {
    /** Domain → 0-100 score. Adds to link score when hostname matches. */
    domainScores?: Record<string, number>;
    /** Keywords that boost a result's score when found in title+snippet. */
    boostKeywords?: Array<{
        keyword: string;
        boost: number;
    }>;
    /** Keywords that penalize a result's score when found in title+snippet. */
    penaltyKeywords?: Array<{
        keyword: string;
        penalty: number;
    }>;
    /**
     * Domains to completely exclude from results.
     * NOTE(review): presumably matched by exact-or-subdomain like other domain
     * checks (see WebSearchBase.isDomainBlocked) — confirm against implementation.
     */
    blockedDomains?: string[];
}
|
|
34
|
+
/** Options for WebSearchBase sources, combining base source and link selection options. */
export interface WebSearchOptions extends BaseSourceOptions, LinkSelectionOptions {
    /** Maximum number of search result pages to fetch. Default: 3. */
    maxLinksToFollow?: number;
    /** Minimum extracted text length in characters. Pages below this are filtered. Default: 200. */
    minContentLength?: number;
    /**
     * Maximum cost in USD for link following per subject. When set, the source
     * tracks cumulative fetch cost and stops following links when the budget
     * is exhausted. Default: unlimited.
     */
    maxLinkCost?: number;
    /**
     * Custom link selector that filters/reorders search results before fetching.
     * Receives ranked results and the subject, returns the results to follow.
     * May be synchronous or return a promise.
     * Useful for AI-assisted link selection (e.g., Claude ranking URLs by relevance).
     * Applied after scoring/ranking but before the maxLinksToFollow limit.
     */
    linkSelector?: (results: WebSearchResult[], subject: ResearchSubject) => Promise<WebSearchResult[]> | WebSearchResult[];
    /**
     * Custom page fetcher that replaces the default fetch+readability pipeline.
     * Useful for browser-based fetching (Playwright) or sites requiring
     * authentication/fingerprinting. Receives an AbortSignal for cancellation.
     * Returns extracted text or null on failure.
     */
    fetchPage?: (url: string, signal: AbortSignal) => Promise<string | null>;
}
|
|
60
|
+
/**
 * Abstract base class for web search sources.
 *
 * Implements the template method pattern: subclasses provide `performSearch()`
 * and this class handles the rest of the pipeline (scoring, fetching,
 * extracting, sanitizing, combining).
 */
export declare abstract class WebSearchBase extends BaseResearchSource<ResearchSubject> {
    /** Maximum number of result pages fetched per subject (see WebSearchOptions; default 3). */
    protected readonly maxLinksToFollow: number;
    /** Minimum extracted text length in characters (see WebSearchOptions; default 200). */
    protected readonly minContentLength: number;
    /** Domain → 0-100 score added when a result's hostname matches (see LinkSelectionOptions). */
    protected readonly domainScores: Record<string, number>;
    /** Keywords boosting a result's score when found in title+snippet. */
    protected readonly boostKeywords: Array<{
        keyword: string;
        boost: number;
    }>;
    /** Keywords penalizing a result's score when found in title+snippet. */
    protected readonly penaltyKeywords: Array<{
        keyword: string;
        penalty: number;
    }>;
    /** Domains excluded from results entirely (see isDomainBlocked). */
    protected readonly blockedDomains: string[];
    /** USD budget for link following per subject; undefined = unlimited. */
    private readonly maxLinkCost?;
    /** Optional custom filter/reorder step applied before fetching (WebSearchOptions.linkSelector). */
    private readonly linkSelector?;
    /** Optional replacement page fetcher (WebSearchOptions.fetchPage). */
    private readonly customFetchPage?;
    constructor(options?: WebSearchOptions);
    /**
     * Subclass-specific search API call. Returns raw search results.
     *
     * @param query - The search query string
     * @param signal - Abort signal for cancellation
     * @returns Array of search results (URL, title, snippet)
     */
    protected abstract performSearch(query: string, signal: AbortSignal): Promise<WebSearchResult[]>;
    /**
     * Full search pipeline: search → score → fetch → extract → combine.
     *
     * @param subject - The research subject
     * @param signal - Abort signal for cancellation
     * @returns RawFinding with combined text, or null if no content extracted
     */
    protected fetchResult(subject: ResearchSubject, signal: AbortSignal): Promise<RawFinding | null>;
    /**
     * Score and rank search results by relevance.
     *
     * Scoring:
     * - Base: 50 - index (preserves search engine ordering)
     * - + domainScores[domain] if hostname matches
     * - + boost for each boostKeyword found in title+snippet
     * - - penalty for each penaltyKeyword found in title+snippet
     */
    private scoreAndRank;
    /**
     * Check whether a URL should be excluded (blocked domain or unsafe URL).
     * Blocks: non-http(s) schemes, localhost, private IP ranges, and user-specified domains.
     */
    protected isDomainBlocked(url: string): boolean;
    /**
     * Check if a hostname matches a domain (exact or subdomain match).
     * Normalizes domain to lowercase since URL.hostname is always lowercase.
     *
     * "www.example.com" matches "example.com"
     * "sub.example.com" matches "example.com"
     * "example.com" matches "example.com"
     * "notexample.com" does NOT match "example.com"
     */
    private hostnameMatchesDomain;
    /** Check if hostname is in the 172.16.0.0–172.31.255.255 private range (RFC 1918). */
    private isPrivate172;
}
|
|
128
|
+
//# sourceMappingURL=base.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base.d.ts","sourceRoot":"","sources":["../../src/web-search/base.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,kBAAkB,EAClB,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,UAAU,EAChB,MAAM,iBAAiB,CAAA;AASxB,gDAAgD;AAChD,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAA;IACX,KAAK,EAAE,MAAM,CAAA;IACb,OAAO,EAAE,MAAM,CAAA;CAChB;AAED,6DAA6D;AAC7D,MAAM,WAAW,oBAAoB;IACnC,sEAAsE;IACtE,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACrC,wEAAwE;IACxE,aAAa,CAAC,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IACzD,2EAA2E;IAC3E,eAAe,CAAC,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAC7D,kDAAkD;IAClD,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;CAC1B;AAED,2FAA2F;AAC3F,MAAM,WAAW,gBAAiB,SAAQ,iBAAiB,EAAE,oBAAoB;IAC/E,kEAAkE;IAClE,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB,gGAAgG;IAChG,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB;;;;;OAKG;IACH,YAAY,CAAC,EAAE,CACb,OAAO,EAAE,eAAe,EAAE,EAC1B,OAAO,EAAE,eAAe,KACrB,OAAO,CAAC,eAAe,EAAE,CAAC,GAAG,eAAe,EAAE,CAAA;IACnD;;;;OAIG;IACH,SAAS,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,KAAK,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAAA;CACzE;AAeD;;;;;;GAMG;AACH,8BAAsB,aAAc,SAAQ,kBAAkB,CAAC,eAAe,CAAC;IAC7E,SAAS,CAAC,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAA;IAC3C,SAAS,CAAC,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAA;IAC3C,SAAS,CAAC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACvD,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAC3E,SAAS,CAAC,QAAQ,CAAC,eAAe,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAC/E,SAAS,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,EAAE,CAAA;IAC3C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAQ;IACrC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAkC;IAChE,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAA+B;gBAEpD,OAAO,GAAE,gBAAqB;IAa1C;;;;;;OAMG;IACH,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAEhG;;;;;;OAMG;cACa,WAAW,CACzB,OAAO,EAAE,eAAe,EACxB,MAAM,EAAE,WAAW,GA
ClB,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IA4G7B;;;;;;;;OAQG;IACH,OAAO,CAAC,YAAY;IA2CpB;;;OAGG;IACH,SAAS,CAAC,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IA0C/C;;;;;;;;OAQG;IACH,OAAO,CAAC,qBAAqB;IAK7B,sFAAsF;IACtF,OAAO,CAAC,YAAY;CAKrB"}
|