@debriefer/sources 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. package/README.md +59 -0
  2. package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
  3. package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
  4. package/dist/__tests__/archives/chronicling-america.test.js +151 -0
  5. package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
  6. package/dist/__tests__/archives/europeana.test.d.ts +8 -0
  7. package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
  8. package/dist/__tests__/archives/europeana.test.js +200 -0
  9. package/dist/__tests__/archives/europeana.test.js.map +1 -0
  10. package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
  11. package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
  12. package/dist/__tests__/archives/internet-archive.test.js +189 -0
  13. package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
  14. package/dist/__tests__/archives/trove.test.d.ts +8 -0
  15. package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
  16. package/dist/__tests__/archives/trove.test.js +202 -0
  17. package/dist/__tests__/archives/trove.test.js.map +1 -0
  18. package/dist/__tests__/books/google-books.test.d.ts +8 -0
  19. package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
  20. package/dist/__tests__/books/google-books.test.js +221 -0
  21. package/dist/__tests__/books/google-books.test.js.map +1 -0
  22. package/dist/__tests__/books/open-library.test.d.ts +8 -0
  23. package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
  24. package/dist/__tests__/books/open-library.test.js +159 -0
  25. package/dist/__tests__/books/open-library.test.js.map +1 -0
  26. package/dist/__tests__/news/guardian.test.d.ts +9 -0
  27. package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
  28. package/dist/__tests__/news/guardian.test.js +224 -0
  29. package/dist/__tests__/news/guardian.test.js.map +1 -0
  30. package/dist/__tests__/news/nytimes.test.d.ts +9 -0
  31. package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
  32. package/dist/__tests__/news/nytimes.test.js +271 -0
  33. package/dist/__tests__/news/nytimes.test.js.map +1 -0
  34. package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
  35. package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
  36. package/dist/__tests__/news/site-search-source.test.js +342 -0
  37. package/dist/__tests__/news/site-search-source.test.js.map +1 -0
  38. package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
  39. package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
  40. package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
  41. package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
  42. package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
  43. package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
  44. package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
  45. package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
  46. package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
  47. package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
  48. package/dist/__tests__/shared/fetch-page.test.js +281 -0
  49. package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
  50. package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
  51. package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
  52. package/dist/__tests__/shared/html-utils.test.js +169 -0
  53. package/dist/__tests__/shared/html-utils.test.js.map +1 -0
  54. package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
  55. package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
  56. package/dist/__tests__/shared/readability-extract.test.js +107 -0
  57. package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
  58. package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
  59. package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
  60. package/dist/__tests__/shared/sanitize-text.test.js +77 -0
  61. package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
  62. package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
  63. package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
  64. package/dist/__tests__/shared/search-utils.test.js +26 -0
  65. package/dist/__tests__/shared/search-utils.test.js.map +1 -0
  66. package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
  67. package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
  68. package/dist/__tests__/structured/wikidata.test.js +509 -0
  69. package/dist/__tests__/structured/wikidata.test.js.map +1 -0
  70. package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
  71. package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
  72. package/dist/__tests__/structured/wikipedia.test.js +643 -0
  73. package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
  74. package/dist/__tests__/web-search/base.test.d.ts +9 -0
  75. package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
  76. package/dist/__tests__/web-search/base.test.js +622 -0
  77. package/dist/__tests__/web-search/base.test.js.map +1 -0
  78. package/dist/__tests__/web-search/bing.test.d.ts +10 -0
  79. package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
  80. package/dist/__tests__/web-search/bing.test.js +277 -0
  81. package/dist/__tests__/web-search/bing.test.js.map +1 -0
  82. package/dist/__tests__/web-search/brave.test.d.ts +10 -0
  83. package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
  84. package/dist/__tests__/web-search/brave.test.js +264 -0
  85. package/dist/__tests__/web-search/brave.test.js.map +1 -0
  86. package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
  87. package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
  88. package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
  89. package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
  90. package/dist/__tests__/web-search/google.test.d.ts +9 -0
  91. package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
  92. package/dist/__tests__/web-search/google.test.js +189 -0
  93. package/dist/__tests__/web-search/google.test.js.map +1 -0
  94. package/dist/archives/chronicling-america.d.ts +33 -0
  95. package/dist/archives/chronicling-america.d.ts.map +1 -0
  96. package/dist/archives/chronicling-america.js +85 -0
  97. package/dist/archives/chronicling-america.js.map +1 -0
  98. package/dist/archives/europeana.d.ts +37 -0
  99. package/dist/archives/europeana.d.ts.map +1 -0
  100. package/dist/archives/europeana.js +92 -0
  101. package/dist/archives/europeana.js.map +1 -0
  102. package/dist/archives/internet-archive.d.ts +32 -0
  103. package/dist/archives/internet-archive.d.ts.map +1 -0
  104. package/dist/archives/internet-archive.js +90 -0
  105. package/dist/archives/internet-archive.js.map +1 -0
  106. package/dist/archives/trove.d.ts +37 -0
  107. package/dist/archives/trove.d.ts.map +1 -0
  108. package/dist/archives/trove.js +97 -0
  109. package/dist/archives/trove.js.map +1 -0
  110. package/dist/books/google-books.d.ts +48 -0
  111. package/dist/books/google-books.d.ts.map +1 -0
  112. package/dist/books/google-books.js +111 -0
  113. package/dist/books/google-books.js.map +1 -0
  114. package/dist/books/open-library.d.ts +44 -0
  115. package/dist/books/open-library.d.ts.map +1 -0
  116. package/dist/books/open-library.js +103 -0
  117. package/dist/books/open-library.js.map +1 -0
  118. package/dist/index.d.ts +45 -0
  119. package/dist/index.d.ts.map +1 -0
  120. package/dist/index.js +35 -0
  121. package/dist/index.js.map +1 -0
  122. package/dist/news/guardian.d.ts +51 -0
  123. package/dist/news/guardian.d.ts.map +1 -0
  124. package/dist/news/guardian.js +131 -0
  125. package/dist/news/guardian.js.map +1 -0
  126. package/dist/news/nytimes.d.ts +27 -0
  127. package/dist/news/nytimes.d.ts.map +1 -0
  128. package/dist/news/nytimes.js +104 -0
  129. package/dist/news/nytimes.js.map +1 -0
  130. package/dist/news/site-search-source.d.ts +89 -0
  131. package/dist/news/site-search-source.d.ts.map +1 -0
  132. package/dist/news/site-search-source.js +182 -0
  133. package/dist/news/site-search-source.js.map +1 -0
  134. package/dist/news/sources.d.ts +52 -0
  135. package/dist/news/sources.d.ts.map +1 -0
  136. package/dist/news/sources.js +276 -0
  137. package/dist/news/sources.js.map +1 -0
  138. package/dist/obituary/find-a-grave.d.ts +43 -0
  139. package/dist/obituary/find-a-grave.d.ts.map +1 -0
  140. package/dist/obituary/find-a-grave.js +173 -0
  141. package/dist/obituary/find-a-grave.js.map +1 -0
  142. package/dist/shared/duckduckgo-search.d.ts +86 -0
  143. package/dist/shared/duckduckgo-search.d.ts.map +1 -0
  144. package/dist/shared/duckduckgo-search.js +218 -0
  145. package/dist/shared/duckduckgo-search.js.map +1 -0
  146. package/dist/shared/fetch-page.d.ts +50 -0
  147. package/dist/shared/fetch-page.d.ts.map +1 -0
  148. package/dist/shared/fetch-page.js +212 -0
  149. package/dist/shared/fetch-page.js.map +1 -0
  150. package/dist/shared/html-utils.d.ts +99 -0
  151. package/dist/shared/html-utils.d.ts.map +1 -0
  152. package/dist/shared/html-utils.js +246 -0
  153. package/dist/shared/html-utils.js.map +1 -0
  154. package/dist/shared/readability-extract.d.ts +33 -0
  155. package/dist/shared/readability-extract.d.ts.map +1 -0
  156. package/dist/shared/readability-extract.js +45 -0
  157. package/dist/shared/readability-extract.js.map +1 -0
  158. package/dist/shared/sanitize-text.d.ts +24 -0
  159. package/dist/shared/sanitize-text.d.ts.map +1 -0
  160. package/dist/shared/sanitize-text.js +49 -0
  161. package/dist/shared/sanitize-text.js.map +1 -0
  162. package/dist/shared/search-utils.d.ts +18 -0
  163. package/dist/shared/search-utils.d.ts.map +1 -0
  164. package/dist/shared/search-utils.js +20 -0
  165. package/dist/shared/search-utils.js.map +1 -0
  166. package/dist/structured/wikidata.d.ts +128 -0
  167. package/dist/structured/wikidata.d.ts.map +1 -0
  168. package/dist/structured/wikidata.js +361 -0
  169. package/dist/structured/wikidata.js.map +1 -0
  170. package/dist/structured/wikipedia.d.ts +184 -0
  171. package/dist/structured/wikipedia.d.ts.map +1 -0
  172. package/dist/structured/wikipedia.js +275 -0
  173. package/dist/structured/wikipedia.js.map +1 -0
  174. package/dist/web-search/base.d.ts +128 -0
  175. package/dist/web-search/base.d.ts.map +1 -0
  176. package/dist/web-search/base.js +251 -0
  177. package/dist/web-search/base.js.map +1 -0
  178. package/dist/web-search/bing.d.ts +21 -0
  179. package/dist/web-search/bing.d.ts.map +1 -0
  180. package/dist/web-search/bing.js +53 -0
  181. package/dist/web-search/bing.js.map +1 -0
  182. package/dist/web-search/brave.d.ts +21 -0
  183. package/dist/web-search/brave.d.ts.map +1 -0
  184. package/dist/web-search/brave.js +56 -0
  185. package/dist/web-search/brave.js.map +1 -0
  186. package/dist/web-search/duckduckgo.d.ts +15 -0
  187. package/dist/web-search/duckduckgo.d.ts.map +1 -0
  188. package/dist/web-search/duckduckgo.js +21 -0
  189. package/dist/web-search/duckduckgo.js.map +1 -0
  190. package/dist/web-search/google.d.ts +24 -0
  191. package/dist/web-search/google.d.ts.map +1 -0
  192. package/dist/web-search/google.js +48 -0
  193. package/dist/web-search/google.js.map +1 -0
  194. package/package.json +58 -0
@@ -0,0 +1,184 @@
1
+ /**
2
+ * Generic Wikipedia source for encyclopedia content.
3
+ *
4
+ * Uses `wtf_wikipedia` to fetch and parse Wikipedia articles, producing clean
5
+ * plaintext with no citation markers, footnotes, or HTML artifacts.
6
+ *
7
+ * Domain-agnostic: consumers customize which sections to extract via the
8
+ * `sectionFilter` option. Default returns all sections. Common use cases:
9
+ * - Death research: filter for "Death", "Health", "Illness" sections
10
+ * - Biography research: filter for "Early life", "Personal life" sections
11
+ * - General research: return all sections (default)
12
+ *
13
+ * Handles disambiguation pages by trying alternate titles with common suffixes.
14
+ */
15
+ import { BaseResearchSource, ReliabilityTier, type BaseSourceOptions, type ResearchSubject, type RawFinding } from "@debriefer/core";
16
+ /** Metadata about a Wikipedia article section */
17
+ export interface WikipediaSection {
18
+ /** Section index within the article */
19
+ index: number;
20
+ /** Section title (e.g., "Early life", "Death") */
21
+ title: string;
22
+ /** Depth level (0 = top-level, 1 = subsection, etc.) */
23
+ depth: number;
24
+ }
25
+ /**
26
+ * Function that filters Wikipedia sections to determine which to include.
27
+ * Receives all sections and returns the ones that should be extracted.
28
+ */
29
+ export type SectionFilter = (sections: WikipediaSection[]) => WikipediaSection[];
30
+ /**
31
+ * Async section filter that receives all sections and the full article text.
32
+ * Returns a promise resolving to the sections to include.
33
+ * Takes precedence over the sync `sectionFilter` when both are provided.
34
+ */
35
+ export type AsyncSectionFilter = (sections: WikipediaSection[], articleText: string) => Promise<WikipediaSection[]>;
36
+ /** Options for the Wikipedia source */
37
+ export interface WikipediaOptions extends BaseSourceOptions {
38
+ /**
39
+ * Custom section filter. Receives all article sections, returns the ones
40
+ * to extract. Default: return all sections.
41
+ *
42
+ * @example
43
+ * ```typescript
44
+ * // Only extract death-related sections
45
+ * sectionFilter: (sections) => sections.filter(s =>
46
+ * /death|illness|health|assassination/i.test(s.title)
47
+ * )
48
+ * ```
49
+ */
50
+ sectionFilter?: SectionFilter;
51
+ /**
52
+ * Async section filter. Receives all sections and the full article text,
53
+ * returns a promise of which sections to include. Takes precedence over
54
+ * the sync `sectionFilter`. Useful for AI-based section selection.
55
+ *
56
+ * @example
57
+ * ```typescript
58
+ * asyncSectionFilter: async (sections, articleText) => {
59
+ * const selected = await geminiSelectSections(sections, articleText)
60
+ * return selected
61
+ * }
62
+ * ```
63
+ */
64
+ asyncSectionFilter?: AsyncSectionFilter;
65
+ /**
66
+ * Whether to include the article introduction (section 0).
67
+ * Default: true.
68
+ */
69
+ includeIntro?: boolean;
70
+ /**
71
+ * Whether to handle disambiguation pages by trying alternate titles.
72
+ * Default: true.
73
+ */
74
+ handleDisambiguation?: boolean;
75
+ /**
76
+ * Alternate title suffixes to try if the article is a disambiguation page
77
+ * or not found. Default: ["_(actor)", "_(actress)"].
78
+ * Set to an empty array to disable alternate title attempts.
79
+ */
80
+ disambiguationSuffixes?: string[];
81
+ /**
82
+ * Validate that the fetched article matches the intended person.
83
+ * Receives the full article text and the subject. When it is provided and
84
+ * returns false, the source tries disambiguation suffixes (only if `handleDisambiguation` is enabled) before giving up.
85
+ *
86
+ * Supports both sync and async callbacks. An async callback (returning
87
+ * `Promise<boolean>`) is useful for AI-based validation (e.g., Gemini Flash
88
+ * date extraction) without blocking the event loop.
89
+ *
90
+ * @example
91
+ * ```typescript
92
+ * // Sync — simple birth-year check
93
+ * validatePerson: (articleText, subject) => {
94
+ * const birthYear = subject.context?.birthYear as string
95
+ * return birthYear ? articleText.includes(birthYear) : true
96
+ * }
97
+ *
98
+ * // Async — AI-based validation
99
+ * validatePerson: async (articleText, subject) => {
100
+ * const dates = await extractDatesWithAI(articleText)
101
+ * return dates.birthYear === subject.context?.birthYear
102
+ * }
103
+ * ```
104
+ */
105
+ validatePerson?: (articleText: string, subject: ResearchSubject) => boolean | Promise<boolean>;
106
+ }
107
+ /**
108
+ * Wikipedia source for encyclopedia article content.
109
+ *
110
+ * Fetches Wikipedia articles via `wtf_wikipedia`, extracts sections based
111
+ * on a configurable filter, and returns clean plaintext content as a RawFinding.
112
+ */
113
+ export declare class WikipediaSource extends BaseResearchSource<ResearchSubject> {
114
+ readonly name = "Wikipedia";
115
+ readonly type = "wikipedia";
116
+ readonly reliabilityTier = ReliabilityTier.SECONDARY_COMPILATION;
117
+ readonly domain = "en.wikipedia.org";
118
+ readonly isFree = true;
119
+ readonly estimatedCostPerQuery = 0;
120
+ private sectionFilter;
121
+ private asyncSectionFilter?;
122
+ private includeIntro;
123
+ private handleDisambiguation;
124
+ private disambiguationSuffixes;
125
+ private validatePerson?;
126
+ constructor(options?: WikipediaOptions);
127
+ protected fetchResult(subject: ResearchSubject, _signal: AbortSignal): Promise<RawFinding | null>;
128
+ /**
129
+ * Build the search query for cache key generation.
130
+ * Includes option-derived key material so different WikipediaSource instances
131
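+ * with different option shapes (filter kind, includeIntro, validation, disambiguation) don't collide in cache. Note that all custom sync filters share the single `sections:custom` token.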
132
+ */
133
+ buildQuery(subject: ResearchSubject): string;
134
+ /**
135
+ * Fetch a Wikipedia document using wtf_wikipedia.
136
+ * Returns null if the article doesn't exist. Lets other errors propagate
137
+ * so BaseResearchSource.lookup() can record them via telemetry.
138
+ */
139
+ private fetchDocument;
140
+ /**
141
+ * Check if a document is a disambiguation page.
142
+ */
143
+ private isDisambig;
144
+ /**
145
+ * Try disambiguation suffixes to find a valid (non-disambiguation) article.
146
+ * Returns the first valid document found, or the provided fallback if none match.
147
+ */
148
+ private tryDisambiguationSuffixes;
149
+ /**
150
+ * Extract full plaintext from a document for validation and async filtering.
151
+ */
152
+ private getFullText;
153
+ /**
154
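+ * Calculate content confidence from text length, subject name presence, and extracted section count.
155
+ * Returns a score between 0.4 and 0.9, or -1 when `requiredKeywords` is configured (the signal for the base class to compute keyword-based confidence instead).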
156
+ */
157
+ private calculateContentConfidence;
158
+ }
159
+ /**
160
+ * Create a Wikipedia source instance.
161
+ *
162
+ * @example
163
+ * ```typescript
164
+ * // Default: all sections
165
+ * const source = wikipedia()
166
+ *
167
+ * // Death research: only death-related sections
168
+ * const deathSource = wikipedia({
169
+ * sectionFilter: (sections) => sections.filter(s =>
170
+ * /death|illness|health|assassination|final years/i.test(s.title)
171
+ * ),
172
+ * })
173
+ *
174
+ * // Biography research: personal life sections
175
+ * const bioSource = wikipedia({
176
+ * sectionFilter: (sections) => sections.filter(s =>
177
+ * /early life|personal|childhood|education|family/i.test(s.title)
178
+ * ),
179
+ * includeIntro: true,
180
+ * })
181
+ * ```
182
+ */
183
+ export declare function wikipedia(options?: WikipediaOptions): WikipediaSource;
184
+ //# sourceMappingURL=wikipedia.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"wikipedia.d.ts","sourceRoot":"","sources":["../../src/structured/wikipedia.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAIH,OAAO,EACL,kBAAkB,EAClB,eAAe,EACf,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,UAAU,EAChB,MAAM,iBAAiB,CAAA;AAYxB,iDAAiD;AACjD,MAAM,WAAW,gBAAgB;IAC/B,uCAAuC;IACvC,KAAK,EAAE,MAAM,CAAA;IACb,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAA;IACb,wDAAwD;IACxD,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,QAAQ,EAAE,gBAAgB,EAAE,KAAK,gBAAgB,EAAE,CAAA;AAEhF;;;;GAIG;AACH,MAAM,MAAM,kBAAkB,GAAG,CAC/B,QAAQ,EAAE,gBAAgB,EAAE,EAC5B,WAAW,EAAE,MAAM,KAChB,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAAA;AAEhC,uCAAuC;AACvC,MAAM,WAAW,gBAAiB,SAAQ,iBAAiB;IACzD;;;;;;;;;;;OAWG;IACH,aAAa,CAAC,EAAE,aAAa,CAAA;IAE7B;;;;;;;;;;;;OAYG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAA;IAEvC;;;OAGG;IACH,YAAY,CAAC,EAAE,OAAO,CAAA;IAEtB;;;OAGG;IACH,oBAAoB,CAAC,EAAE,OAAO,CAAA;IAE9B;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,EAAE,CAAA;IAEjC;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACH,cAAc,CAAC,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,KAAK,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,CAAA;CAC/F;AAiBD;;;;;GAKG;AACH,qBAAa,eAAgB,SAAQ,kBAAkB,CAAC,eAAe,CAAC;IACtE,QAAQ,CAAC,IAAI,eAAc;IAC3B,QAAQ,CAAC,IAAI,eAAc;IAC3B,QAAQ,CAAC,eAAe,yCAAwC;IAChE,QAAQ,CAAC,MAAM,sBAAqB;IACpC,QAAQ,CAAC,MAAM,QAAO;IACtB,QAAQ,CAAC,qBAAqB,KAAI;IAElC,OAAO,CAAC,aAAa,CAAe;IACpC,OAAO,CAAC,kBAAkB,CAAC,CAAoB;IAC/C,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,oBAAoB,CAAS;IACrC,OAAO,CAAC,sBAAsB,CAAU;IACxC,OAAO,CAAC,cAAc,CAAC,CAGQ;gBAEnB,OAAO,GAAE,gBAAqB;cAU1B,WAAW,CACzB,OAAO,EAAE,eAAe,EAIxB,OAAO,EAAE,WAAW,GACnB,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IAyG7B;;;;OAIG;IACM,UAAU,CAAC,OAAO,EAAE,eAAe,GAAG,MAAM;IAarD;;;;OAIG;YACW,aAAa;IAK3B;;OAEG;IACH,OAAO,CAAC,UAAU;IAIlB;;;OAGG;YACW,yBAAyB;IAcvC;;OAEG;IACH,OAAO,CAAC,WAAW;IAKnB;;;OAGG;IACH,OAAO,CAAC,0BAA0B;CAiCnC;AAMD;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,eAAe,CAErE"}
@@ -0,0 +1,275 @@
1
+ /**
2
+ * Generic Wikipedia source for encyclopedia content.
3
+ *
4
+ * Uses `wtf_wikipedia` to fetch and parse Wikipedia articles, producing clean
5
+ * plaintext with no citation markers, footnotes, or HTML artifacts.
6
+ *
7
+ * Domain-agnostic: consumers customize which sections to extract via the
8
+ * `sectionFilter` option. Default returns all sections. Common use cases:
9
+ * - Death research: filter for "Death", "Health", "Illness" sections
10
+ * - Biography research: filter for "Early life", "Personal life" sections
11
+ * - General research: return all sections (default)
12
+ *
13
+ * Handles disambiguation pages by trying alternate titles with common suffixes.
14
+ */
15
+ import wtf from "wtf_wikipedia";
16
+ import { BaseResearchSource, ReliabilityTier, } from "@debriefer/core";
17
+ // ============================================================================
18
+ // Constants
19
+ // ============================================================================
20
+ const MIN_SECTION_LENGTH = 50;
21
+ // ============================================================================
22
+ // Default Section Filter
23
+ // ============================================================================
24
+ /**
25
+ * Default section filter: returns all sections.
26
+ */
27
+ function defaultSectionFilter(sections) {
28
+ return sections;
29
+ }
30
+ // ============================================================================
31
+ // Source Implementation
32
+ // ============================================================================
33
+ /**
34
+ * Wikipedia source for encyclopedia article content.
35
+ *
36
+ * Fetches Wikipedia articles via `wtf_wikipedia`, extracts sections based
37
+ * on a configurable filter, and returns clean plaintext content as a RawFinding.
38
+ */
39
+ export class WikipediaSource extends BaseResearchSource {
40
+ name = "Wikipedia";
41
+ type = "wikipedia";
42
+ reliabilityTier = ReliabilityTier.SECONDARY_COMPILATION;
43
+ domain = "en.wikipedia.org";
44
+ isFree = true;
45
+ estimatedCostPerQuery = 0;
46
+ sectionFilter;
47
+ asyncSectionFilter;
48
+ includeIntro;
49
+ handleDisambiguation;
50
+ disambiguationSuffixes;
51
+ validatePerson;
52
+ constructor(options = {}) {
53
+ super({ rateLimitMs: 500, ...options });
54
+ this.sectionFilter = options.sectionFilter ?? defaultSectionFilter;
55
+ this.asyncSectionFilter = options.asyncSectionFilter;
56
+ this.includeIntro = options.includeIntro ?? true;
57
+ this.handleDisambiguation = options.handleDisambiguation ?? true;
58
+ this.disambiguationSuffixes = options.disambiguationSuffixes ?? ["_(actor)", "_(actress)"];
59
+ this.validatePerson = options.validatePerson;
60
+ }
61
+ async fetchResult(subject,
62
+ // Note: wtf_wikipedia.fetch() does not accept an AbortSignal.
63
+ // The base class timeout/abort still applies to the overall lookup()
64
+ // call, but the underlying HTTP request cannot be cancelled mid-flight.
65
+ _signal) {
66
+ const baseTitle = subject.name.replace(/ /g, "_");
67
+ // Try the base title first
68
+ let doc = await this.fetchDocument(baseTitle);
69
+ // Handle disambiguation pages
70
+ if (this.handleDisambiguation && (!doc || this.isDisambig(doc))) {
71
+ doc = await this.tryDisambiguationSuffixes(baseTitle, doc);
72
+ }
73
+ // If we still have no valid document, return null
74
+ if (!doc || this.isDisambig(doc))
75
+ return null;
76
+ // Validate person if callback is provided.
77
+ // Track fullText so we can reuse it for asyncSectionFilter without recomputing.
78
+ let cachedFullText;
79
+ if (this.validatePerson) {
80
+ cachedFullText = this.getFullText(doc);
81
+ if (!(await this.validatePerson(cachedFullText, subject))) {
82
+ // Validation failed — try disambiguation suffixes if enabled
83
+ if (!this.handleDisambiguation)
84
+ return null;
85
+ const altDoc = await this.tryDisambiguationSuffixes(baseTitle, null);
86
+ if (!altDoc || this.isDisambig(altDoc))
87
+ return null;
88
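+ // Validate the alternate document too. NOTE(review): only the first existing suffix article is checked; if it fails validation, later suffixes are never tried.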
89
+ const altText = this.getFullText(altDoc);
90
+ if (!(await this.validatePerson(altText, subject)))
91
+ return null;
92
+ doc = altDoc;
93
+ cachedFullText = altText;
94
+ }
95
+ }
96
+ const sections = doc.sections();
97
+ if (sections.length === 0)
98
+ return null;
99
+ // Reuse cached full text from validation, or compute once for async filter
100
+ const fullText = this.asyncSectionFilter ? (cachedFullText ?? this.getFullText(doc)) : undefined;
101
+ // Map wtf sections to WikipediaSection interface
102
+ const wikiSections = sections.map((s, i) => ({
103
+ index: i,
104
+ title: s.title() || "Introduction",
105
+ depth: s.depth(),
106
+ }));
107
+ // Apply section filter — async takes precedence over sync
108
+ const selectedSections = this.asyncSectionFilter
109
+ ? await this.asyncSectionFilter(wikiSections, fullText)
110
+ : this.sectionFilter(wikiSections);
111
+ // Build the set of section indices to extract
112
+ const indicesToExtract = new Set(selectedSections.map((s) => s.index));
113
+ // Always include intro if configured and not already in the filter result
114
+ if (this.includeIntro && !indicesToExtract.has(0)) {
115
+ indicesToExtract.add(0);
116
+ }
117
+ // If nothing to extract (filter returned empty and intro disabled), return null
118
+ if (indicesToExtract.size === 0)
119
+ return null;
120
+ // Extract text from selected sections
121
+ const sectionTexts = [];
122
+ const extractedTitles = [];
123
+ // Sort indices for consistent output order (article order)
124
+ const sortedIndices = [...indicesToExtract].sort((a, b) => a - b);
125
+ for (const idx of sortedIndices) {
126
+ const section = sections[idx];
127
+ if (!section)
128
+ continue;
129
+ const title = section.title() || "Introduction";
130
+ const text = section.text({});
131
+ if (text && text.length >= MIN_SECTION_LENGTH) {
132
+ sectionTexts.push(`[${title}] ${text}`);
133
+ extractedTitles.push(title);
134
+ }
135
+ }
136
+ if (sectionTexts.length === 0)
137
+ return null;
138
+ const combinedText = sectionTexts.join("\n\n");
139
+ const resolvedTitle = doc.title() || baseTitle.replace(/_/g, " ");
140
+ const resolvedUrl = `https://en.wikipedia.org/wiki/${encodeURIComponent(resolvedTitle.replace(/ /g, "_"))}`;
141
+ // Calculate confidence based on content quality
142
+ const confidence = this.calculateContentConfidence(combinedText, subject, sectionTexts.length);
143
+ return {
144
+ text: combinedText,
145
+ confidence,
146
+ costUsd: 0,
147
+ url: resolvedUrl,
148
+ publication: "Wikipedia",
149
+ articleTitle: resolvedTitle,
150
+ metadata: {
151
+ sectionCount: sectionTexts.length,
152
+ sectionTitles: extractedTitles,
153
+ textLength: combinedText.length,
154
+ },
155
+ };
156
+ }
157
+ /**
158
+ * Build the search query for cache key generation.
159
+ * Includes option-derived key material so different WikipediaSource instances
160
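+ * with different option shapes (filter kind, includeIntro, validation, disambiguation) don't collide in cache. Note that all custom sync filters share the single `sections:custom` token.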
161
+ */
162
+ buildQuery(subject) {
163
+ const parts = [subject.name];
164
+ if (this.asyncSectionFilter)
165
+ parts.push("sections:async");
166
+ else if (this.sectionFilter !== defaultSectionFilter)
167
+ parts.push("sections:custom");
168
+ if (this.includeIntro === false)
169
+ parts.push("no-intro");
170
+ if (this.validatePerson)
171
+ parts.push("validate:person");
172
+ if (!this.handleDisambiguation)
173
+ parts.push("disambig:off");
174
+ if (this.handleDisambiguation && this.disambiguationSuffixes.length > 0) {
175
+ parts.push(`suffixes:${this.disambiguationSuffixes.join(",")}`);
176
+ }
177
+ return parts.join("|");
178
+ }
179
+ /**
180
+ * Fetch a Wikipedia document using wtf_wikipedia.
181
+ * Returns null if the article doesn't exist. Lets other errors propagate
182
+ * so BaseResearchSource.lookup() can record them via telemetry.
183
+ */
184
+ async fetchDocument(title) {
185
+ const doc = await wtf.fetch(title);
186
+ return doc ?? null;
187
+ }
188
+ /**
189
+ * Check if a document is a disambiguation page.
190
+ */
191
+ isDisambig(doc) {
192
+ return doc.isDisambiguation();
193
+ }
194
+ /**
195
+ * Try disambiguation suffixes to find a valid (non-disambiguation) article.
196
+ * Returns the first valid document found, or the provided fallback if none match.
197
+ */
198
+ async tryDisambiguationSuffixes(baseTitle, fallback) {
199
+ for (const suffix of this.disambiguationSuffixes) {
200
+ const altTitle = baseTitle + suffix;
201
+ const altDoc = await this.fetchDocument(altTitle);
202
+ if (altDoc && !this.isDisambig(altDoc)) {
203
+ return altDoc;
204
+ }
205
+ }
206
+ return fallback;
207
+ }
208
+ /**
209
+ * Extract full plaintext from a document for validation and async filtering.
210
+ */
211
+ getFullText(doc) {
212
+ const sections = doc.sections();
213
+ return sections.map((s) => s.text({})).join("\n\n");
214
+ }
215
+ /**
216
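+ * Calculate content confidence from text length, subject name presence, and extracted section count.
217
+ * Returns a score between 0.4 and 0.9, or -1 when `requiredKeywords` is configured (the signal for the base class to compute keyword-based confidence instead).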
218
+ */
219
+ calculateContentConfidence(text, subject, sectionCount) {
220
+ let confidence = 0.4;
221
+ // Name presence
222
+ if (text.toLowerCase().includes(subject.name.toLowerCase())) {
223
+ confidence += 0.1;
224
+ }
225
+ // Content length
226
+ if (text.length > 500) {
227
+ confidence += 0.2;
228
+ }
229
+ else if (text.length > 200) {
230
+ confidence += 0.1;
231
+ }
232
+ // If keywords are configured, delegate to the base class keyword-based
233
+ // confidence calculation instead of using our content heuristics.
234
+ // The base class checks for confidence === -1 as the delegation signal.
235
+ if (this.options.requiredKeywords && this.options.requiredKeywords.length > 0) {
236
+ return -1; // DELEGATE_TO_BASE_CLASS: base-source.ts:150 replaces with keyword confidence
237
+ }
238
+ // Section count bonus — use actual extracted section count, not regex on text
239
+ if (sectionCount > 1) {
240
+ confidence += Math.min(0.2, sectionCount * 0.05);
241
+ }
242
+ return Math.min(0.9, confidence);
243
+ }
244
+ }
245
+ // ============================================================================
246
+ // Factory Function
247
+ // ============================================================================
248
+ /**
249
+ * Create a Wikipedia source instance.
250
+ *
251
+ * @example
252
+ * ```typescript
253
+ * // Default: all sections
254
+ * const source = wikipedia()
255
+ *
256
+ * // Death research: only death-related sections
257
+ * const deathSource = wikipedia({
258
+ * sectionFilter: (sections) => sections.filter(s =>
259
+ * /death|illness|health|assassination|final years/i.test(s.title)
260
+ * ),
261
+ * })
262
+ *
263
+ * // Biography research: personal life sections
264
+ * const bioSource = wikipedia({
265
+ * sectionFilter: (sections) => sections.filter(s =>
266
+ * /early life|personal|childhood|education|family/i.test(s.title)
267
+ * ),
268
+ * includeIntro: true,
269
+ * })
270
+ * ```
271
+ */
272
+ export function wikipedia(options) {
273
+ return new WikipediaSource(options);
274
+ }
275
+ //# sourceMappingURL=wikipedia.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"wikipedia.js","sourceRoot":"","sources":["../../src/structured/wikipedia.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,GAAG,MAAM,eAAe,CAAA;AAE/B,OAAO,EACL,kBAAkB,EAClB,eAAe,GAIhB,MAAM,iBAAiB,CAAA;AAExB,+EAA+E;AAC/E,YAAY;AACZ,+EAA+E;AAE/E,MAAM,kBAAkB,GAAG,EAAE,CAAA;AA6G7B,+EAA+E;AAC/E,yBAAyB;AACzB,+EAA+E;AAE/E;;GAEG;AACH,SAAS,oBAAoB,CAAC,QAA4B;IACxD,OAAO,QAAQ,CAAA;AACjB,CAAC;AAED,+EAA+E;AAC/E,wBAAwB;AACxB,+EAA+E;AAE/E;;;;;GAKG;AACH,MAAM,OAAO,eAAgB,SAAQ,kBAAmC;IAC7D,IAAI,GAAG,WAAW,CAAA;IAClB,IAAI,GAAG,WAAW,CAAA;IAClB,eAAe,GAAG,eAAe,CAAC,qBAAqB,CAAA;IACvD,MAAM,GAAG,kBAAkB,CAAA;IAC3B,MAAM,GAAG,IAAI,CAAA;IACb,qBAAqB,GAAG,CAAC,CAAA;IAE1B,aAAa,CAAe;IAC5B,kBAAkB,CAAqB;IACvC,YAAY,CAAS;IACrB,oBAAoB,CAAS;IAC7B,sBAAsB,CAAU;IAChC,cAAc,CAGS;IAE/B,YAAY,UAA4B,EAAE;QACxC,KAAK,CAAC,EAAE,WAAW,EAAE,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC,CAAA;QACvC,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,aAAa,IAAI,oBAAoB,CAAA;QAClE,IAAI,CAAC,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,CAAA;QACpD,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,IAAI,CAAA;QAChD,IAAI,CAAC,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,IAAI,IAAI,CAAA;QAChE,IAAI,CAAC,sBAAsB,GAAG,OAAO,CAAC,sBAAsB,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC,CAAA;QAC1F,IAAI,CAAC,cAAc,GAAG,OAAO,CAAC,cAAc,CAAA;IAC9C,CAAC;IAES,KAAK,CAAC,WAAW,CACzB,OAAwB;IACxB,8DAA8D;IAC9D,qEAAqE;IACrE,wEAAwE;IACxE,OAAoB;QAEpB,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;QAEjD,2BAA2B;QAC3B,IAAI,GAAG,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,CAAA;QAE7C,8BAA8B;QAC9B,IAAI,IAAI,CAAC,oBAAoB,IAAI,CAAC,CAAC,GAAG,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChE,GAAG,GAAG,MAAM,IAAI,CAAC,yBAAyB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;QAC5D,CAAC;QAED,kDAAkD;QAClD,IAAI,CAAC,GAAG,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAA;QAE7C,2CAA2C;QAC3C,gFAAgF;QAChF,IAAI,cAAkC,CAAA;QACtC,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACxB,cAAc,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAA;YACtC,IAAI,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,EAAE,CAAC;gBAC1D,6DAA6D;gBAC7D,IAAI,CAAC,IAAI,CAAC,oBAAoB;oB
AAE,OAAO,IAAI,CAAA;gBAC3C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,yBAAyB,CAAC,SAAS,EAAE,IAAI,CAAC,CAAA;gBACpE,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;oBAAE,OAAO,IAAI,CAAA;gBACnD,sCAAsC;gBACtC,MAAM,OAAO,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;gBACxC,IAAI,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;oBAAE,OAAO,IAAI,CAAA;gBAC/D,GAAG,GAAG,MAAM,CAAA;gBACZ,cAAc,GAAG,OAAO,CAAA;YAC1B,CAAC;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAmB,CAAA;QAChD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAEtC,2EAA2E;QAC3E,MAAM,QAAQ,GAAG,IAAI,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,cAAc,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;QAEhG,iDAAiD;QACjD,MAAM,YAAY,GAAuB,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAc,EAAE,CAAS,EAAE,EAAE,CAAC,CAAC;YACpF,KAAK,EAAE,CAAC;YACR,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,IAAI,cAAc;YAClC,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE;SACjB,CAAC,CAAC,CAAA;QAEH,0DAA0D;QAC1D,MAAM,gBAAgB,GAAG,IAAI,CAAC,kBAAkB;YAC9C,CAAC,CAAC,MAAM,IAAI,CAAC,kBAAkB,CAAC,YAAY,EAAE,QAAS,CAAC;YACxD,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,YAAY,CAAC,CAAA;QAEpC,8CAA8C;QAC9C,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAA;QAEtE,0EAA0E;QAC1E,IAAI,IAAI,CAAC,YAAY,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAClD,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QACzB,CAAC;QAED,gFAAgF;QAChF,IAAI,gBAAgB,CAAC,IAAI,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAE5C,sCAAsC;QACtC,MAAM,YAAY,GAAa,EAAE,CAAA;QACjC,MAAM,eAAe,GAAa,EAAE,CAAA;QAEpC,2DAA2D;QAC3D,MAAM,aAAa,GAAG,CAAC,GAAG,gBAAgB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;QAEjE,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;YAChC,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAA4B,CAAA;YACxD,IAAI,CAAC,OAAO;gBAAE,SAAQ;YAEtB,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,EAAE,IAAI,cAAc,CAAA;YAC/C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YAE7B,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,IAAI,kBAAkB,EAAE,CAAC;gBAC9C,YAAY,CAAC,IAAI,CAAC,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC,CAAA;gBACvC,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;YAC7B,CAAC;QACH,CAAC;
QAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAE1C,MAAM,YAAY,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9C,MAAM,aAAa,GAAG,GAAG,CAAC,KAAK,EAAE,IAAI,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;QACjE,MAAM,WAAW,GAAG,iCAAiC,kBAAkB,CAAC,aAAa,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,EAAE,CAAA;QAE3G,gDAAgD;QAChD,MAAM,UAAU,GAAG,IAAI,CAAC,0BAA0B,CAAC,YAAY,EAAE,OAAO,EAAE,YAAY,CAAC,MAAM,CAAC,CAAA;QAE9F,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,UAAU;YACV,OAAO,EAAE,CAAC;YACV,GAAG,EAAE,WAAW;YAChB,WAAW,EAAE,WAAW;YACxB,YAAY,EAAE,aAAa;YAC3B,QAAQ,EAAE;gBACR,YAAY,EAAE,YAAY,CAAC,MAAM;gBACjC,aAAa,EAAE,eAAe;gBAC9B,UAAU,EAAE,YAAY,CAAC,MAAM;aAChC;SACF,CAAA;IACH,CAAC;IAED;;;;OAIG;IACM,UAAU,CAAC,OAAwB;QAC1C,MAAM,KAAK,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QAC5B,IAAI,IAAI,CAAC,kBAAkB;YAAE,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAA;aACpD,IAAI,IAAI,CAAC,aAAa,KAAK,oBAAoB;YAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;QACnF,IAAI,IAAI,CAAC,YAAY,KAAK,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACvD,IAAI,IAAI,CAAC,cAAc;YAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;QACtD,IAAI,CAAC,IAAI,CAAC,oBAAoB;YAAE,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAA;QAC1D,IAAI,IAAI,CAAC,oBAAoB,IAAI,IAAI,CAAC,sBAAsB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxE,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,sBAAsB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;QACjE,CAAC;QACD,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACxB,CAAC;IAED;;;;OAIG;IACK,KAAK,CAAC,aAAa,CAAC,KAAa;QACvC,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;QAClC,OAAQ,GAAgD,IAAI,IAAI,CAAA;IAClE,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,GAAsC;QACvD,OAAO,GAAG,CAAC,gBAAgB,EAAE,CAAA;IAC/B,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,yBAAyB,CACrC,SAAiB,EACjB,QAAkD;QAElD,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,sBAAsB,EAAE,CAAC;YACjD,MAAM,QAAQ,GAAG,SAAS,GAAG,MAAM,CAAA;YACnC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAA;YACjD,IAAI,MAAM,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;gBACvC,OAAO,MAAM,CAAA;YACf,CAAC;QACH,CAAC;QACD,OAAO,QAAQ,CAAA;IACjB,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,GAAsC;QACxD,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAmB,CAAA;QAChD
,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAc,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClE,CAAC;IAED;;;OAGG;IACK,0BAA0B,CAChC,IAAY,EACZ,OAAwB,EACxB,YAAoB;QAEpB,IAAI,UAAU,GAAG,GAAG,CAAA;QAEpB,gBAAgB;QAChB,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YAC5D,UAAU,IAAI,GAAG,CAAA;QACnB,CAAC;QAED,iBAAiB;QACjB,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACtB,UAAU,IAAI,GAAG,CAAA;QACnB,CAAC;aAAM,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC7B,UAAU,IAAI,GAAG,CAAA;QACnB,CAAC;QAED,uEAAuE;QACvE,kEAAkE;QAClE,wEAAwE;QACxE,IAAI,IAAI,CAAC,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9E,OAAO,CAAC,CAAC,CAAA,CAAC,8EAA8E;QAC1F,CAAC;QAED,8EAA8E;QAC9E,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;YACrB,UAAU,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,YAAY,GAAG,IAAI,CAAC,CAAA;QAClD,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,UAAU,CAAC,CAAA;IAClC,CAAC;CACF;AAED,+EAA+E;AAC/E,mBAAmB;AACnB,+EAA+E;AAE/E;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,UAAU,SAAS,CAAC,OAA0B;IAClD,OAAO,IAAI,eAAe,CAAC,OAAO,CAAC,CAAA;AACrC,CAAC"}
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Abstract base class for web search sources using the template method pattern.
3
+ *
4
+ * Subclasses only implement `performSearch()` — the base class handles the
5
+ * full pipeline: search → score/rank links → fetch pages → extract content →
6
+ * sanitize → combine with attribution.
7
+ *
8
+ * Used by Google, Bing, Brave, and DuckDuckGo search sources.
9
+ */
10
+ import { BaseResearchSource, type BaseSourceOptions, type ResearchSubject, type RawFinding } from "@debriefer/core";
11
+ /** A single result from a web search engine. This is the element type returned by `performSearch()` and passed to/returned from the `linkSelector` option. */
12
+ export interface WebSearchResult {
13
+ url: string; // NOTE(review): presumably the result page URL checked by `isDomainBlocked(url)` and fetched when followed — confirm in base.ts
14
+ title: string; // matched together with `snippet` against boost/penalty keywords (see the `scoreAndRank` doc)
15
+ snippet: string; // matched together with `title` against boost/penalty keywords (see the `scoreAndRank` doc)
16
+ }
17
+ /** Options for scoring and filtering search result links. Every field is optional. */
18
+ export interface LinkSelectionOptions {
19
+ /** Domain → 0-100 score. Adds to link score when hostname matches. */
20
+ domainScores?: Record<string, number>;
21
+ /** Keywords that boost a result's score when found in title+snippet. */
22
+ boostKeywords?: Array<{
23
+ keyword: string;
24
+ boost: number; // added to the result's score for each matching keyword (per the `scoreAndRank` doc)
25
+ }>;
26
+ /** Keywords that penalize a result's score when found in title+snippet. */
27
+ penaltyKeywords?: Array<{
28
+ keyword: string;
29
+ penalty: number; // subtracted from the result's score for each matching keyword (per the `scoreAndRank` doc)
30
+ }>;
31
+ /** Domains to completely exclude from results. */
32
+ blockedDomains?: string[];
33
+ }
34
+ /** Options for WebSearchBase sources, combining base source and link selection options. Every field declared here is optional. */
35
+ export interface WebSearchOptions extends BaseSourceOptions, LinkSelectionOptions {
36
+ /** Maximum number of search result pages to fetch. Default: 3. */
37
+ maxLinksToFollow?: number;
38
+ /** Minimum extracted text length in characters. Pages below this are filtered. Default: 200. */
39
+ minContentLength?: number;
40
+ /**
41
+ * Maximum cost in USD for link following per subject. When set, the source
42
+ * tracks cumulative fetch cost and stops following links when the budget
43
+ * is exhausted. Default: unlimited.
44
+ */
45
+ maxLinkCost?: number;
46
+ /**
47
+ * Custom link selector that filters/reorders search results before fetching.
48
+ * Receives ranked results and the subject, returns the results to follow.
49
+ * Useful for AI-assisted link selection (e.g., Claude ranking URLs by relevance).
50
+ * Applied after scoring/ranking but before the maxLinksToFollow limit. May return the array directly or a Promise resolving to it.
51
+ */
52
+ linkSelector?: (results: WebSearchResult[], subject: ResearchSubject) => Promise<WebSearchResult[]> | WebSearchResult[];
53
+ /**
54
+ * Custom page fetcher that replaces the default fetch+readability pipeline.
55
+ * Useful for browser-based fetching (Playwright) or sites requiring
56
+ * authentication/fingerprinting. Returns extracted text or null on failure. Called with the page URL and an AbortSignal.
57
+ */
58
+ fetchPage?: (url: string, signal: AbortSignal) => Promise<string | null>;
59
+ }
60
+ /**
61
+ * Abstract base class for web search sources.
62
+ *
63
+ * Implements the template method pattern: subclasses provide `performSearch()`
64
+ * and this class handles the rest of the pipeline (scoring, fetching,
65
+ * extracting, sanitizing, combining).
66
+ */
67
+ export declare abstract class WebSearchBase extends BaseResearchSource<ResearchSubject> {
68
+ protected readonly maxLinksToFollow: number; // NOTE(review): presumably the resolved `WebSearchOptions.maxLinksToFollow` (documented default 3) — confirm in base.ts
69
+ protected readonly minContentLength: number; // NOTE(review): presumably the resolved `WebSearchOptions.minContentLength` (documented default 200) — confirm in base.ts
70
+ protected readonly domainScores: Record<string, number>; // non-optional here, unlike the optional `LinkSelectionOptions.domainScores`
71
+ protected readonly boostKeywords: Array<{
72
+ keyword: string;
73
+ boost: number;
74
+ }>;
75
+ protected readonly penaltyKeywords: Array<{
76
+ keyword: string;
77
+ penalty: number;
78
+ }>;
79
+ protected readonly blockedDomains: string[]; // non-optional here, unlike the optional `LinkSelectionOptions.blockedDomains`
80
+ private readonly maxLinkCost?; // private member: its type is not emitted in this declaration file
81
+ private readonly linkSelector?; // private member: its type is not emitted in this declaration file
82
+ private readonly customFetchPage?; // NOTE(review): presumably stores the `fetchPage` option — confirm in base.ts
83
+ constructor(options?: WebSearchOptions); // `options` may be omitted entirely
84
+ /**
85
+ * Subclass-specific search API call. Returns raw search results.
86
+ *
87
+ * @param query - The search query string
88
+ * @param signal - Abort signal for cancellation
89
+ * @returns Array of search results (URL, title, snippet)
90
+ */
91
+ protected abstract performSearch(query: string, signal: AbortSignal): Promise<WebSearchResult[]>;
92
+ /**
93
+ * Full search pipeline: search → score → fetch → extract → combine.
94
+ *
95
+ * @param subject - The research subject
96
+ * @param signal - Abort signal for cancellation
97
+ * @returns RawFinding with combined text, or null if no content extracted
98
+ */
99
+ protected fetchResult(subject: ResearchSubject, signal: AbortSignal): Promise<RawFinding | null>;
100
+ /**
101
+ * Score and rank search results by relevance.
102
+ *
103
+ * Scoring:
104
+ * - Base: 50 - index (preserves search engine ordering)
105
+ * - + domainScores[domain] if hostname matches
106
+ * - + boost for each boostKeyword found in title+snippet
107
+ * - - penalty for each penaltyKeyword found in title+snippet
108
+ */
109
+ private scoreAndRank; // private member: its signature is not emitted in this declaration file
110
+ /**
111
+ * Check whether a URL should be excluded (blocked domain or unsafe URL).
112
+ * Blocks: non-http(s) schemes, localhost, private IP ranges, and user-specified domains.
113
+ */
114
+ protected isDomainBlocked(url: string): boolean;
115
+ /**
116
+ * Check if a hostname matches a domain (exact or subdomain match).
117
+ * Normalizes domain to lowercase since URL.hostname is always lowercase.
118
+ *
119
+ * "www.example.com" matches "example.com"
120
+ * "sub.example.com" matches "example.com"
121
+ * "example.com" matches "example.com"
122
+ * "notexample.com" does NOT match "example.com"
123
+ */
124
+ private hostnameMatchesDomain; // private member: its signature is not emitted in this declaration file
125
+ /** Check if hostname is in the 172.16.0.0–172.31.255.255 private range (RFC 1918). */
126
+ private isPrivate172; // private member: its signature is not emitted in this declaration file
127
+ }
128
+ //# sourceMappingURL=base.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"base.d.ts","sourceRoot":"","sources":["../../src/web-search/base.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,kBAAkB,EAClB,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,UAAU,EAChB,MAAM,iBAAiB,CAAA;AASxB,gDAAgD;AAChD,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAA;IACX,KAAK,EAAE,MAAM,CAAA;IACb,OAAO,EAAE,MAAM,CAAA;CAChB;AAED,6DAA6D;AAC7D,MAAM,WAAW,oBAAoB;IACnC,sEAAsE;IACtE,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACrC,wEAAwE;IACxE,aAAa,CAAC,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IACzD,2EAA2E;IAC3E,eAAe,CAAC,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAC7D,kDAAkD;IAClD,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;CAC1B;AAED,2FAA2F;AAC3F,MAAM,WAAW,gBAAiB,SAAQ,iBAAiB,EAAE,oBAAoB;IAC/E,kEAAkE;IAClE,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB,gGAAgG;IAChG,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB;;;;;OAKG;IACH,YAAY,CAAC,EAAE,CACb,OAAO,EAAE,eAAe,EAAE,EAC1B,OAAO,EAAE,eAAe,KACrB,OAAO,CAAC,eAAe,EAAE,CAAC,GAAG,eAAe,EAAE,CAAA;IACnD;;;;OAIG;IACH,SAAS,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,KAAK,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAAA;CACzE;AAeD;;;;;;GAMG;AACH,8BAAsB,aAAc,SAAQ,kBAAkB,CAAC,eAAe,CAAC;IAC7E,SAAS,CAAC,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAA;IAC3C,SAAS,CAAC,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAA;IAC3C,SAAS,CAAC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACvD,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAC3E,SAAS,CAAC,QAAQ,CAAC,eAAe,EAAE,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAC/E,SAAS,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,EAAE,CAAA;IAC3C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAQ;IACrC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAkC;IAChE,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAA+B;gBAEpD,OAAO,GAAE,gBAAqB;IAa1C;;;;;;OAMG;IACH,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAEhG;;;;;;OAMG;cACa,WAAW,CACzB,OAAO,EAAE,eAAe,EACxB,MAAM,EAAE,WAAW,
GAClB,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IA4G7B;;;;;;;;OAQG;IACH,OAAO,CAAC,YAAY;IA2CpB;;;OAGG;IACH,SAAS,CAAC,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IA0C/C;;;;;;;;OAQG;IACH,OAAO,CAAC,qBAAqB;IAK7B,sFAAsF;IACtF,OAAO,CAAC,YAAY;CAKrB"}