llm-search-tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +244 -0
  3. package/dist/index.d.ts +18 -0
  4. package/dist/index.js +40 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/integration.test.d.ts +1 -0
  7. package/dist/integration.test.js +237 -0
  8. package/dist/modules/answerbox.test.d.ts +1 -0
  9. package/dist/modules/answerbox.test.js +105 -0
  10. package/dist/modules/autocomplete.d.ts +11 -0
  11. package/dist/modules/autocomplete.js +159 -0
  12. package/dist/modules/autocomplete.test.d.ts +1 -0
  13. package/dist/modules/autocomplete.test.js +188 -0
  14. package/dist/modules/common.d.ts +26 -0
  15. package/dist/modules/common.js +263 -0
  16. package/dist/modules/common.test.d.ts +1 -0
  17. package/dist/modules/common.test.js +87 -0
  18. package/dist/modules/crawl.d.ts +9 -0
  19. package/dist/modules/crawl.js +117 -0
  20. package/dist/modules/crawl.test.d.ts +1 -0
  21. package/dist/modules/crawl.test.js +48 -0
  22. package/dist/modules/events.d.ts +8 -0
  23. package/dist/modules/events.js +129 -0
  24. package/dist/modules/events.test.d.ts +1 -0
  25. package/dist/modules/events.test.js +104 -0
  26. package/dist/modules/finance.d.ts +10 -0
  27. package/dist/modules/finance.js +20 -0
  28. package/dist/modules/finance.test.d.ts +1 -0
  29. package/dist/modules/finance.test.js +77 -0
  30. package/dist/modules/flights.d.ts +8 -0
  31. package/dist/modules/flights.js +135 -0
  32. package/dist/modules/flights.test.d.ts +1 -0
  33. package/dist/modules/flights.test.js +128 -0
  34. package/dist/modules/hackernews.d.ts +8 -0
  35. package/dist/modules/hackernews.js +87 -0
  36. package/dist/modules/hackernews.js.map +1 -0
  37. package/dist/modules/images.test.d.ts +1 -0
  38. package/dist/modules/images.test.js +145 -0
  39. package/dist/modules/integrations.test.d.ts +1 -0
  40. package/dist/modules/integrations.test.js +93 -0
  41. package/dist/modules/media.d.ts +11 -0
  42. package/dist/modules/media.js +132 -0
  43. package/dist/modules/media.test.d.ts +1 -0
  44. package/dist/modules/media.test.js +186 -0
  45. package/dist/modules/news.d.ts +3 -0
  46. package/dist/modules/news.js +39 -0
  47. package/dist/modules/news.test.d.ts +1 -0
  48. package/dist/modules/news.test.js +88 -0
  49. package/dist/modules/parser.d.ts +19 -0
  50. package/dist/modules/parser.js +361 -0
  51. package/dist/modules/parser.test.d.ts +1 -0
  52. package/dist/modules/parser.test.js +151 -0
  53. package/dist/modules/reddit.d.ts +21 -0
  54. package/dist/modules/reddit.js +107 -0
  55. package/dist/modules/scrape.d.ts +16 -0
  56. package/dist/modules/scrape.js +272 -0
  57. package/dist/modules/scrape.test.d.ts +1 -0
  58. package/dist/modules/scrape.test.js +232 -0
  59. package/dist/modules/scraper.d.ts +12 -0
  60. package/dist/modules/scraper.js +640 -0
  61. package/dist/modules/scrapers/anidb.d.ts +8 -0
  62. package/dist/modules/scrapers/anidb.js +156 -0
  63. package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
  64. package/dist/modules/scrapers/duckduckgo.js +284 -0
  65. package/dist/modules/scrapers/google-news.d.ts +2 -0
  66. package/dist/modules/scrapers/google-news.js +60 -0
  67. package/dist/modules/scrapers/google.d.ts +6 -0
  68. package/dist/modules/scrapers/google.js +211 -0
  69. package/dist/modules/scrapers/searxng.d.ts +2 -0
  70. package/dist/modules/scrapers/searxng.js +93 -0
  71. package/dist/modules/scrapers/thetvdb.d.ts +3 -0
  72. package/dist/modules/scrapers/thetvdb.js +147 -0
  73. package/dist/modules/scrapers/tmdb.d.ts +3 -0
  74. package/dist/modules/scrapers/tmdb.js +172 -0
  75. package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
  76. package/dist/modules/scrapers/yahoo-finance.js +33 -0
  77. package/dist/modules/search.d.ts +5 -0
  78. package/dist/modules/search.js +45 -0
  79. package/dist/modules/search.js.map +1 -0
  80. package/dist/modules/search.test.d.ts +1 -0
  81. package/dist/modules/search.test.js +219 -0
  82. package/dist/modules/urbandictionary.d.ts +12 -0
  83. package/dist/modules/urbandictionary.js +26 -0
  84. package/dist/modules/webpage.d.ts +4 -0
  85. package/dist/modules/webpage.js +150 -0
  86. package/dist/modules/webpage.js.map +1 -0
  87. package/dist/modules/wikipedia.d.ts +5 -0
  88. package/dist/modules/wikipedia.js +85 -0
  89. package/dist/modules/wikipedia.js.map +1 -0
  90. package/dist/scripts/interactive-search.d.ts +1 -0
  91. package/dist/scripts/interactive-search.js +98 -0
  92. package/dist/test.d.ts +1 -0
  93. package/dist/test.js +179 -0
  94. package/dist/test.js.map +1 -0
  95. package/dist/testBraveSearch.d.ts +1 -0
  96. package/dist/testBraveSearch.js +34 -0
  97. package/dist/testDuckDuckGo.d.ts +1 -0
  98. package/dist/testDuckDuckGo.js +52 -0
  99. package/dist/testEcosia.d.ts +1 -0
  100. package/dist/testEcosia.js +57 -0
  101. package/dist/testSearchModule.d.ts +1 -0
  102. package/dist/testSearchModule.js +95 -0
  103. package/dist/testwebpage.d.ts +1 -0
  104. package/dist/testwebpage.js +81 -0
  105. package/dist/types.d.ts +174 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/createTestDocx.d.ts +1 -0
  109. package/dist/utils/createTestDocx.js +58 -0
  110. package/dist/utils/htmlcleaner.d.ts +20 -0
  111. package/dist/utils/htmlcleaner.js +172 -0
  112. package/docs/README.md +275 -0
  113. package/docs/autocomplete.md +73 -0
  114. package/docs/crawling.md +88 -0
  115. package/docs/events.md +58 -0
  116. package/docs/examples.md +158 -0
  117. package/docs/finance.md +60 -0
  118. package/docs/flights.md +71 -0
  119. package/docs/hackernews.md +121 -0
  120. package/docs/media.md +87 -0
  121. package/docs/news.md +75 -0
  122. package/docs/parser.md +197 -0
  123. package/docs/scraper.md +347 -0
  124. package/docs/search.md +106 -0
  125. package/docs/wikipedia.md +91 -0
  126. package/package.json +97 -0
@@ -0,0 +1,272 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.normalizeContent = normalizeContent;
7
+ exports.getWebpageContent = getWebpageContent;
8
+ exports.getWebpageText = getWebpageText;
9
+ const readability_1 = require("@mozilla/readability");
10
+ const jsdom_1 = require("jsdom");
11
+ const turndown_1 = __importDefault(require("turndown"));
12
+ const wikipedia_1 = require("./wikipedia");
13
+ const hackernews_1 = require("./hackernews");
14
+ const google_1 = require("./scrapers/google");
15
+ const duckduckgo_1 = require("./scrapers/duckduckgo");
16
+ const common_1 = require("./common");
17
// Configure TurndownService once for better performance
// Shared, module-level HTML -> Markdown converter; constructed a single time
// at module load so every normalizeContent() call reuses the same instance.
const turndownService = new turndown_1.default({
    headingStyle: "atx", // "# Heading" prefix style rather than setext underlines
    codeBlockStyle: "fenced", // ``` fenced blocks rather than 4-space indentation
});
22
// check url type and get appropriate handler
/**
 * Classify a URL so getWebpageContent can route it to the right handler.
 *
 * @param url Candidate URL string (may be malformed).
 * @returns One of "wikipedia" | "hackernews" | "google-search" |
 *          "duckduckgo-search" | "unsupported" | "general".
 *          Malformed URLs classify as "unsupported".
 */
function getUrlType(url) {
    // True when hostname is exactly `domain` or a subdomain of it.
    // Exact/suffix matching (instead of String.includes) prevents false
    // positives such as "netflix.com" matching the blocked domain "x.com",
    // or "wikipedia.org.evil.com" being routed to the Wikipedia handler.
    const matchesDomain = (hostname, domain) =>
        hostname === domain || hostname.endsWith("." + domain);
    try {
        const urlObj = new URL(url);
        const hostname = urlObj.hostname;
        if (matchesDomain(hostname, "wikipedia.org")) {
            return "wikipedia";
        }
        if (hostname === "news.ycombinator.com" && url.includes("item?id=")) {
            return "hackernews";
        }
        // Google uses many ccTLDs (google.com, google.co.uk, ...), so match
        // any "google." label boundary rather than a fixed domain.
        if (/(^|\.)google\./.test(hostname) && urlObj.pathname.startsWith("/search")) {
            return "google-search";
        }
        if (matchesDomain(hostname, "duckduckgo.com") && (urlObj.searchParams.has("q") || url.includes("?q="))) {
            return "duckduckgo-search";
        }
        // list of domains that don't work well with readability
        const unsupported = [
            "youtube.com",
            "youtu.be",
            "vimeo.com",
            "twitter.com",
            "x.com",
            "instagram.com",
            "facebook.com",
            "linkedin.com",
        ];
        if (unsupported.some((domain) => matchesDomain(hostname, domain))) {
            return "unsupported";
        }
        return "general";
    }
    catch {
        // new URL() threw: not a parseable absolute URL.
        return "unsupported";
    }
}
59
// Helper to extract images from DOM
// Collects the src of every <img> element. When the JSDOM was constructed
// with the `url` option, relative srcs have already been resolved to
// absolute URLs, so the values are usable as-is.
function extractImages(doc) {
    const urls = [];
    for (const image of doc.querySelectorAll("img")) {
        // Skip images with a missing or empty src.
        if (image.src) {
            urls.push(image.src);
        }
    }
    return urls;
}
71
// Helper to extract favicon from DOM
// Prefers an explicitly declared icon (<link rel="icon">, "shortcut icon",
// "apple-touch-icon", ...); otherwise falls back to the conventional
// /favicon.ico location relative to the page URL. Returns undefined when
// even that URL cannot be constructed.
function extractFavicon(doc, url) {
    const declared = doc.querySelectorAll("link[rel*='icon']");
    if (declared.length) {
        return declared[0].href;
    }
    try {
        return new URL("/favicon.ico", url).href;
    }
    catch {
        return undefined;
    }
}
84
/**
 * Normalize raw HTML into the single content shape shared by every handler
 * (Wikipedia, Hacker News, answer boxes, general pages).
 *
 * @param params.url              Page URL (used for metadata and relative-URL resolution).
 * @param params.html             Raw HTML to process.
 * @param params.title            Optional title override.
 * @param params.siteName         Optional site-name override.
 * @param params.fallbackFavicon  Favicon used when none is found in the DOM.
 * @param params.skipReadability  Skip Readability for content that is already clean.
 * @returns { title, content, textContent, length, excerpt, siteName,
 *            favicon, imageUrls, markdown, rawHtml }
 */
function normalizeContent(params) {
    const { url, html, title, siteName, fallbackFavicon, skipReadability } = params;
    const dom = new jsdom_1.JSDOM(html, { url });
    const doc = dom.window.document;
    // Extract metadata from the raw DOM first (Readability may mutate it).
    const favicon = extractFavicon(doc, url) || fallbackFavicon;
    const imageUrls = extractImages(doc);
    let finalContent = html;
    let finalTitle = title || url;
    let textContent = "";
    let excerpt;
    let finalSiteName = siteName;
    // Raw-DOM text extraction, shared by the "skip Readability" path and the
    // "Readability failed" fallback (previously duplicated in both branches).
    const useRawText = () => {
        textContent = (0, common_1.cleanText)(doc.body?.textContent || "");
        excerpt = textContent.slice(0, 200) + (textContent.length > 200 ? "..." : "");
    };
    if (skipReadability) {
        // Caller asserts the HTML is already clean (e.g. Wikipedia API output).
        useRawText();
    }
    else {
        const article = new readability_1.Readability(doc).parse();
        if (article) {
            finalContent = article.content || html;
            finalTitle = article.title || title || url;
            textContent = (0, common_1.cleanText)(article.textContent || "");
            excerpt = article.excerpt || undefined;
            finalSiteName = siteName || article.siteName || undefined;
        }
        else {
            // Readability could not parse the page; fall back to raw extraction.
            useRawText();
        }
    }
    const markdown = turndownService.turndown(finalContent);
    return {
        title: finalTitle,
        content: finalContent,
        textContent,
        length: textContent.length,
        excerpt,
        siteName: finalSiteName,
        favicon,
        imageUrls,
        markdown,
        rawHtml: html,
    };
}
132
// get webpage content using readability with stealth puppeteer
/**
 * Fetch a URL and return normalized webpage content.
 *
 * Routing (via getUrlType):
 *  - Wikipedia articles   -> Wikipedia API (content already clean)
 *  - Hacker News items    -> HN API snippet
 *  - Google/DDG searches  -> answer-box extraction when one is present
 *  - known JS-only sites  -> placeholder "unsupported" content
 *  - everything else      -> plain fetch (or stealth Puppeteer) + Readability
 *
 * @param url     Page URL.
 * @param options Options object, or a bare boolean (legacy) meaning usePuppeteer.
 * @throws Error with `code: "WEBPAGE_ERROR"` and the underlying cause on
 *         `originalError` (kept field-compatible with the old plain-object throw).
 */
async function getWebpageContent(url, options) {
    // Backward compatibility: if options is boolean, treat as usePuppeteer
    if (typeof options === "boolean") {
        options = { usePuppeteer: options };
    }
    else if (!options) {
        options = {};
    }
    // Cast options to the correct type for internal use
    const opts = options;
    try {
        const urlType = getUrlType(url);
        // --- Sources with their own clean-content APIs ---
        if (urlType === "wikipedia") {
            // The article title is the /wiki/<Title> path segment.
            const title = url.split("/wiki/")[1]?.replace(/_/g, " ") || url;
            const html = await (0, wikipedia_1.wikiGetContent)(title);
            return normalizeContent({
                url,
                html,
                title,
                siteName: "Wikipedia",
                fallbackFavicon: "https://en.wikipedia.org/static/favicon/wikipedia.ico",
                skipReadability: true, // Wikipedia content is already clean from their API
            });
        }
        if (urlType === "hackernews") {
            const idStr = url.split("id=")[1];
            const id = parseInt(idStr, 10); // explicit radix: decimal item ids only
            const story = await (0, hackernews_1.getStoryById)(id);
            const html = story.snippet || story.title || "No content available";
            return normalizeContent({
                url,
                html,
                title: story.title || url,
                siteName: "Hacker News",
                fallbackFavicon: "https://news.ycombinator.com/favicon.ico",
                skipReadability: true, // HN snippets are already clean
            });
        }
        if (urlType === "unsupported") {
            return normalizeContent({
                url,
                html: "<html><body><p>This URL type is not supported for content extraction.</p></body></html>",
                title: url,
                skipReadability: true,
            });
        }
        // --- General case: obtain raw HTML ---
        let html;
        if (opts.usePuppeteer) {
            // Use stealth puppeteer for bot-protected sites
            const proxy = (0, common_1.parseProxyConfig)(opts.proxy);
            const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
            try {
                const pages = await browser.pages();
                const page = pages.length > 0 ? pages[0] : await browser.newPage();
                await page.setViewport({ width: 1920, height: 1080 });
                await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
                await page.goto(url, { waitUntil: "networkidle2" });
                html = await page.content();
            }
            finally {
                // Always release the browser, even when navigation throws.
                await browser.close();
            }
        }
        else {
            try {
                const headers = (0, common_1.createRealisticHeaders)();
                const fetchOptions = { headers };
                const proxy = (0, common_1.parseProxyConfig)(opts.proxy);
                if (proxy) {
                    // Lazy-load the agent matching the proxy protocol.
                    if (proxy.type === "socks4" || proxy.type === "socks5") {
                        const { SocksProxyAgent } = await import("socks-proxy-agent");
                        fetchOptions.agent = new SocksProxyAgent(proxy.url);
                    }
                    else {
                        const { HttpsProxyAgent } = await import("https-proxy-agent");
                        fetchOptions.agent = new HttpsProxyAgent(proxy.url);
                    }
                }
                const response = await fetch(url, fetchOptions);
                html = await response.text();
                if ((0, common_1.detectBotProtection)(response.headers, html)) {
                    // If bot protection detected, re-run with puppeteer
                    return await getWebpageContent(url, { ...opts, usePuppeteer: true });
                }
            }
            catch {
                // If basic fetch fails, try with puppeteer
                return await getWebpageContent(url, { ...opts, usePuppeteer: true });
            }
        }
        // --- Search engines: prefer the answer box when present ---
        if (urlType === "google-search") {
            const dom = new jsdom_1.JSDOM(html);
            const answer = (0, google_1.extractAnswerBox)(dom.window.document);
            if (answer) {
                return normalizeContent({
                    url,
                    html: `<div class="answer-box"><h1>Google Answer</h1><p>${answer}</p></div>`,
                    title: "Google Answer",
                    siteName: "Google",
                    skipReadability: true,
                    fallbackFavicon: "https://www.google.com/favicon.ico",
                });
            }
        }
        if (urlType === "duckduckgo-search") {
            const dom = new jsdom_1.JSDOM(html);
            const answer = (0, duckduckgo_1.extractAnswerBox)(dom.window.document);
            if (answer) {
                return normalizeContent({
                    url,
                    html: `<div class="answer-box"><h1>DuckDuckGo Answer</h1><p>${answer}</p></div>`,
                    title: "DuckDuckGo Answer",
                    siteName: "DuckDuckGo",
                    skipReadability: true,
                    fallbackFavicon: "https://duckduckgo.com/favicon.ico",
                });
            }
        }
        // No answer box (or a regular page): run the full Readability pipeline.
        return normalizeContent({
            url,
            html,
            skipReadability: false,
        });
    }
    catch (err) {
        // Throw a real Error (with a stack trace) instead of a plain object,
        // while keeping the legacy `code` / `originalError` fields callers read.
        throw Object.assign(new Error("failed to get webpage content :/"), {
            code: "WEBPAGE_ERROR",
            originalError: err,
        });
    }
}
268
// get just the text content
/**
 * Convenience wrapper around getWebpageContent that returns only the
 * plain-text content of the page.
 */
async function getWebpageText(url, options = {}) {
    const { textContent } = await getWebpageContent(url, options);
    return textContent;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,232 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ const vitest_1 = require("vitest");
37
+ const scrape_1 = require("./scrape");
38
+ const common = __importStar(require("./common"));
39
+ const wikipedia_1 = require("./wikipedia");
40
+ const hackernews_1 = require("./hackernews");
41
+ const readability_1 = require("@mozilla/readability");
42
+ const jsdom_1 = require("jsdom");
43
+ const google_1 = require("./scrapers/google");
44
+ const duckduckgo_1 = require("./scrapers/duckduckgo");
45
// Mock dependencies
// Special-source handlers are fully mocked so routing can be tested in isolation.
vitest_1.vi.mock("./wikipedia");
vitest_1.vi.mock("./hackernews");
vitest_1.vi.mock("./scrapers/google", () => ({
    extractAnswerBox: vitest_1.vi.fn(),
}));
vitest_1.vi.mock("./scrapers/duckduckgo", () => ({
    extractAnswerBox: vitest_1.vi.fn(),
}));
// DOM/extraction libraries are mocked; per-test implementations are installed
// in the suite's beforeEach below.
vitest_1.vi.mock("@mozilla/readability");
vitest_1.vi.mock("jsdom");
// Partial mock of ./common: keep the real module but stub the network/browser
// helpers and simplify cleanText to a bare trim.
vitest_1.vi.mock("./common", async () => {
    const actual = await vitest_1.vi.importActual("./common");
    return {
        ...actual,
        createStealthBrowser: vitest_1.vi.fn(),
        createRealisticHeaders: vitest_1.vi.fn().mockReturnValue({ "User-Agent": "test-agent" }),
        parseProxyConfig: vitest_1.vi.fn(),
        // Default: no bot protection, so the plain-fetch path is taken.
        detectBotProtection: vitest_1.vi.fn().mockReturnValue(false),
        cleanText: vitest_1.vi.fn((text) => text?.trim() || ""),
    };
});
// Global fetch mock
global.fetch = vitest_1.vi.fn();
69
(0, vitest_1.describe)("Scrape Module", () => {
    (0, vitest_1.beforeEach)(() => {
        vitest_1.vi.resetAllMocks();
        // Default JSDOM/Readability mocks
        // JSDOM mock: a minimal document exposing one favicon link, two images,
        // and a body with known text content.
        jsdom_1.JSDOM.mockImplementation(() => ({
            window: {
                document: {
                    querySelectorAll: vitest_1.vi.fn().mockImplementation((selector) => {
                        if (selector === "link[rel*='icon']") {
                            return [{ href: "https://example.com/favicon.ico" }];
                        }
                        if (selector === "img") {
                            return [{ src: "https://example.com/image1.jpg" }, { src: "https://example.com/image2.png" }];
                        }
                        return [];
                    }),
                    body: {
                        textContent: "Test content from body",
                    },
                },
            },
        }));
        // Readability mock: parse() always succeeds with a fixed article.
        readability_1.Readability.mockImplementation(() => ({
            parse: vitest_1.vi.fn().mockReturnValue({
                title: "Test Article",
                content: "<div>Test content</div>",
                textContent: "Test content",
                excerpt: "Test excerpt",
                siteName: "Test Site",
            }),
        }));
    });
    // getUrlType-based routing to the special-source handlers.
    (0, vitest_1.describe)("URL Type Routing", () => {
        (0, vitest_1.it)("should route wikipedia URLs to wiki handler", async () => {
            wikipedia_1.wikiGetContent.mockResolvedValue("Wiki content");
            const result = await (0, scrape_1.getWebpageContent)("https://en.wikipedia.org/wiki/Test");
            (0, vitest_1.expect)(result.siteName).toBe("Wikipedia");
            (0, vitest_1.expect)(result.content).toBe("Wiki content");
            (0, vitest_1.expect)(result.favicon).toBeDefined();
            (0, vitest_1.expect)(result.markdown).toBeDefined();
            // Title is derived from the /wiki/<Title> path segment.
            (0, vitest_1.expect)(wikipedia_1.wikiGetContent).toHaveBeenCalledWith("Test");
        });
        (0, vitest_1.it)("should route hackernews URLs to HN handler", async () => {
            hackernews_1.getStoryById.mockResolvedValue({
                title: "HN Story",
                snippet: "HN Content",
            });
            const result = await (0, scrape_1.getWebpageContent)("https://news.ycombinator.com/item?id=12345");
            (0, vitest_1.expect)(result.siteName).toBe("Hacker News");
            (0, vitest_1.expect)(result.title).toBe("HN Story");
            (0, vitest_1.expect)(result.favicon).toBeDefined();
            (0, vitest_1.expect)(result.markdown).toBeDefined();
            // The numeric item id is parsed out of the URL.
            (0, vitest_1.expect)(hackernews_1.getStoryById).toHaveBeenCalledWith(12345);
        });
        (0, vitest_1.it)("should handle unsupported domains", async () => {
            const result = await (0, scrape_1.getWebpageContent)("https://youtube.com/watch?v=123");
            // Unsupported URLs now go through normalizeContent, which uses the JSDOM mock
            (0, vitest_1.expect)(result.textContent).toBe("Test content from body");
            (0, vitest_1.expect)(result.title).toBe("https://youtube.com/watch?v=123");
        });
        (0, vitest_1.it)("should route google search URLs and extract answer box", async () => {
            // Mock fetch response
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>Google Search Result</body></html>"),
                headers: new Map(),
            });
            // Mock extractor to return an answer
            google_1.extractAnswerBox.mockReturnValue("The Answer is 42");
            const result = await (0, scrape_1.getWebpageContent)("https://www.google.com/search?q=answer");
            (0, vitest_1.expect)(result.siteName).toBe("Google");
            (0, vitest_1.expect)(result.title).toBe("Google Answer");
            // The synthesized answer-box HTML becomes the page's rawHtml.
            (0, vitest_1.expect)(result.rawHtml).toContain("The Answer is 42");
            (0, vitest_1.expect)(google_1.extractAnswerBox).toHaveBeenCalled();
        });
        (0, vitest_1.it)("should route duckduckgo search URLs and extract answer box", async () => {
            // Mock fetch response
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>DDG Search Result</body></html>"),
                headers: new Map(),
            });
            // Mock extractor to return an answer
            duckduckgo_1.extractAnswerBox.mockReturnValue("The DDG Answer");
            const result = await (0, scrape_1.getWebpageContent)("https://duckduckgo.com/?q=answer");
            (0, vitest_1.expect)(result.siteName).toBe("DuckDuckGo");
            (0, vitest_1.expect)(result.title).toBe("DuckDuckGo Answer");
            (0, vitest_1.expect)(result.rawHtml).toContain("The DDG Answer");
            (0, vitest_1.expect)(duckduckgo_1.extractAnswerBox).toHaveBeenCalled();
        });
    });
    // The default (non-Puppeteer) path: plain fetch + Readability.
    (0, vitest_1.describe)("General Scraping (Fetch)", () => {
        (0, vitest_1.it)("should scrape content using fetch and readability", async () => {
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>Test</body></html>"),
                headers: new Map(),
            });
            const result = await (0, scrape_1.getWebpageContent)("https://example.com");
            // Values come from the Readability mock and the JSDOM mock metadata.
            (0, vitest_1.expect)(result.title).toBe("Test Article");
            (0, vitest_1.expect)(result.textContent).toBe("Test content");
            (0, vitest_1.expect)(result.favicon).toBe("https://example.com/favicon.ico");
            (0, vitest_1.expect)(result.imageUrls).toBeDefined();
            (0, vitest_1.expect)(result.imageUrls).toContain("https://example.com/image1.jpg");
            (0, vitest_1.expect)(result.imageUrls).toContain("https://example.com/image2.png");
            (0, vitest_1.expect)(result.markdown).toBeDefined();
            (0, vitest_1.expect)(result.markdown?.length).toBeGreaterThan(0);
            (0, vitest_1.expect)(result.rawHtml).toContain("<html><body>Test</body></html>");
            (0, vitest_1.expect)(global.fetch).toHaveBeenCalled();
            (0, vitest_1.expect)(readability_1.Readability).toHaveBeenCalled();
        });
        (0, vitest_1.it)("should fallback to puppeteer if bot protection detected", async () => {
            // First call detects bot
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>Captcha</body></html>"),
                headers: new Map(),
            });
            common.detectBotProtection.mockReturnValueOnce(true);
            // Mock puppeteer part
            const mockPage = {
                setViewport: vitest_1.vi.fn(),
                setExtraHTTPHeaders: vitest_1.vi.fn(),
                goto: vitest_1.vi.fn(),
                content: vitest_1.vi.fn().mockResolvedValue("<html><body>Real Content</body></html>"),
            };
            const mockBrowser = {
                // No pre-existing pages, so the code must call newPage().
                pages: vitest_1.vi.fn().mockResolvedValue([]),
                newPage: vitest_1.vi.fn().mockResolvedValue(mockPage),
                close: vitest_1.vi.fn(),
            };
            common.createStealthBrowser.mockResolvedValue(mockBrowser);
            const result = await (0, scrape_1.getWebpageContent)("https://protected.com");
            // The retry must have gone through the stealth browser.
            (0, vitest_1.expect)(common.createStealthBrowser).toHaveBeenCalled();
            (0, vitest_1.expect)(result.title).toBe("Test Article");
        });
    });
    // Forcing the Puppeteer path via options.usePuppeteer.
    (0, vitest_1.describe)("General Scraping (Puppeteer)", () => {
        (0, vitest_1.it)("should use puppeteer when forced", async () => {
            const mockPage = {
                setViewport: vitest_1.vi.fn(),
                setExtraHTTPHeaders: vitest_1.vi.fn(),
                goto: vitest_1.vi.fn(),
                content: vitest_1.vi.fn().mockResolvedValue("<html><body>Puppeteer Content</body></html>"),
            };
            const mockBrowser = {
                pages: vitest_1.vi.fn().mockResolvedValue([]),
                newPage: vitest_1.vi.fn().mockResolvedValue(mockPage),
                close: vitest_1.vi.fn(),
            };
            common.createStealthBrowser.mockResolvedValue(mockBrowser);
            await (0, scrape_1.getWebpageContent)("https://example.com", { usePuppeteer: true });
            // With usePuppeteer forced, plain fetch must never run.
            (0, vitest_1.expect)(global.fetch).not.toHaveBeenCalled();
            (0, vitest_1.expect)(common.createStealthBrowser).toHaveBeenCalled();
            (0, vitest_1.expect)(mockPage.goto).toHaveBeenCalledWith("https://example.com", vitest_1.expect.any(Object));
        });
    });
    (0, vitest_1.describe)("getWebpageText", () => {
        (0, vitest_1.it)("should return only text content", async () => {
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("html"),
                headers: new Map(),
            });
            const text = await (0, scrape_1.getWebpageText)("https://example.com");
            // textContent from the Readability mock, nothing else.
            (0, vitest_1.expect)(text).toBe("Test content");
        });
    });
});
@@ -0,0 +1,12 @@
1
import { ScraperOptions, SearchResult, WebpageContent } from "../types";
/** Runs a search via Google and returns structured results. */
export declare function searchGoogle(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
/** Runs a search via DuckDuckGo and returns structured results. */
export declare function searchDuckDuckGo(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
/**
 * Engine-agnostic search entry point.
 * NOTE(review): engine selection/fallback order is not visible from this
 * declaration — confirm against the implementation.
 */
export declare function search(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
/**
 * Fetches a page and returns normalized content (text, markdown, metadata).
 * `options` may be a bare boolean — legacy form meaning "use Puppeteer".
 */
export declare function getWebpageContent(url: string, options?: {
    usePuppeteer?: boolean;
} & ScraperOptions | boolean): Promise<WebpageContent>;
/** Fetches a page and returns only its plain-text content. */
export declare function getWebpageText(url: string, options?: {
    usePuppeteer?: boolean;
} & ScraperOptions): Promise<string>;
/**
 * Resolves true when the URL is accessible.
 * NOTE(review): presumably a reachability probe (HEAD/GET) — confirm
 * against the implementation.
 */
export declare function isUrlAccessible(url: string): Promise<boolean>;
/** Legacy aliases kept for backward compatibility with older imports. */
export { searchGoogle as searchGoogleLegacy, searchDuckDuckGo as searchDuckDuckGoLegacy };