llm-search-tools 1.1.0
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
package/dist/modules/scrape.js
@@ -0,0 +1,272 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeContent = normalizeContent;
exports.getWebpageContent = getWebpageContent;
exports.getWebpageText = getWebpageText;
const readability_1 = require("@mozilla/readability");
const jsdom_1 = require("jsdom");
const turndown_1 = __importDefault(require("turndown"));
const wikipedia_1 = require("./wikipedia");
const hackernews_1 = require("./hackernews");
const google_1 = require("./scrapers/google");
const duckduckgo_1 = require("./scrapers/duckduckgo");
const common_1 = require("./common");
// Configure TurndownService once for better performance
const turndownService = new turndown_1.default({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
});
// check url type and get appropriate handler
function getUrlType(url) {
    try {
        const urlObj = new URL(url);
        const hostname = urlObj.hostname;
        if (hostname.includes("wikipedia.org")) {
            return "wikipedia";
        }
        if (hostname === "news.ycombinator.com" && url.includes("item?id=")) {
            return "hackernews";
        }
        if (hostname.includes("google.") && urlObj.pathname.startsWith("/search")) {
            return "google-search";
        }
        if (hostname.includes("duckduckgo.com") && (urlObj.searchParams.has("q") || url.includes("?q="))) {
            return "duckduckgo-search";
        }
        // list of domains that don't work well with readability
        const unsupported = [
            "youtube.com",
            "youtu.be",
            "vimeo.com",
            "twitter.com",
            "x.com",
            "instagram.com",
            "facebook.com",
            "linkedin.com",
        ];
        if (unsupported.some((domain) => hostname.includes(domain))) {
            return "unsupported";
        }
        return "general";
    }
    catch {
        return "unsupported";
    }
}
// Helper to extract images from DOM
function extractImages(doc) {
    const imageUrls = [];
    doc.querySelectorAll("img").forEach((img) => {
        const src = img.src;
        // JSDOM resolves relative URLs to absolute when initialized with url option
        if (src && src.length > 0) {
            imageUrls.push(src);
        }
    });
    return imageUrls;
}
// Helper to extract favicon from DOM
function extractFavicon(doc, url) {
    const iconLinks = doc.querySelectorAll("link[rel*='icon']");
    if (iconLinks.length > 0) {
        return iconLinks[0].href;
    }
    try {
        return new URL("/favicon.ico", url).href;
    }
    catch {
        return undefined;
    }
}
function normalizeContent(params) {
    const { url, html, title, siteName, fallbackFavicon, skipReadability } = params;
    const dom = new jsdom_1.JSDOM(html, { url });
    const doc = dom.window.document;
    // Extract metadata using helpers
    const favicon = extractFavicon(doc, url) || fallbackFavicon;
    const imageUrls = extractImages(doc);
    // Use Readability for better content extraction unless explicitly skipped
    let finalContent = html;
    let finalTitle = title || url;
    let textContent = "";
    let excerpt;
    let finalSiteName = siteName;
    if (!skipReadability) {
        const reader = new readability_1.Readability(doc);
        const article = reader.parse();
        if (article) {
            finalContent = article.content || html;
            finalTitle = article.title || title || url;
            textContent = (0, common_1.cleanText)(article.textContent || "");
            excerpt = article.excerpt || undefined;
            finalSiteName = siteName || article.siteName || undefined;
        }
        else {
            // Readability failed, fall back to raw extraction
            textContent = (0, common_1.cleanText)(doc.body?.textContent || "");
            excerpt = textContent.slice(0, 200) + (textContent.length > 200 ? "..." : "");
        }
    }
    else {
        // Skip Readability - use raw extraction
        textContent = (0, common_1.cleanText)(doc.body?.textContent || "");
        excerpt = textContent.slice(0, 200) + (textContent.length > 200 ? "..." : "");
    }
    const markdown = turndownService.turndown(finalContent);
    return {
        title: finalTitle,
        content: finalContent,
        textContent,
        length: textContent.length,
        excerpt,
        siteName: finalSiteName,
        favicon,
        imageUrls,
        markdown,
        rawHtml: html,
    };
}
// get webpage content using readability with stealth puppeteer
async function getWebpageContent(url, options) {
    // Backward compatibility: if options is boolean, treat as usePuppeteer
    if (typeof options === "boolean") {
        options = { usePuppeteer: options };
    }
    else if (!options) {
        options = {};
    }
    // Cast options to the correct type for internal use
    const opts = options;
    try {
        const urlType = getUrlType(url);
        // handle special cases
        if (urlType === "wikipedia") {
            const title = url.split("/wiki/")[1]?.replace(/_/g, " ") || url;
            const html = await (0, wikipedia_1.wikiGetContent)(title);
            return normalizeContent({
                url,
                html,
                title,
                siteName: "Wikipedia",
                fallbackFavicon: "https://en.wikipedia.org/static/favicon/wikipedia.ico",
                skipReadability: true, // Wikipedia content is already clean from their API
            });
        }
        if (urlType === "hackernews") {
            const idStr = url.split("id=")[1];
            const id = parseInt(idStr);
            const story = await (0, hackernews_1.getStoryById)(id);
            const html = story.snippet || story.title || "No content available";
            return normalizeContent({
                url,
                html,
                title: story.title || url,
                siteName: "Hacker News",
                fallbackFavicon: "https://news.ycombinator.com/favicon.ico",
                skipReadability: true, // HN snippets are already clean
            });
        }
        if (urlType === "unsupported") {
            return normalizeContent({
                url,
                html: "<html><body><p>This URL type is not supported for content extraction.</p></body></html>",
                title: url,
                skipReadability: true,
            });
        }
        // handle general case with readability
        let html;
        if (opts.usePuppeteer) {
            // Use stealth puppeteer for bot-protected sites
            const proxy = (0, common_1.parseProxyConfig)(opts.proxy);
            const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
            try {
                const pages = await browser.pages();
                const page = pages.length > 0 ? pages[0] : await browser.newPage();
                await page.setViewport({ width: 1920, height: 1080 });
                await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
                await page.goto(url, { waitUntil: "networkidle2" });
                html = await page.content();
            }
            finally {
                await browser.close();
            }
        }
        else {
            try {
                const headers = (0, common_1.createRealisticHeaders)();
                const fetchOptions = { headers };
                const proxy = (0, common_1.parseProxyConfig)(opts.proxy);
                if (proxy) {
                    if (proxy.type === "socks4" || proxy.type === "socks5") {
                        const { SocksProxyAgent } = await import("socks-proxy-agent");
                        fetchOptions.agent = new SocksProxyAgent(proxy.url);
                    }
                    else {
                        const { HttpsProxyAgent } = await import("https-proxy-agent");
                        fetchOptions.agent = new HttpsProxyAgent(proxy.url);
                    }
                }
                const response = await fetch(url, fetchOptions);
                html = await response.text();
                if ((0, common_1.detectBotProtection)(response.headers, html)) {
                    // If bot protection detected, re-run with puppeteer
                    return await getWebpageContent(url, { ...opts, usePuppeteer: true });
                }
            }
            catch {
                // If basic fetch fails, try with puppeteer
                return await getWebpageContent(url, { ...opts, usePuppeteer: true });
            }
        }
        // Extract Answer Box for Search Engines
        if (urlType === "google-search") {
            const dom = new jsdom_1.JSDOM(html);
            const answer = (0, google_1.extractAnswerBox)(dom.window.document);
            if (answer) {
                return normalizeContent({
                    url,
                    html: `<div class="answer-box"><h1>Google Answer</h1><p>${answer}</p></div>`,
                    title: "Google Answer",
                    siteName: "Google",
                    skipReadability: true,
                    fallbackFavicon: "https://www.google.com/favicon.ico",
                });
            }
        }
        if (urlType === "duckduckgo-search") {
            const dom = new jsdom_1.JSDOM(html);
            const answer = (0, duckduckgo_1.extractAnswerBox)(dom.window.document);
            if (answer) {
                return normalizeContent({
                    url,
                    html: `<div class="answer-box"><h1>DuckDuckGo Answer</h1><p>${answer}</p></div>`,
                    title: "DuckDuckGo Answer",
                    siteName: "DuckDuckGo",
                    skipReadability: true,
                    fallbackFavicon: "https://duckduckgo.com/favicon.ico",
                });
            }
        }
        return normalizeContent({
            url,
            html,
            skipReadability: false,
        });
    }
    catch (err) {
        throw {
            message: "failed to get webpage content :/",
            code: "WEBPAGE_ERROR",
            originalError: err,
        };
    }
}
// get just the text content
async function getWebpageText(url, options = {}) {
    const content = await getWebpageContent(url, options);
    return content.textContent;
}
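Usage sketch for the module above, as a reviewer aid. Hedged assumptions: the bare "llm-search-tools" import path presumes dist/index.js (listed above but not shown in this hunk) re-exports these functions; the fields read from the result are exactly those returned by normalizeContent.

import { getWebpageContent, getWebpageText } from "llm-search-tools"; // import path assumed; otherwise import from dist/modules/scrape

async function demo() {
    // Default path: plain fetch with realistic headers, falling back to
    // stealth Puppeteer when detectBotProtection fires or the fetch throws.
    const page = await getWebpageContent("https://example.com/article");
    console.log(page.title, page.length, page.excerpt);
    console.log(page.markdown?.slice(0, 200));

    // Legacy boolean form, kept for backward compatibility: force Puppeteer.
    const rendered = await getWebpageContent("https://example.com/spa", true);
    console.log(rendered.siteName, rendered.imageUrls.length);

    // Convenience wrapper that returns only the cleaned text.
    const text = await getWebpageText("https://en.wikipedia.org/wiki/Readability");
    console.log(text.slice(0, 120));
}

demo().catch(console.error);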
package/dist/modules/scrape.test.d.ts
@@ -0,0 +1 @@
export {};
package/dist/modules/scrape.test.js
@@ -0,0 +1,232 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
const vitest_1 = require("vitest");
const scrape_1 = require("./scrape");
const common = __importStar(require("./common"));
const wikipedia_1 = require("./wikipedia");
const hackernews_1 = require("./hackernews");
const readability_1 = require("@mozilla/readability");
const jsdom_1 = require("jsdom");
const google_1 = require("./scrapers/google");
const duckduckgo_1 = require("./scrapers/duckduckgo");
// Mock dependencies
vitest_1.vi.mock("./wikipedia");
vitest_1.vi.mock("./hackernews");
vitest_1.vi.mock("./scrapers/google", () => ({
    extractAnswerBox: vitest_1.vi.fn(),
}));
vitest_1.vi.mock("./scrapers/duckduckgo", () => ({
    extractAnswerBox: vitest_1.vi.fn(),
}));
vitest_1.vi.mock("@mozilla/readability");
vitest_1.vi.mock("jsdom");
vitest_1.vi.mock("./common", async () => {
    const actual = await vitest_1.vi.importActual("./common");
    return {
        ...actual,
        createStealthBrowser: vitest_1.vi.fn(),
        createRealisticHeaders: vitest_1.vi.fn().mockReturnValue({ "User-Agent": "test-agent" }),
        parseProxyConfig: vitest_1.vi.fn(),
        detectBotProtection: vitest_1.vi.fn().mockReturnValue(false),
        cleanText: vitest_1.vi.fn((text) => text?.trim() || ""),
    };
});
// Global fetch mock
global.fetch = vitest_1.vi.fn();
(0, vitest_1.describe)("Scrape Module", () => {
    (0, vitest_1.beforeEach)(() => {
        vitest_1.vi.resetAllMocks();
        // Default JSDOM/Readability mocks
        jsdom_1.JSDOM.mockImplementation(() => ({
            window: {
                document: {
                    querySelectorAll: vitest_1.vi.fn().mockImplementation((selector) => {
                        if (selector === "link[rel*='icon']") {
                            return [{ href: "https://example.com/favicon.ico" }];
                        }
                        if (selector === "img") {
                            return [{ src: "https://example.com/image1.jpg" }, { src: "https://example.com/image2.png" }];
                        }
                        return [];
                    }),
                    body: {
                        textContent: "Test content from body",
                    },
                },
            },
        }));
        readability_1.Readability.mockImplementation(() => ({
            parse: vitest_1.vi.fn().mockReturnValue({
                title: "Test Article",
                content: "<div>Test content</div>",
                textContent: "Test content",
                excerpt: "Test excerpt",
                siteName: "Test Site",
            }),
        }));
    });
    (0, vitest_1.describe)("URL Type Routing", () => {
        (0, vitest_1.it)("should route wikipedia URLs to wiki handler", async () => {
            wikipedia_1.wikiGetContent.mockResolvedValue("Wiki content");
            const result = await (0, scrape_1.getWebpageContent)("https://en.wikipedia.org/wiki/Test");
            (0, vitest_1.expect)(result.siteName).toBe("Wikipedia");
            (0, vitest_1.expect)(result.content).toBe("Wiki content");
            (0, vitest_1.expect)(result.favicon).toBeDefined();
            (0, vitest_1.expect)(result.markdown).toBeDefined();
            (0, vitest_1.expect)(wikipedia_1.wikiGetContent).toHaveBeenCalledWith("Test");
        });
        (0, vitest_1.it)("should route hackernews URLs to HN handler", async () => {
            hackernews_1.getStoryById.mockResolvedValue({
                title: "HN Story",
                snippet: "HN Content",
            });
            const result = await (0, scrape_1.getWebpageContent)("https://news.ycombinator.com/item?id=12345");
            (0, vitest_1.expect)(result.siteName).toBe("Hacker News");
            (0, vitest_1.expect)(result.title).toBe("HN Story");
            (0, vitest_1.expect)(result.favicon).toBeDefined();
            (0, vitest_1.expect)(result.markdown).toBeDefined();
            (0, vitest_1.expect)(hackernews_1.getStoryById).toHaveBeenCalledWith(12345);
        });
        (0, vitest_1.it)("should handle unsupported domains", async () => {
            const result = await (0, scrape_1.getWebpageContent)("https://youtube.com/watch?v=123");
            // Unsupported URLs now go through normalizeContent, which uses the JSDOM mock
            (0, vitest_1.expect)(result.textContent).toBe("Test content from body");
            (0, vitest_1.expect)(result.title).toBe("https://youtube.com/watch?v=123");
        });
        (0, vitest_1.it)("should route google search URLs and extract answer box", async () => {
            // Mock fetch response
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>Google Search Result</body></html>"),
                headers: new Map(),
            });
            // Mock extractor to return an answer
            google_1.extractAnswerBox.mockReturnValue("The Answer is 42");
            const result = await (0, scrape_1.getWebpageContent)("https://www.google.com/search?q=answer");
            (0, vitest_1.expect)(result.siteName).toBe("Google");
            (0, vitest_1.expect)(result.title).toBe("Google Answer");
            (0, vitest_1.expect)(result.rawHtml).toContain("The Answer is 42");
            (0, vitest_1.expect)(google_1.extractAnswerBox).toHaveBeenCalled();
        });
        (0, vitest_1.it)("should route duckduckgo search URLs and extract answer box", async () => {
            // Mock fetch response
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>DDG Search Result</body></html>"),
                headers: new Map(),
            });
            // Mock extractor to return an answer
            duckduckgo_1.extractAnswerBox.mockReturnValue("The DDG Answer");
            const result = await (0, scrape_1.getWebpageContent)("https://duckduckgo.com/?q=answer");
            (0, vitest_1.expect)(result.siteName).toBe("DuckDuckGo");
            (0, vitest_1.expect)(result.title).toBe("DuckDuckGo Answer");
            (0, vitest_1.expect)(result.rawHtml).toContain("The DDG Answer");
            (0, vitest_1.expect)(duckduckgo_1.extractAnswerBox).toHaveBeenCalled();
        });
    });
    (0, vitest_1.describe)("General Scraping (Fetch)", () => {
        (0, vitest_1.it)("should scrape content using fetch and readability", async () => {
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>Test</body></html>"),
                headers: new Map(),
            });
            const result = await (0, scrape_1.getWebpageContent)("https://example.com");
            (0, vitest_1.expect)(result.title).toBe("Test Article");
            (0, vitest_1.expect)(result.textContent).toBe("Test content");
            (0, vitest_1.expect)(result.favicon).toBe("https://example.com/favicon.ico");
            (0, vitest_1.expect)(result.imageUrls).toBeDefined();
            (0, vitest_1.expect)(result.imageUrls).toContain("https://example.com/image1.jpg");
            (0, vitest_1.expect)(result.imageUrls).toContain("https://example.com/image2.png");
            (0, vitest_1.expect)(result.markdown).toBeDefined();
            (0, vitest_1.expect)(result.markdown?.length).toBeGreaterThan(0);
            (0, vitest_1.expect)(result.rawHtml).toContain("<html><body>Test</body></html>");
            (0, vitest_1.expect)(global.fetch).toHaveBeenCalled();
            (0, vitest_1.expect)(readability_1.Readability).toHaveBeenCalled();
        });
        (0, vitest_1.it)("should fallback to puppeteer if bot protection detected", async () => {
            // First call detects bot
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("<html><body>Captcha</body></html>"),
                headers: new Map(),
            });
            common.detectBotProtection.mockReturnValueOnce(true);
            // Mock puppeteer part
            const mockPage = {
                setViewport: vitest_1.vi.fn(),
                setExtraHTTPHeaders: vitest_1.vi.fn(),
                goto: vitest_1.vi.fn(),
                content: vitest_1.vi.fn().mockResolvedValue("<html><body>Real Content</body></html>"),
            };
            const mockBrowser = {
                pages: vitest_1.vi.fn().mockResolvedValue([]),
                newPage: vitest_1.vi.fn().mockResolvedValue(mockPage),
                close: vitest_1.vi.fn(),
            };
            common.createStealthBrowser.mockResolvedValue(mockBrowser);
            const result = await (0, scrape_1.getWebpageContent)("https://protected.com");
            (0, vitest_1.expect)(common.createStealthBrowser).toHaveBeenCalled();
            (0, vitest_1.expect)(result.title).toBe("Test Article");
        });
    });
    (0, vitest_1.describe)("General Scraping (Puppeteer)", () => {
        (0, vitest_1.it)("should use puppeteer when forced", async () => {
            const mockPage = {
                setViewport: vitest_1.vi.fn(),
                setExtraHTTPHeaders: vitest_1.vi.fn(),
                goto: vitest_1.vi.fn(),
                content: vitest_1.vi.fn().mockResolvedValue("<html><body>Puppeteer Content</body></html>"),
            };
            const mockBrowser = {
                pages: vitest_1.vi.fn().mockResolvedValue([]),
                newPage: vitest_1.vi.fn().mockResolvedValue(mockPage),
                close: vitest_1.vi.fn(),
            };
            common.createStealthBrowser.mockResolvedValue(mockBrowser);
            await (0, scrape_1.getWebpageContent)("https://example.com", { usePuppeteer: true });
            (0, vitest_1.expect)(global.fetch).not.toHaveBeenCalled();
            (0, vitest_1.expect)(common.createStealthBrowser).toHaveBeenCalled();
            (0, vitest_1.expect)(mockPage.goto).toHaveBeenCalledWith("https://example.com", vitest_1.expect.any(Object));
        });
    });
    (0, vitest_1.describe)("getWebpageText", () => {
        (0, vitest_1.it)("should return only text content", async () => {
            global.fetch.mockResolvedValue({
                text: () => Promise.resolve("html"),
                headers: new Map(),
            });
            const text = await (0, scrape_1.getWebpageText)("https://example.com");
            (0, vitest_1.expect)(text).toBe("Test content");
        });
    });
});
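The "should fallback to puppeteer if bot protection detected" test above pins down the central retry behavior of scrape.js. Reduced to its essentials, the pattern under test looks like the sketch below; the helper names (fetchHtml, renderHtml, looksBlocked) are hypothetical stand-ins, not functions from this package.

// TypeScript sketch of the fetch-then-browser retry pattern the test exercises.
async function fetchWithFallback(
    url: string,
    fetchHtml: (u: string) => Promise<{ headers: Headers; html: string }>,
    renderHtml: (u: string) => Promise<string>,
    looksBlocked: (headers: Headers, html: string) => boolean,
): Promise<string> {
    try {
        const { headers, html } = await fetchHtml(url);
        // Pay for a headless browser only when the cheap path is blocked.
        return looksBlocked(headers, html) ? await renderHtml(url) : html;
    }
    catch {
        // Network-level failure: go straight to the browser.
        return renderHtml(url);
    }
}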
package/dist/modules/scraper.d.ts
@@ -0,0 +1,12 @@
import { ScraperOptions, SearchResult, WebpageContent } from "../types";
export declare function searchGoogle(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
export declare function searchDuckDuckGo(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
export declare function search(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
export declare function getWebpageContent(url: string, options?: {
    usePuppeteer?: boolean;
} & ScraperOptions | boolean): Promise<WebpageContent>;
export declare function getWebpageText(url: string, options?: {
    usePuppeteer?: boolean;
} & ScraperOptions): Promise<string>;
export declare function isUrlAccessible(url: string): Promise<boolean>;
export { searchGoogle as searchGoogleLegacy, searchDuckDuckGo as searchDuckDuckGoLegacy };
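A consumer-side sketch against the declarations above. The "llm-search-tools" import path is again an assumption; search()'s engine routing lives in scraper.js (+640 lines, not shown in this hunk), and the SearchResult shape is defined in types.d.ts, so no result fields are dereferenced here.

import { search, getWebpageContent, isUrlAccessible } from "llm-search-tools";

async function main(): Promise<void> {
    // search() signature taken from the declarations above.
    const results = await search("mozilla readability jsdom");
    console.log(`search returned ${results.length} results`);

    // isUrlAccessible gates the heavier extraction behind a cheap reachability check.
    const url = "https://developer.mozilla.org/en-US/docs/Web/API/Document";
    if (await isUrlAccessible(url)) {
        const page = await getWebpageContent(url, { usePuppeteer: false });
        console.log(page.title, "-", page.length, "chars");
    }
}

main().catch(console.error);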