llm-search-tools 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.searchAniDB = searchAniDB;
|
|
4
|
+
exports.getAniDBDetails = getAniDBDetails;
|
|
5
|
+
const common_1 = require("../common");
|
|
6
|
+
const jsdom_1 = require("jsdom");
|
|
7
|
+
// AniDB bans clients that query it too quickly. The recommended gap between
// requests is 2s; 2500 ms is used here, which leaves some margin on top.
const MIN_DELAY = 2500;
// Epoch-millisecond timestamp written by the most recent enforceRateLimit()
// call; 0 until the first call.
let lastSearchTime = 0;
/**
 * Delays the caller until at least MIN_DELAY milliseconds have elapsed since
 * the previous call, then records the current time as the new reference point.
 *
 * @returns A promise that resolves once the caller may issue its request.
 */
async function enforceRateLimit() {
    const remaining = MIN_DELAY - (Date.now() - lastSearchTime);
    if (remaining > 0) {
        await new Promise((wake) => {
            setTimeout(wake, remaining);
        });
    }
    // Stamp after any wait so the next caller measures from the real request time.
    lastSearchTime = Date.now();
}
|
|
19
|
+
/**
 * Searches AniDB for anime matching `query`.
 *
 * AniDB has strict anti-bot protection ("AntiLeech"), so the request is
 * always made through the stealth Puppeteer browser and only after the
 * module-level rate limiter has been honoured.
 *
 * @param query - Free-text search term; it is URI-encoded before use.
 * @param options - Scraper options. `forcePuppeteer` is always overridden to `true`.
 * @returns Whatever `scrapeAniDBWithPuppeteer` resolves to for the search URL.
 * @throws A plain object `{ message, code: "ANIDB_SEARCH_ERROR", originalError }`
 *         wrapping any failure raised while rate limiting or scraping.
 */
async function searchAniDB(query, options = {}) {
    try {
        await enforceRateLimit();
        // forcePuppeteer is applied last so a caller-supplied value cannot switch it off.
        const scrapeOptions = Object.assign({}, options, { forcePuppeteer: true });
        // The search is expressed through the adb.search / do.update=Search / noalias=1 query parameters.
        const queryString = `adb.search=${encodeURIComponent(query)}&do.update=Search&noalias=1`;
        const found = await scrapeAniDBWithPuppeteer(`https://anidb.net/anime/?${queryString}`, scrapeOptions);
        return found;
    }
    catch (cause) {
        const failure = {
            message: "AniDB search failed",
            code: "ANIDB_SEARCH_ERROR",
            originalError: cause,
        };
        throw failure;
    }
}
|
|
45
|
+
/**
 * Loads an AniDB search URL in a stealth browser and converts the page into
 * result objects.
 *
 * Two page shapes are handled: the normal `table.animelist` result list, and
 * the case where the browser ends up on a single anime page instead of the
 * list (AniDB sometimes redirects straight to the only match).
 *
 * @param url - Fully built AniDB search URL.
 * @param options - Scraper options; only `options.proxy` is read here.
 * @returns An array of results; empty when neither page shape is recognised.
 */
async function scrapeAniDBWithPuppeteer(url, options) {
    const proxyConfig = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxyConfig || undefined);
    try {
        const page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        // AniDB gets a 30s navigation budget.
        await page.goto(url, { waitUntil: "networkidle2", timeout: 30000 });
        let listPresent = true;
        try {
            await page.waitForSelector("table.animelist", { timeout: 10000 });
        }
        catch (e) {
            listPresent = false;
        }
        if (listPresent) {
            const listDom = new jsdom_1.JSDOM(await page.content());
            return parseAniDBList(listDom.window.document);
        }
        // No list appeared: check whether we landed on a single anime page.
        const landedUrl = page.url();
        const onDetailPage = landedUrl.includes("/anime/") && !landedUrl.includes("adb.search");
        if (!onDetailPage) {
            return [];
        }
        const detailDom = new jsdom_1.JSDOM(await page.content());
        const entry = parseAniDBSinglePage(detailDom.window.document, landedUrl);
        if (entry) {
            return [entry];
        }
        return [];
    }
    finally {
        // Always release the browser, whichever path returned or threw.
        await browser.close();
    }
}
|
|
78
|
+
/**
 * Fetches a single AniDB anime page and parses its details.
 *
 * This is best-effort: every failure (rate limiting, browser launch,
 * navigation, parsing, or closing the browser) is swallowed and reported as
 * an empty object.
 *
 * @param url - URL of the AniDB anime page to load.
 * @param options - Scraper options; only `options.proxy` is read here.
 * @returns The parsed details, or `{}` when the page could not be parsed or loaded.
 */
async function getAniDBDetails(url, options = {}) {
    const loadDetails = async () => {
        await enforceRateLimit();
        const proxyConfig = (0, common_1.parseProxyConfig)(options.proxy);
        const browser = await (0, common_1.createStealthBrowser)(proxyConfig || undefined);
        try {
            const page = await browser.newPage();
            await page.setViewport({ width: 1920, height: 1080 });
            await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
            await page.goto(url, { waitUntil: "networkidle2", timeout: 30000 });
            const markup = await page.content();
            const details = parseAniDBSinglePage(new jsdom_1.JSDOM(markup).window.document, url);
            return details ? details : {};
        }
        finally {
            await browser.close();
        }
    };
    try {
        return await loadDetails();
    }
    catch (e) {
        return {};
    }
}
|
|
101
|
+
/**
 * Converts the rows of AniDB's `table.animelist` into result objects.
 *
 * At most 10 results are produced; scanning stops as soon as the cap is hit.
 * Rows without a title link are skipped.
 *
 * @param doc - Document (or document-like object) holding the result table.
 * @returns Array of `{ title, url, rating, description, source, mediaType }`.
 *          `rating` and `description` are `undefined` when the matching
 *          column is missing from the row.
 */
function parseAniDBList(doc) {
    const MAX_RESULTS = 10;
    // Resolve site-relative links against the AniDB origin. URL resolution
    // handles "/path", "path" and protocol-relative "//host/path" forms, which
    // plain string concatenation turned into broken URLs for the last two.
    const toAbsolute = (href) => {
        if (!href || href.startsWith("http")) {
            return href;
        }
        try {
            return new URL(href, "https://anidb.net").href;
        }
        catch {
            // Unparseable href: keep the historical concatenation behaviour.
            return `https://anidb.net${href}`;
        }
    };
    const results = [];
    for (const row of Array.from(doc.querySelectorAll("table.animelist tbody tr"))) {
        if (results.length >= MAX_RESULTS) {
            break;
        }
        // The title lives in the "Title"-labelled cell, or the .name cell depending on layout.
        const titleLink = row.querySelector("td[data-label='Title'] a, td.name a");
        if (!titleLink) {
            continue;
        }
        const title = titleLink.textContent?.trim() || "";
        const url = toAbsolute(titleLink.getAttribute("href") || "");
        // Optional metadata columns.
        const type = row.querySelector("td[data-label='Type']")?.textContent?.trim(); // e.g. TV, Movie, OVA
        const rating = row.querySelector("td[data-label='Rating']")?.textContent?.trim();
        results.push({
            title,
            url,
            rating,
            description: type ? `Type: ${type}` : undefined,
            source: "anidb",
            mediaType: "anime",
        });
    }
    return results;
}
|
|
134
|
+
/**
 * Parses a single AniDB anime page (reached directly, or via a search that
 * redirected to its only match).
 *
 * @param doc - Document (or document-like object) of the anime page.
 * @param url - URL to record on the result; it is passed through unchanged.
 * @returns `{ title, url, description, rating, posterUrl, source, mediaType }`,
 *          or `null` when the page has no `h1.anime` heading.
 */
function parseAniDBSinglePage(doc, url) {
    const titleEl = doc.querySelector("h1.anime");
    if (!titleEl) {
        return null;
    }
    // The heading text carries an "Anime:" prefix that is not part of the title.
    const title = titleEl.textContent?.replace("Anime:", "").trim() || "";
    const descriptionEl = doc.querySelector("div.desc");
    const ratingEl = doc.querySelector("tr.rating td.value");
    const imgEl = doc.querySelector("div.image img");
    let posterUrl = imgEl?.getAttribute("src") || undefined;
    if (posterUrl && !posterUrl.startsWith("http")) {
        // Resolve against the AniDB origin. Plain concatenation mangled
        // protocol-relative ("//cdn.example/x.jpg") and slash-less sources.
        try {
            posterUrl = new URL(posterUrl, "https://anidb.net").href;
        }
        catch {
            // Unparseable src: keep the historical concatenation behaviour.
            posterUrl = `https://anidb.net${posterUrl}`;
        }
    }
    return {
        title,
        url,
        description: descriptionEl?.textContent?.trim(),
        rating: ratingEl?.textContent?.trim(),
        posterUrl,
        source: "anidb",
        mediaType: "anime",
    };
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { ScraperOptions, SearchResult } from "../../types";
|
|
2
|
+
/**
 * Extracts the "Answer Box" or "Instant Answer" from the DuckDuckGo Search DOM.
 *
 * @param doc - Parsed DuckDuckGo results document to inspect.
 * @returns The trimmed text of the first answer element found, or `undefined`
 *          when the document contains none of the known answer elements.
 */
|
|
5
|
+
export declare function extractAnswerBox(doc: Document): string | undefined;
|
|
6
|
+
/**
 * Searches DuckDuckGo for `query`.
 *
 * The implementation serves cached results for up to one hour, spaces live
 * requests at least two seconds apart, tries the static HTML endpoint first
 * and falls back to a Puppeteer-driven browser (always used for image
 * searches or when `forcePuppeteer` is set).
 *
 * @param query - Search terms.
 * @param options - Optional scraper settings.
 * @returns A promise for the list of search results.
 * @throws A plain object `{ message, code: "DDG_SEARCH_ERROR", originalError }` on failure.
 */
export declare function searchDuckDuckGo(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.extractAnswerBox = extractAnswerBox;
|
|
4
|
+
exports.searchDuckDuckGo = searchDuckDuckGo;
|
|
5
|
+
const common_1 = require("../common");
|
|
6
|
+
const jsdom_1 = require("jsdom");
|
|
7
|
+
/**
 * Extracts the "Answer Box" or "Instant Answer" from the DuckDuckGo Search DOM.
 *
 * Candidate elements are checked in priority order and the first one with
 * non-blank text wins. An element whose text is only whitespace is treated
 * as absent, so the lookup moves on to the next candidate instead of
 * returning an empty string.
 *
 * @param doc - Document (or document-like object) of a DuckDuckGo results page.
 * @returns The trimmed answer text, or `undefined` when no candidate has text.
 */
function extractAnswerBox(doc) {
    const candidateSelectors = [
        ".module__text", // 1. Abstract / Wikipedia snippet
        ".zci__def__text", // 2. Definition / answer
        ".c-base__title", // 3. Calculator / unit converter
        ".zci__body", // 4. Generic fact
    ];
    for (const selector of candidateSelectors) {
        const text = doc.querySelector(selector)?.textContent?.trim();
        if (text) {
            return text;
        }
    }
    return undefined;
}
|
|
33
|
+
// Result cache, keyed by the string produced from the query and merged options.
// Entries are considered fresh for CACHE_TTL milliseconds.
const searchCache = new Map();
const CACHE_TTL = 60 * 60 * 1000; // 1 hour
// Minimum gap (ms) between two live DuckDuckGo searches.
const MIN_DELAY_BETWEEN_SEARCHES = 2000;
// Epoch-millisecond timestamp written by the most recent enforceRateLimit()
// call; 0 until the first call.
let lastDDGSearchTime = 0;
/**
 * Delays the caller until at least MIN_DELAY_BETWEEN_SEARCHES milliseconds
 * have passed since the previous call, then records the current time.
 *
 * @returns A promise that resolves once the caller may issue its search.
 */
async function enforceRateLimit() {
    const elapsed = Date.now() - lastDDGSearchTime;
    const remaining = MIN_DELAY_BETWEEN_SEARCHES - elapsed;
    if (remaining > 0) {
        await new Promise((wake) => {
            setTimeout(wake, remaining);
        });
    }
    lastDDGSearchTime = Date.now();
}
|
|
48
|
+
/**
 * Extracts the direct target URL from a DuckDuckGo redirect link.
 *
 * Handles three inputs:
 *  - `duckduckgo.com/l/?uddg=<target>` result redirects: returns `<target>`;
 *  - `duckduckgo.com/y.js?u3=<click-url>` ad redirects: returns the `ld`
 *    parameter of the click URL when present, otherwise the click URL;
 *  - anything else: returned as-is, after protocol-relative (`//…`) and
 *    site-relative (`/…`) links have been made absolute.
 *
 * `URLSearchParams.get` already percent-decodes values, so they are returned
 * without a second `decodeURIComponent`. Decoding twice corrupted targets
 * that legitimately contain encoded characters (`%20`, `%26`, `%25`, …) and
 * could throw on a literal `%`.
 *
 * @param duckduckgoUrl - The href taken from a DuckDuckGo result.
 * @returns The direct URL when one can be extracted. If the input cannot be
 *          parsed as a URL, the first `http(s)://…` substring found in it,
 *          or the input itself when there is none.
 */
function extractDirectUrl(duckduckgoUrl) {
    try {
        let urlStr = duckduckgoUrl;
        // DuckDuckGo emits protocol-relative and site-relative links.
        if (urlStr.startsWith("//")) {
            urlStr = "https:" + urlStr;
        }
        else if (urlStr.startsWith("/")) {
            urlStr = "https://duckduckgo.com" + urlStr;
        }
        const url = new URL(urlStr);
        if (url.hostname !== "duckduckgo.com") {
            return urlStr;
        }
        // Regular result redirect.
        if (url.pathname === "/l/") {
            const target = url.searchParams.get("uddg");
            if (target) {
                return target;
            }
        }
        // Ad redirect.
        if (url.pathname === "/y.js") {
            const clickTarget = url.searchParams.get("u3");
            if (clickTarget) {
                try {
                    const landing = new URL(clickTarget).searchParams.get("ld");
                    return landing || clickTarget;
                }
                catch {
                    // The click URL is not parseable; hand back the redirect itself.
                    return urlStr;
                }
            }
        }
        return urlStr;
    }
    catch {
        // If URL parsing fails, try to extract a URL with a basic string match.
        const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
        if (urlMatch) {
            return urlMatch[0];
        }
        return duckduckgoUrl;
    }
}
|
|
98
|
+
/**
 * Builds the Jina AI reader URL (`https://r.jina.ai/<url>`) for a website URL.
 *
 * @param url - Absolute URL of the page.
 * @returns The reader URL built from the normalised form of `url`, or an
 *          empty string when `url` cannot be parsed as a URL.
 */
function getJinaAiUrl(url) {
    let normalized;
    try {
        normalized = new URL(url).href;
    }
    catch {
        return "";
    }
    return "https://r.jina.ai/" + normalized;
}
|
|
110
|
+
/**
 * Runs a DuckDuckGo search in a stealth Puppeteer browser and scrapes the
 * rendered page.
 *
 * When `options.category === "images"` the image tiles are scraped (up to
 * `options.limit`, default 20); a missing tile selector is tolerated and
 * yields whatever is on the page. Otherwise the regular results are scraped
 * (up to `options.limit`, default 10); that wait for `#links .result` is not
 * guarded, so its rejection propagates to the caller.
 *
 * The page is created inside the `try` so the browser is closed even when
 * page creation itself fails. Creating it before the `try` leaked the
 * browser on such failures.
 *
 * @param query - Search terms; URI-encoded before use.
 * @param options - Scraper options; `proxy`, `category` and `limit` are read.
 * @returns Array of result objects scraped from the page.
 */
async function searchWithPuppeteer(query, options) {
    const proxy = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
    try {
        const page = await browser.newPage();
        // Set realistic viewport and headers
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        if (options.category === "images") {
            const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&iax=images&ia=images`;
            await page.goto(searchUrl, { waitUntil: "networkidle2" });
            // Wait for image results - DDG images are usually loaded in tiles
            try {
                await page.waitForSelector(".tile--img", { timeout: 10000 });
            }
            catch (e) {
                // No tiles within the timeout: continue and scrape what is there.
            }
            // NOTE: this callback is executed inside the page, so it must stay
            // self-contained and cannot reference anything from this module.
            const results = await page.evaluate((limit) => {
                const items = [];
                const elements = document.querySelectorAll(".tile--img");
                for (let i = 0; i < Math.min(elements.length, limit || 20); i++) {
                    const el = elements[i];
                    // Title
                    const titleEl = el.querySelector(".tile__title");
                    const title = titleEl?.textContent || "Image";
                    // Source link
                    const linkEl = el.querySelector("a.tile--img__sub");
                    const url = linkEl?.href || "";
                    // Thumbnail/Image
                    const imgEl = el.querySelector("img.tile--img__img");
                    const imageUrl = imgEl?.src || imgEl?.getAttribute("data-src") || "";
                    if (url && imageUrl) {
                        items.push({
                            title,
                            url,
                            snippet: title,
                            imageUrl: imageUrl,
                            thumbnailUrl: imageUrl,
                            source: "duckduckgo-images",
                        });
                    }
                }
                return items;
            }, options.limit);
            return results;
        }
        const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;
        await page.goto(searchUrl, { waitUntil: "networkidle2" });
        await page.waitForSelector("#links .result", { timeout: 10000 });
        // NOTE: executed inside the page; must stay self-contained (see above).
        const results = await page.evaluate((limit) => {
            const items = [];
            const elements = document.querySelectorAll("#links .result");
            for (let i = 0; i < Math.min(elements.length, limit || 10); i++) {
                const el = elements[i];
                const titleEl = el.querySelector("h2");
                const linkEl = el.querySelector("a");
                const snippetEl = el.querySelector(".result__snippet");
                if (titleEl && linkEl) {
                    items.push({
                        title: titleEl.textContent || "",
                        url: linkEl.href || "",
                        snippet: snippetEl?.textContent || "",
                        source: "duckduckgo",
                    });
                }
            }
            return items;
        }, options.limit);
        return results;
    }
    finally {
        await browser.close();
    }
}
|
|
186
|
+
/**
 * Searches DuckDuckGo for `query`.
 *
 * Flow:
 *  1. Merge `options` over the defaults and look the request up in the cache
 *     (entries are fresh for CACHE_TTL milliseconds).
 *  2. Honour the module-level rate limit.
 *  3. Unless Puppeteer is forced or images are requested (images need
 *     JavaScript), scrape the static HTML endpoint.
 *  4. If that fails or yields nothing, fall back to the Puppeteer scraper.
 *
 * Only non-empty result sets are cached, on both paths. The Puppeteer path
 * used to cache empty sets as well, which pinned a transient "no results"
 * (for example a blocked page) for the whole TTL.
 *
 * @param query - Search terms; URI-encoded before use.
 * @param options - Scraper options merged over the defaults
 *        (`limit` 10, `safeSearch` true, `timeout` 10000, `forcePuppeteer` false, `antiBot`).
 * @returns Array of `{ title, url, snippet, source }` results (image results
 *          additionally carry `imageUrl` / `thumbnailUrl`).
 * @throws A plain object `{ message, code: "DDG_SEARCH_ERROR", originalError }`
 *         when the Puppeteer fallback (or any step outside the HTML attempt) fails.
 */
async function searchDuckDuckGo(query, options = {}) {
    try {
        const mergedOptions = {
            limit: 10,
            safeSearch: true,
            timeout: 10000,
            forcePuppeteer: false,
            antiBot: {
                enabled: true,
                maxRetries: 3,
                retryDelay: 2000,
            },
            ...options,
        };
        const cacheKey = (0, common_1.getCacheKey)(query, mergedOptions);
        const cached = searchCache.get(cacheKey);
        if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
            return cached.results;
        }
        // Caches a result set when it is non-empty, then hands it back.
        const cacheAndReturn = (results) => {
            if (Array.isArray(results) && results.length > 0) {
                searchCache.set(cacheKey, {
                    results,
                    timestamp: Date.now(),
                    source: "duckduckgo",
                });
            }
            return results;
        };
        await enforceRateLimit();
        // Try HTML scraping first unless Puppeteer is forced or we are searching for images
        if (!mergedOptions.forcePuppeteer && mergedOptions.category !== "images") {
            try {
                const isNews = mergedOptions.category === "news";
                let searchUrl = `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
                if (isNews) {
                    searchUrl += "&iar=news&ia=news";
                }
                const { body } = await (0, common_1.fetchWithDetection)(searchUrl, mergedOptions);
                const doc = new jsdom_1.JSDOM(body).window.document;
                const elements = doc.querySelectorAll(".result");
                const limit = mergedOptions.limit || 10;
                const results = [];
                // Index loop so scanning stops as soon as the limit is reached.
                for (let i = 0; i < elements.length && results.length < limit; i++) {
                    const el = elements[i];
                    const titleEl = el.querySelector(".result__title a");
                    if (!titleEl) {
                        continue;
                    }
                    const rawLink = titleEl.getAttribute("href");
                    const title = titleEl.textContent?.trim() || "";
                    if (!rawLink || !title) {
                        continue;
                    }
                    // Result links are DuckDuckGo redirects; unwrap them and
                    // drop anything that does not resolve to an http(s) URL.
                    const url = extractDirectUrl(rawLink);
                    if (!url || !url.startsWith("http")) {
                        continue;
                    }
                    const snippet = el.querySelector(".result__snippet")?.textContent?.trim() || "";
                    results.push({
                        title,
                        url,
                        snippet,
                        source: isNews ? "duckduckgo-news" : "duckduckgo",
                    });
                }
                if (results.length > 0) {
                    return cacheAndReturn(results);
                }
                // Zero results from the HTML version can mean a block or a query the
                // static page cannot answer, so fall through to the Puppeteer scraper.
            }
            catch {
                // Deliberate best-effort: any HTML-path failure (bot protection
                // included) falls through to the Puppeteer scraper below.
            }
        }
        // Use Puppeteer as fallback
        return cacheAndReturn(await searchWithPuppeteer(query, mergedOptions));
    }
    catch (error) {
        throw {
            message: "duckduckgo search failed :/",
            code: "DDG_SEARCH_ERROR",
            originalError: error,
        };
    }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.searchGoogleNews = searchGoogleNews;
|
|
7
|
+
const google_news_scraper_1 = __importDefault(require("google-news-scraper"));
|
|
8
|
+
const common_1 = require("../common");
|
|
9
|
+
// How long a cached news result set stays fresh: 30 minutes.
const CACHE_TTL = 30 * 60 * 1000; // 30 minutes for news
// News result cache, keyed by the string produced from the query and merged options.
const newsCache = new Map();
/**
 * Searches Google News for `query` through the `google-news-scraper` package
 * (which drives Puppeteer internally).
 *
 * The scraper is always queried with the en-US / US locale. Non-empty result
 * sets are cached for CACHE_TTL milliseconds.
 *
 * @param query - Search term handed to the scraper.
 * @param options - Scraper options merged over the defaults
 *        (`limit` 10, `safeSearch` true, `timeout` 10000).
 * @returns Up to `limit` results shaped as
 *          `{ title, url, snippet, source, sourceName, imageUrl, publishedAt }`.
 * @throws A plain object `{ message, code: "GOOGLE_NEWS_SEARCH_ERROR", originalError }` on failure.
 */
async function searchGoogleNews(query, options = {}) {
    try {
        const settings = Object.assign({ limit: 10, safeSearch: true, timeout: 10000 }, options);
        const key = (0, common_1.getCacheKey)(query, settings);
        const hit = newsCache.get(key);
        if (hit && Date.now() - hit.timestamp < CACHE_TTL) {
            return hit.results;
        }
        const scraperRequest = {
            searchTerm: query,
            prettyURLs: true,
            queryVars: {
                hl: "en-US",
                gl: "US",
                ceid: "US:en",
            },
        };
        const articles = await (0, google_news_scraper_1.default)(scraperRequest);
        const results = [];
        for (const article of articles.slice(0, settings.limit)) {
            results.push({
                title: article.title,
                url: article.link,
                snippet: article.subtitle || "",
                source: "google-news",
                sourceName: article.source,
                imageUrl: article.image,
                publishedAt: article.time,
            });
        }
        // Empty result sets are returned but never cached.
        if (results.length > 0) {
            newsCache.set(key, {
                results,
                timestamp: Date.now(),
                source: "google-news",
            });
        }
        return results;
    }
    catch (cause) {
        const failure = {
            message: "google news search failed",
            code: "GOOGLE_NEWS_SEARCH_ERROR",
            originalError: cause,
        };
        throw failure;
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { ScraperOptions, SearchResult } from "../../types";
|
|
2
|
+
/**
 * Extracts the "Answer Box" or "Featured Snippet" from the Google Search DOM.
 *
 * @param doc - Parsed Google results document to inspect.
 * @returns The extracted text, or `undefined`.
 *          NOTE(review): the implementation is not part of this declaration
 *          file; confirm the exact conditions for `undefined` there.
 */
|
|
5
|
+
export declare function extractAnswerBox(doc: Document): string | undefined;
|
|
6
|
+
/**
 * Type declaration for the Google search entry point.
 *
 * @param query - Search terms.
 * @param options - Optional scraper settings.
 * @returns A promise for the list of search results.
 *          NOTE(review): caching, rate limiting and error shape are defined by
 *          the implementation, which is not visible in this declaration file.
 */
export declare function searchGoogle(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
|