llm-search-tools 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.extractAnswerBox = extractAnswerBox;
|
|
4
|
+
exports.searchGoogle = searchGoogle;
|
|
5
|
+
const google_sr_1 = require("google-sr");
|
|
6
|
+
const common_1 = require("../common");
|
|
7
|
+
/**
 * Pulls the text of Google's "Answer Box" / "Featured Snippet" out of a
 * search-results DOM.
 *
 * A fixed list of selectors is probed in priority order; the first element
 * that exists and has non-empty `textContent` wins.
 *
 * @param doc - Document-like object exposing `querySelector`.
 * @returns The trimmed text of the first matching element, or `undefined`
 *          when none of the selectors yields text.
 */
function extractAnswerBox(doc) {
    const answerSelectors = [
        ".hgKElc", // 1. featured snippet (text)
        ".LGOjhe", // 2. featured snippet (list)
        ".Z0LcW", // 3. direct answer (e.g., calculations, dates)
        ".kno-rdesc span", // 4. knowledge panel description
        "div[data-attrid='description']", // 5. dictionary definition
    ];
    for (const selector of answerSelectors) {
        const node = doc.querySelector(selector);
        if (node && node.textContent) {
            return node.textContent.trim();
        }
    }
    return undefined;
}
|
|
38
|
+
// Rate limiting parameters
const GOOGLE_DELAY = 2000; // 2 seconds for Google
let lastGoogleSearchTime = 0; // epoch ms of the latest search slot (already used or reserved)
// Cache for search results
const searchCache = new Map();
const CACHE_TTL = 60 * 60 * 1000; // 1 hour
/**
 * Spaces out Google searches so that consecutive callers start at least
 * `minDelayMs` apart.
 *
 * The next free slot is claimed synchronously, before any `await`. Callers
 * racing in the same tick therefore queue up one slot after another instead
 * of all sleeping for the same duration and then firing together.
 *
 * @param minDelayMs - Minimum gap between searches in milliseconds
 *                     (defaults to GOOGLE_DELAY).
 * @returns Promise that resolves once this caller's slot has arrived.
 */
async function enforceRateLimit(minDelayMs = GOOGLE_DELAY) {
    const now = Date.now();
    const slot = Math.max(now, lastGoogleSearchTime + minDelayMs);
    // Reserve the slot before yielding so the next caller schedules after it.
    lastGoogleSearchTime = slot;
    const waitMs = slot - now;
    if (waitMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
    // Timers may fire late; never record a time earlier than the real start.
    lastGoogleSearchTime = Math.max(lastGoogleSearchTime, Date.now());
}
|
|
53
|
+
/**
 * Runs a Google search in a stealth headless browser and scrapes the result DOM.
 *
 * When `options.category === "images"` the image vertical (`tbm=isch`) is
 * scraped; otherwise the regular web results are. The browser is closed in
 * every case, including when opening the page itself fails.
 *
 * @param query - Raw search terms; URL-encoded here.
 * @param options - Reads `proxy`, `category`, `limit` and `safeSearch`.
 * @returns Array of `{ title, url, snippet, source }` objects; image results
 *          additionally carry `imageUrl` and `thumbnailUrl`.
 * @throws Propagates browser/navigation errors, and the `waitForSelector`
 *         timeout of the web-results path (the image path tolerates it).
 */
async function searchWithPuppeteer(query, options) {
    const proxy = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
    try {
        // Opened inside the try so a failing newPage() still reaches browser.close().
        const page = await browser.newPage();
        // Set realistic viewport
        await page.setViewport({ width: 1920, height: 1080 });
        // Set extra headers
        await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        // Honour the safeSearch option on this path as well.
        const safeParam = options.safeSearch ? "&safe=active" : "";
        const baseUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}${safeParam}`;
        if (options.category === "images") {
            await page.goto(`${baseUrl}&tbm=isch`, { waitUntil: "networkidle2" });
            // Wait for image results
            // Google images usually have div.isv-r or specific containers
            try {
                await page.waitForSelector("div[data-id], div.isv-r", { timeout: 10000 });
            }
            catch (e) {
                // continue, might be empty
            }
            // NOTE: the callback runs in the page context and must stay self-contained.
            return await page.evaluate((limit) => {
                const items = [];
                // Select image containers
                const elements = document.querySelectorAll("div[data-id], div.isv-r");
                for (let i = 0; i < Math.min(elements.length, limit || 20); i++) {
                    const el = elements[i];
                    // Title often in h3 or aria-label
                    const titleEl = el.querySelector("h3") || el.querySelector("[title]");
                    const title = titleEl?.textContent || titleEl?.getAttribute("title") || "Image";
                    // Source link (page containing image)
                    const linkEl = el.querySelector("a");
                    const url = linkEl?.href || "";
                    // Thumbnail
                    const imgEl = el.querySelector("img");
                    const thumbnailUrl = imgEl?.src || imgEl?.getAttribute("data-src") || "";
                    // The full-size image URL is hard to get without clicking, so the
                    // thumbnail doubles as imageUrl.
                    if (url && thumbnailUrl) {
                        items.push({
                            title,
                            url, // This is the source page URL
                            snippet: title,
                            imageUrl: thumbnailUrl,
                            thumbnailUrl,
                            source: "google-images",
                        });
                    }
                }
                return items;
            }, options.limit);
        }
        // Default Web Search
        await page.goto(baseUrl, { waitUntil: "networkidle2" });
        // Wait for results (a timeout here is an error, unlike the image path)
        await page.waitForSelector("div.g", { timeout: 10000 });
        // Extract results
        return await page.evaluate((limit) => {
            const items = [];
            const elements = document.querySelectorAll("div.g");
            for (let i = 0; i < Math.min(elements.length, limit || 10); i++) {
                const el = elements[i];
                const titleEl = el.querySelector("h3");
                const linkEl = el.querySelector("a");
                const snippetEl = el.querySelector(".VwiC3b");
                if (titleEl && linkEl) {
                    items.push({
                        title: titleEl.textContent || "",
                        url: linkEl.href || "",
                        snippet: snippetEl?.textContent || "",
                        source: "google",
                    });
                }
            }
            return items;
        }, options.limit);
    }
    finally {
        await browser.close();
    }
}
|
|
139
|
+
/**
 * Searches Google, preferring a lightweight fetch + `google-sr` parse and
 * falling back to a Puppeteer scrape.
 *
 * Flow: merge defaults, return a cache hit younger than CACHE_TTL, wait for
 * the rate limiter, then take the fetch path unless `forcePuppeteer` is set
 * or `category` is "images". If the fetch path reports
 * "Bot protection detected" and `antiBot.enabled` is truthy, the Puppeteer
 * path is used instead. Successful results are cached.
 *
 * @param query - Search terms.
 * @param options - Optional overrides for `limit`, `safeSearch`, `timeout`,
 *                  `forcePuppeteer`, `antiBot`, `category`, `proxy`.
 * @returns Array of `{ title, url, snippet, source }` results.
 * @throws A plain object `{ message, code: "GOOGLE_SEARCH_ERROR", originalError }`
 *         wrapping whatever failed.
 */
async function searchGoogle(query, options = {}) {
    try {
        // Clone and merge options
        const antiBotDefaults = {
            enabled: true,
            maxRetries: 3,
            retryDelay: 2000,
        };
        const mergedOptions = {
            limit: 10,
            safeSearch: true,
            timeout: 10000,
            forcePuppeteer: false,
            ...options,
            // Merge field by field: a partial override such as { maxRetries: 5 }
            // must not silently drop `enabled` and disable the Puppeteer fallback.
            antiBot: { ...antiBotDefaults, ...options?.antiBot },
        };
        const cacheKey = (0, common_1.getCacheKey)(query, mergedOptions);
        const cached = searchCache.get(cacheKey);
        if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
            return cached.results;
        }
        await enforceRateLimit();
        // Try basic fetch first unless Puppeteer is forced or we are searching for images
        if (!mergedOptions.forcePuppeteer && mergedOptions.category !== "images") {
            try {
                const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
                // Probe request: only its bot-detection outcome is used.
                await (0, common_1.fetchWithDetection)(searchUrl, mergedOptions);
                // If no bot detection, use library
                const results = await (0, google_sr_1.search)({
                    query,
                    parsers: [google_sr_1.OrganicResult],
                    noPartialResults: true,
                    // Only request safe search when the caller has not turned it off.
                    requestConfig: { queryParams: mergedOptions.safeSearch ? { safe: "active" } : {} },
                });
                // Apply the requested limit on this path too.
                const formattedResults = results.slice(0, mergedOptions.limit).map((r) => ({
                    title: r.title || "",
                    url: r.link || "",
                    snippet: r.description || "",
                    source: "google",
                }));
                searchCache.set(cacheKey, {
                    results: formattedResults,
                    timestamp: Date.now(),
                    source: "google",
                });
                return formattedResults;
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                const canFallBack = errorMessage === "Bot protection detected" && mergedOptions.antiBot?.enabled;
                if (!canFallBack) {
                    throw error;
                }
                // Silent fallback to Puppeteer below
            }
        }
        // Use Puppeteer as fallback
        const results = await searchWithPuppeteer(query, mergedOptions);
        searchCache.set(cacheKey, {
            results,
            timestamp: Date.now(),
            source: "google",
        });
        return results;
    }
    catch (error) {
        throw {
            message: "google search failed :(",
            code: "GOOGLE_SEARCH_ERROR",
            originalError: error,
        };
    }
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.searchSearxNG = searchSearxNG;
|
|
4
|
+
const common_1 = require("../common");
|
|
5
|
+
// Default public SearxNG instances
const DEFAULT_INSTANCES = [
    "https://searx.be",
    "https://searx.space",
    "https://search.mdosch.de",
    "https://searx.work",
    "https://searx.fmac.xyz",
    "https://northboot.xyz",
];
// Cache for search results
const searchCache = new Map();
const CACHE_TTL = 60 * 60 * 1000; // 1 hour
/** True when a raw SearxNG entry carries an image URL or links straight to an image file. */
function isSearxImageResult(r) {
    return Boolean(r.img_src ||
        r.thumbnail_src ||
        (typeof r.url === "string" && /\.(jpeg|jpg|gif|png)$/i.test(r.url)));
}
/** Maps a raw SearxNG image entry onto the package's image result shape. */
function toSearxImageResult(r) {
    return {
        title: r.title,
        url: r.url,
        snippet: r.content || r.title,
        imageUrl: r.img_src || r.url,
        thumbnailUrl: r.thumbnail_src || r.thumbnail || r.img_src || r.url,
        source: "searxng-images",
    };
}
/** Maps a raw SearxNG web entry onto the package's search result shape. */
function toSearxWebResult(r) {
    return {
        title: r.title,
        url: r.url,
        snippet: r.content || "",
        source: "searxng",
    };
}
/**
 * Queries a SearxNG instance through its JSON endpoint.
 *
 * Uses `options.searxngInstance` when given, otherwise a random entry of
 * DEFAULT_INSTANCES. Results are cached per query/options for CACHE_TTL.
 *
 * @param query - Search terms.
 * @param options - Optional overrides for `limit`, `safeSearch`, `timeout`,
 *                  `searxngInstance`, `category` ("images" switches to image results).
 * @returns Up to `limit` results; image results additionally carry
 *          `imageUrl` and `thumbnailUrl`.
 * @throws A plain object `{ message, code: "SEARXNG_SEARCH_ERROR", originalError }`.
 */
async function searchSearxNG(query, options = {}) {
    try {
        const mergedOptions = {
            limit: 10,
            safeSearch: true,
            timeout: 10000,
            ...options,
        };
        const cacheKey = (0, common_1.getCacheKey)(query, mergedOptions);
        const cached = searchCache.get(cacheKey);
        if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
            return cached.results;
        }
        // Use provided instance or pick a random default
        const instance = mergedOptions.searxngInstance || DEFAULT_INSTANCES[Math.floor(Math.random() * DEFAULT_INSTANCES.length)];
        // Construct URL with JSON format. Trailing slashes are stripped so a
        // configured "https://host/" does not become "https://host//search".
        const searchUrl = new URL(`${String(instance).replace(/\/+$/, "")}/search`);
        searchUrl.searchParams.append("q", query);
        searchUrl.searchParams.append("format", "json");
        searchUrl.searchParams.append("safesearch", mergedOptions.safeSearch ? "1" : "0");
        const wantsImages = mergedOptions.category === "images";
        if (wantsImages) {
            searchUrl.searchParams.append("categories", "images");
        }
        try {
            const { body } = await (0, common_1.fetchWithDetection)(searchUrl.toString(), mergedOptions);
            const data = JSON.parse(body);
            if (!data.results || !Array.isArray(data.results)) {
                throw new Error("Invalid response format from SearxNG");
            }
            // Filter BEFORE slicing so non-image entries do not eat into the limit.
            const results = wantsImages
                ? data.results.filter(isSearxImageResult).slice(0, mergedOptions.limit).map(toSearxImageResult)
                : data.results.slice(0, mergedOptions.limit).map(toSearxWebResult);
            searchCache.set(cacheKey, {
                results,
                timestamp: Date.now(),
                source: "searxng",
            });
            return results;
        }
        catch (error) {
            // If specific error, rethrow
            if (error instanceof Error && error.message.includes("Bot protection")) {
                throw error;
            }
            throw new Error(`Failed to fetch from SearxNG instance ${instance}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    catch (error) {
        throw {
            message: "searxng search failed",
            code: "SEARXNG_SEARCH_ERROR",
            originalError: error,
        };
    }
}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { MediaResult, MediaSearchOptions } from "../../types";
|
|
2
|
+
/**
 * Searches thetvdb.com for `query`, trying a plain fetch first and a
 * Puppeteer scrape as fallback (or directly when `forcePuppeteer` is set).
 *
 * @throws A plain object `{ message, code: "THETVDB_SEARCH_ERROR", originalError }`.
 */
export declare function searchTheTVDB(query: string, options?: MediaSearchOptions): Promise<MediaResult[]>;
|
|
3
|
+
/**
 * Fetches a TheTVDB detail page and extracts its genres.
 * Best effort: resolves to `{}` on any failure instead of rejecting.
 */
export declare function getTheTVDBDetails(url: string, options?: MediaSearchOptions): Promise<Partial<MediaResult>>;
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.searchTheTVDB = searchTheTVDB;
|
|
4
|
+
exports.getTheTVDBDetails = getTheTVDBDetails;
|
|
5
|
+
const common_1 = require("../common");
|
|
6
|
+
const jsdom_1 = require("jsdom");
|
|
7
|
+
// Rate limiting
const MIN_DELAY = 1500; // minimum gap between TheTVDB searches, in milliseconds
let lastSearchTime = 0; // epoch ms of the latest search slot (already used or reserved)
/**
 * Spaces out TheTVDB searches so that consecutive callers start at least
 * `minDelayMs` apart.
 *
 * The slot is reserved synchronously, before any `await`, so concurrent
 * callers line up one after another rather than all waking at the same time.
 *
 * @param minDelayMs - Minimum gap in milliseconds (defaults to MIN_DELAY).
 * @returns Promise that resolves once this caller's slot has arrived.
 */
async function enforceRateLimit(minDelayMs = MIN_DELAY) {
    const now = Date.now();
    const slot = Math.max(now, lastSearchTime + minDelayMs);
    // Claim the slot before yielding so the next caller schedules after it.
    lastSearchTime = slot;
    const waitMs = slot - now;
    if (waitMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
    // Timers may fire late; never record a time earlier than the real start.
    lastSearchTime = Math.max(lastSearchTime, Date.now());
}
|
|
18
|
+
/**
 * Searches thetvdb.com for `query`.
 *
 * Waits for the rate limiter, then tries a plain fetch unless
 * `options.forcePuppeteer` is set. Any failure of the fetch attempt falls
 * through to the Puppeteer scrape.
 *
 * @param query - Search terms; URL-encoded here.
 * @param options - Reads `forcePuppeteer`; passed on to the scrapers.
 * @returns Parsed media results.
 * @throws A plain object `{ message, code: "THETVDB_SEARCH_ERROR", originalError }`.
 */
async function searchTheTVDB(query, options = {}) {
    try {
        await enforceRateLimit();
        const mergedOptions = { ...options };
        const searchUrl = `https://thetvdb.com/search?query=${encodeURIComponent(query)}`;
        // Try basic fetch first
        if (!mergedOptions.forcePuppeteer) {
            try {
                return await scrapeTheTVDBWithFetch(searchUrl, mergedOptions);
            }
            catch (e) {
                // Fetch attempt failed; fall through to Puppeteer. (The former
                // `forcePuppeteer` rethrow here could never be reached.)
            }
        }
        // Fallback to Puppeteer
        return await scrapeTheTVDBWithPuppeteer(searchUrl, mergedOptions);
    }
    catch (error) {
        throw {
            message: "TheTVDB search failed",
            code: "THETVDB_SEARCH_ERROR",
            originalError: error,
        };
    }
}
|
|
44
|
+
/**
 * Downloads the search page with `fetchWithDetection` and parses the
 * returned HTML through JSDOM.
 *
 * @param url - Fully built TheTVDB search URL.
 * @param options - Passed through to `fetchWithDetection`.
 * @returns Parsed media results.
 */
async function scrapeTheTVDBWithFetch(url, options) {
    const response = await (0, common_1.fetchWithDetection)(url, options);
    const { document } = new jsdom_1.JSDOM(response.body).window;
    return parseTheTVDBResults(document);
}
|
|
49
|
+
/**
 * Loads the search page in a stealth browser, then parses the rendered HTML
 * through JSDOM.
 *
 * @param url - Fully built TheTVDB search URL.
 * @param options - Reads `proxy`.
 * @returns Parsed media results, or `[]` when no result list shows up within
 *          5 seconds. The browser is closed in every case.
 */
async function scrapeTheTVDBWithPuppeteer(url, options) {
    const proxyConfig = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxyConfig || undefined);
    try {
        const tab = await browser.newPage();
        await tab.setViewport({ width: 1920, height: 1080 });
        await tab.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        await tab.goto(url, { waitUntil: "networkidle2" });
        // Wait for results list; a timeout is treated as "no results".
        const listAppeared = await tab
            .waitForSelector(".list-group, .media-list", { timeout: 5000 })
            .then(() => true, () => false);
        if (!listAppeared) {
            return [];
        }
        const renderedHtml = await tab.content();
        const { document } = new jsdom_1.JSDOM(renderedHtml).window;
        return parseTheTVDBResults(document);
    }
    finally {
        await browser.close();
    }
}
|
|
72
|
+
/**
 * Extracts up to 10 media results from a TheTVDB search-results document.
 *
 * Each `.list-group-item` / `li.media` element that has both a heading and a
 * link becomes one result. Non-absolute links are resolved against
 * https://thetvdb.com. `mediaType` defaults to "tv", becomes "movie" for
 * `/movies/` links, and is finally overridden by a "movie"/"series" badge.
 *
 * @param doc - Document-like object exposing `querySelectorAll`.
 * @returns Array of `{ title, url, description, releaseDate, posterUrl, source, mediaType }`.
 */
function parseTheTVDBResults(doc) {
    const BASE_URL = "https://thetvdb.com";
    const MAX_RESULTS = 10;
    const results = [];
    // TheTVDB search results usually look like list items
    for (const item of doc.querySelectorAll(".list-group-item, li.media")) {
        if (results.length >= MAX_RESULTS) {
            break; // stop scanning once the cap is reached
        }
        const titleEl = item.querySelector("h4, .media-heading");
        const linkEl = item.querySelector("a");
        if (!titleEl || !linkEl) {
            continue;
        }
        const imgEl = item.querySelector("img");
        const overviewEl = item.querySelector("p, .overview");
        const smallText = item.querySelector("small"); // Often contains date or network
        let href = linkEl.getAttribute("href") || "";
        if (href && !href.startsWith("http")) {
            // Resolve properly: plain concatenation breaks "series/x" (no leading
            // slash) and protocol-relative "//host/x" links.
            try {
                href = new URL(href, BASE_URL).href;
            }
            catch (e) {
                href = `${BASE_URL}${href}`;
            }
        }
        let posterUrl = imgEl?.getAttribute("src") || undefined;
        // Filter out placeholder images if possible
        if (posterUrl?.includes("missing")) {
            posterUrl = undefined;
        }
        // Detect type
        let mediaType = href.includes("/movies/") ? "movie" : "tv"; // "tv" is the default for TheTVDB
        // Often TheTVDB lists translations for "Series" or "Movie" in badges
        for (const badge of item.querySelectorAll(".badge")) {
            // Trim first: badge markup commonly carries surrounding whitespace.
            const label = badge.textContent?.trim().toLowerCase();
            if (label === "movie") {
                mediaType = "movie";
            }
            else if (label === "series") {
                mediaType = "tv";
            }
        }
        results.push({
            title: titleEl.textContent?.trim() || "",
            url: href,
            description: overviewEl?.textContent?.trim(),
            releaseDate: smallText?.textContent?.trim(),
            posterUrl,
            source: "thetvdb",
            mediaType,
        });
    }
    return results;
}
|
|
122
|
+
/**
 * Fetches a TheTVDB detail page and collects the text of every link whose
 * href contains "/genres/".
 *
 * Best effort: any failure (network, parsing) yields `{}` rather than an error.
 * Rating extraction is not implemented.
 *
 * @param url - Detail page URL.
 * @param options - Passed through to `fetchWithDetection`.
 * @returns `{ genres }` where `genres` is `undefined` when none were found,
 *          or `{}` on failure.
 */
async function getTheTVDBDetails(url, options = {}) {
    try {
        const response = await (0, common_1.fetchWithDetection)(url, options);
        const page = new jsdom_1.JSDOM(response.body).window.document;
        // Look for genre links (typically in a sidebar or info block)
        const genreNames = [];
        for (const anchor of page.querySelectorAll("a[href*='/genres/']")) {
            if (anchor.textContent) {
                genreNames.push(anchor.textContent.trim());
            }
        }
        if (genreNames.length > 0) {
            return { genres: genreNames };
        }
        return { genres: undefined };
    }
    catch (e) {
        return {};
    }
}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { MediaResult, MediaSearchOptions } from "../../types";
|
|
2
|
+
/**
 * Searches themoviedb.org for `query`; `options.type` of "movie" or "tv"
 * narrows the search to that vertical. Tries a plain fetch first and a
 * Puppeteer scrape as fallback (or directly when `forcePuppeteer` is set).
 *
 * @throws A plain object `{ message, code: "TMDB_SEARCH_ERROR", originalError }`.
 */
export declare function searchTMDB(query: string, options?: MediaSearchOptions): Promise<MediaResult[]>;
|
|
3
|
+
/**
 * Fetches a TMDB detail page and extracts rating, genres, cast and any
 * visible watch providers.
 * Best effort: resolves to `{}` on any failure instead of rejecting.
 */
export declare function getTMDBDetails(url: string, options?: MediaSearchOptions): Promise<Partial<MediaResult>>;
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.searchTMDB = searchTMDB;
|
|
4
|
+
exports.getTMDBDetails = getTMDBDetails;
|
|
5
|
+
const common_1 = require("../common");
|
|
6
|
+
const jsdom_1 = require("jsdom");
|
|
7
|
+
// Rate limiting
const MIN_DELAY = 1000; // minimum gap between TMDB searches, in milliseconds
let lastSearchTime = 0; // epoch ms of the latest search slot (already used or reserved)
/**
 * Spaces out TMDB searches so that consecutive callers start at least
 * `minDelayMs` apart.
 *
 * The slot is reserved synchronously, before any `await`, so concurrent
 * callers line up one after another rather than all waking at the same time.
 *
 * @param minDelayMs - Minimum gap in milliseconds (defaults to MIN_DELAY).
 * @returns Promise that resolves once this caller's slot has arrived.
 */
async function enforceRateLimit(minDelayMs = MIN_DELAY) {
    const now = Date.now();
    const slot = Math.max(now, lastSearchTime + minDelayMs);
    // Claim the slot before yielding so the next caller schedules after it.
    lastSearchTime = slot;
    const waitMs = slot - now;
    if (waitMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
    // Timers may fire late; never record a time earlier than the real start.
    lastSearchTime = Math.max(lastSearchTime, Date.now());
}
|
|
18
|
+
/**
 * Searches themoviedb.org for `query`.
 *
 * `options.type` of "movie" or "tv" selects the matching search vertical;
 * anything else uses the combined search. After the rate limiter, a plain
 * fetch is tried unless `options.forcePuppeteer` is set; any failure of the
 * fetch attempt falls through to the Puppeteer scrape.
 *
 * @param query - Search terms; URL-encoded here.
 * @param options - Reads `type` and `forcePuppeteer`; passed on to the scrapers.
 * @returns Parsed media results.
 * @throws A plain object `{ message, code: "TMDB_SEARCH_ERROR", originalError }`.
 */
async function searchTMDB(query, options = {}) {
    try {
        await enforceRateLimit();
        const mergedOptions = { ...options };
        // Determine search type
        const encodedQuery = encodeURIComponent(query);
        let searchUrl = `https://www.themoviedb.org/search?query=${encodedQuery}`;
        if (mergedOptions.type === "movie") {
            searchUrl = `https://www.themoviedb.org/search/movie?query=${encodedQuery}`;
        }
        else if (mergedOptions.type === "tv") {
            searchUrl = `https://www.themoviedb.org/search/tv?query=${encodedQuery}`;
        }
        // First try basic fetch
        if (!mergedOptions.forcePuppeteer) {
            try {
                return await scrapeTMDBWithFetch(searchUrl, mergedOptions);
            }
            catch (e) {
                // Fetch attempt failed; fall through to Puppeteer. (The former
                // `forcePuppeteer` rethrow here could never be reached.)
            }
        }
        // Fallback to Puppeteer
        return await scrapeTMDBWithPuppeteer(searchUrl, mergedOptions);
    }
    catch (error) {
        throw {
            message: "TMDB search failed",
            code: "TMDB_SEARCH_ERROR",
            originalError: error,
        };
    }
}
|
|
52
|
+
/**
 * Downloads the search page with `fetchWithDetection` and parses the
 * returned HTML through JSDOM.
 *
 * @param url - Fully built TMDB search URL.
 * @param options - Passed through to `fetchWithDetection`.
 * @returns Parsed media results.
 */
async function scrapeTMDBWithFetch(url, options) {
    const response = await (0, common_1.fetchWithDetection)(url, options);
    const { document } = new jsdom_1.JSDOM(response.body).window;
    return parseTMDBResults(document);
}
|
|
58
|
+
/**
 * Loads the search page in a stealth browser, then parses the rendered HTML
 * through JSDOM.
 *
 * @param url - Fully built TMDB search URL.
 * @param options - Reads `proxy`.
 * @returns Parsed media results, or `[]` when no `.card` element shows up
 *          within 5 seconds. The browser is closed in every case.
 */
async function scrapeTMDBWithPuppeteer(url, options) {
    const proxyConfig = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxyConfig || undefined);
    try {
        const tab = await browser.newPage();
        await tab.setViewport({ width: 1920, height: 1080 });
        await tab.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        await tab.goto(url, { waitUntil: "networkidle2" });
        // Wait for results; no results found or timeout both mean "empty".
        const cardsAppeared = await tab
            .waitForSelector(".card", { timeout: 5000 })
            .then(() => true, () => false);
        if (!cardsAppeared) {
            return [];
        }
        const renderedHtml = await tab.content();
        const { document } = new jsdom_1.JSDOM(renderedHtml).window;
        return parseTMDBResults(document);
    }
    finally {
        await browser.close();
    }
}
|
|
82
|
+
/**
 * Extracts up to 10 media results from a TMDB search-results document.
 *
 * Every `.card` element that has both an `h2` title and an `a.result` link
 * becomes one result. Link and poster values that do not start with "http"
 * are prefixed with the TMDB origin. `mediaType` is "tv" when the URL
 * contains "/tv/", otherwise "movie".
 *
 * @param doc - Document-like object exposing `querySelectorAll`.
 * @returns Array of `{ title, url, description, releaseDate, posterUrl, source, mediaType }`.
 */
function parseTMDBResults(doc) {
    const TMDB_ORIGIN = "https://www.themoviedb.org";
    const absolutize = (value) => (value.startsWith("http") ? value : `${TMDB_ORIGIN}${value}`);
    const parsed = [];
    for (const card of doc.querySelectorAll(".card")) {
        if (parsed.length >= 10) {
            break;
        }
        const heading = card.querySelector("h2");
        const anchor = card.querySelector("a.result"); // Usually the image link or title link
        if (!heading || !anchor) {
            continue;
        }
        const dateNode = card.querySelector(".date");
        const overviewNode = card.querySelector(".overview");
        const image = card.querySelector("img");
        const url = absolutize(anchor.href);
        const rawPoster = image?.src || image?.getAttribute("data-src") || undefined;
        // Basic info only; details come from the detail page.
        parsed.push({
            title: heading.textContent?.trim() || "",
            url,
            description: overviewNode?.textContent?.trim(),
            releaseDate: dateNode?.textContent?.trim(),
            posterUrl: rawPoster ? absolutize(rawPoster) : undefined,
            source: "tmdb",
            // Determine type from URL if possible; "movie" is the default
            mediaType: url.includes("/tv/") ? "tv" : "movie",
        });
    }
    return parsed;
}
|
|
122
|
+
/**
 * Visits a TMDB detail page and extracts rating, genres, cast and watch
 * providers. Kept separate from the search so the extra request is only
 * made when details for a specific result are wanted.
 *
 * Best effort: any failure (network, parsing) yields `{}` rather than an error.
 *
 * @param url - Detail page URL.
 * @param options - Passed through to `fetchWithDetection`.
 * @returns `{ rating, genres, cast, watchProviders }`; `rating` is
 *          "<data-percent>%" or `undefined`, `watchProviders` is `undefined`
 *          when none are visible. `{}` on failure.
 */
async function getTMDBDetails(url, options = {}) {
    try {
        const response = await (0, common_1.fetchWithDetection)(url, options);
        const page = new jsdom_1.JSDOM(response.body).window.document;
        // Trimmed text of every element matching `selector` that has text.
        const collectText = (selector) => {
            const texts = [];
            for (const node of page.querySelectorAll(selector)) {
                if (node.textContent) {
                    texts.push(node.textContent.trim());
                }
            }
            return texts;
        };
        // Rating
        const percent = page.querySelector(".user_score_chart")?.getAttribute("data-percent");
        const rating = percent ? `${percent}%` : undefined;
        // Genres
        const genres = collectText(".genres a");
        // Cast
        const cast = collectText(".people.scroller li.card p a");
        // Watch providers are often loaded dynamically; for basic scraping only
        // provider logos already present in the HTML are picked up.
        const watchProviders = [];
        for (const section of page.querySelectorAll(".provider")) {
            const providerName = section.querySelector("img")?.getAttribute("alt") || "";
            if (providerName) {
                // Guessing the type needs more context, defaulting to stream
                watchProviders.push({ name: providerName, type: "stream" });
            }
        }
        return {
            rating,
            genres,
            cast,
            watchProviders: watchProviders.length > 0 ? watchProviders : undefined,
        };
    }
    catch (e) {
        return {};
    }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.getStockQuote = getStockQuote;
|
|
7
|
+
const yahoo_finance2_1 = __importDefault(require("yahoo-finance2"));
|
|
8
|
+
/**
 * Looks up a quote through `yahoo-finance2` and returns a fixed subset of
 * its fields, tagged with `source: "yahoo-finance"`.
 *
 * @param symbol - Ticker symbol handed to `yahoo-finance2`'s `quote`.
 * @returns Object with symbol, names, regular-market price/change/time,
 *          currency, exchange, marketState and source.
 * @throws A plain object `{ message, code: "FINANCE_QUOTE_ERROR", originalError }`
 *         for any failure, including a missing quote.
 */
async function getStockQuote(symbol) {
    try {
        const quote = await yahoo_finance2_1.default.quote(symbol);
        const { symbol: ticker, shortName, longName, regularMarketPrice, regularMarketChange, regularMarketChangePercent, regularMarketTime, currency, exchange, marketState, } = quote;
        return {
            symbol: ticker,
            shortName,
            longName,
            regularMarketPrice,
            regularMarketChange,
            regularMarketChangePercent,
            regularMarketTime,
            currency,
            exchange,
            marketState,
            source: "yahoo-finance",
        };
    }
    catch (error) {
        throw {
            message: `Failed to fetch quote for symbol: ${symbol}`,
            code: "FINANCE_QUOTE_ERROR",
            originalError: error,
        };
    }
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { ScraperOptions, SearchResult } from "../types";
|
|
2
|
+
export { searchGoogle } from "./scrapers/google";
|
|
3
|
+
export { searchDuckDuckGo } from "./scrapers/duckduckgo";
|
|
4
|
+
export { searchSearxNG } from "./scrapers/searxng";
|
|
5
|
+
/**
 * Generic search entry point resolving to a list of search results.
 * NOTE(review): the implementation is not visible here; presumably it
 * dispatches to the engine scrapers re-exported above. Confirm against search.js.
 */
export declare function search(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
|