@fettstorch/clai 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scraper.ts CHANGED
@@ -1,4 +1,31 @@
-import * as Cheerio from 'cheerio';
+import * as Cheerio from "cheerio";
+
+/**
+ * ANTI-SCRAPING DETECTION STRATEGY
+ *
+ * This scraper uses several techniques to avoid being detected as a bot:
+ *
+ * 1. BROWSER MIMICRY:
+ *    - Complete HTTP headers that match real browsers
+ *    - Randomized but realistic User-Agent strings
+ *    - Proper Sec-Fetch metadata and client hints
+ *
+ * 2. SEARCH ENGINE DIVERSITY:
+ *    - Try SearX instances first (scraper-friendly)
+ *    - Fallback to Google with careful HTML parsing
+ *    - DuckDuckGo API as secondary fallback
+ *    - Emergency constructed URLs as last resort
+ *
+ * 3. RESPECTFUL BEHAVIOR:
+ *    - Single request per user interaction (no rapid-fire requests)
+ *    - Proper error handling without retries that could trigger rate limits
+ *    - Clean fallback chain that doesn't hammer failed services
+ *
+ * MAINTENANCE NOTES:
+ *    - Update User-Agent strings every few months
+ *    - Monitor SearX instance availability
+ *    - Watch for changes in Google's HTML structure
+ */
 
 export interface ScrapedData {
   title: string;
@@ -9,11 +36,11 @@ export interface ScrapedData {
 export async function scrape(input: string): Promise<ScrapedData[]> {
   try {
     let urls: string[];
-
+
     if (isValidUrl(input)) {
       urls = [normalizeUrl(input)];
     } else {
-      urls = await getGoogleResults(input);
+      urls = await getSearchResults(input);
     }
 
     // Fetch all URLs in parallel
@@ -33,8 +60,8 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
     // Filter out failed scrapes
     return results.filter((result): result is ScrapedData => result !== null);
   } catch (error) {
-    console.error('Error during scraping:', error);
-    throw error;
+    // If search engines fail, return empty array to trigger OpenAI fallback
+    return [];
   }
 }
 
@@ -42,66 +69,347 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
 
 function isValidUrl(input: string): boolean {
   // Check for whitespace
-  if (input.includes(' ')) return false;
-
+  if (input.includes(" ")) return false;
+
   // Check for common TLDs using regex
   const tldPattern = /^[^\s]+\.[a-z]{2,}$/i;
   return tldPattern.test(input);
 }
 
 function normalizeUrl(url: string): string {
-  if (!url.startsWith('http://') && !url.startsWith('https://')) {
+  if (!url.startsWith("http://") && !url.startsWith("https://")) {
     return `https://${url}`;
   }
   return url;
 }
 
+async function getSearchResults(query: string): Promise<string[]> {
+  const searchEngines = [
+    { name: "SearX", fn: getSearXResults },
+    { name: "Google", fn: getGoogleResults },
+    { name: "DuckDuckGo", fn: getDuckDuckGoResults },
+    { name: "Wikipedia", fn: getWikipediaResults },
+  ];
+
+  for (const engine of searchEngines) {
+    try {
+      const result = await engine.fn(query);
+      console.log(`[${engine.name}]::✅`);
+      return result;
+    } catch (_) {
+      console.log(`[${engine.name}]::❌`);
+    }
+  }
+
+  console.log("All search engines failed - no URLs to scrape");
+  throw new Error("No search results available");
+}
+
+async function getSearXResults(query: string): Promise<string[]> {
+  // Keep a minimal list since most SearX instances block automation
+  const searxInstances = ["https://searx.be", "https://search.sapti.me"];
+
+  // Try instances until one works
+  for (const instance of searxInstances) {
+    try {
+      const searchUrl = `${instance}/search?q=${encodeURIComponent(
+        query
+      )}&format=json&categories=general`;
+
+      // Use enhancedFetch with JSON Accept header for API requests
+      // This makes the request look like a legitimate AJAX call
+      const response = await enhancedFetch(searchUrl, {
+        headers: {
+          Accept: "application/json",
+        },
+      });
+
+      if (!response.ok) {
+        // Likely blocked - continue silently to next instance
+        continue;
+      }
+
+      const data = await response.json();
+      const urls: string[] = [];
+
+      if (data.results && Array.isArray(data.results)) {
+        for (const result of data.results.slice(0, 5)) {
+          if (
+            result.url &&
+            (result.url.startsWith("http://") ||
+              result.url.startsWith("https://")) &&
+            !result.url.includes("wikipedia.org") && // Skip Wikipedia for diversity
+            !urls.includes(result.url)
+          ) {
+            urls.push(result.url);
+          }
+        }
+      }
+
+      if (urls.length > 0) {
+        return urls.slice(0, 3); // Limit to 3 results
+      }
+    } catch (error) {
+      // Continue to next instance silently
+    }
+  }
+
+  throw new Error("All SearX instances failed");
+}
+
+async function getWikipediaResults(query: string): Promise<string[]> {
+  // Wikipedia's OpenSearch API - designed for automation and doesn't block
+  const searchUrl = `https://en.wikipedia.org/w/api.php?action=opensearch&search=${encodeURIComponent(
+    query
+  )}&limit=3&format=json&origin=*`;
+
+  const response = await enhancedFetch(searchUrl, {
+    headers: {
+      Accept: "application/json",
+    },
+  });
+
+  if (!response.ok) {
+    throw new Error(`Wikipedia API error: ${response.status}`);
+  }
+
+  const data = await response.json();
+
+  // Wikipedia OpenSearch returns [query, titles, descriptions, urls]
+  if (Array.isArray(data) && data.length >= 4 && Array.isArray(data[3])) {
+    const urls = data[3]?.filter((url: string) => url?.startsWith("https://"));
+
+    if (urls?.length > 0) {
+      return urls;
+    }
+  }
+
+  throw new Error("No Wikipedia results found");
+}
+
 async function getGoogleResults(query: string): Promise<string[]> {
-  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
+  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
+    query
+  )}&num=10`;
+
+  // Fetch Google search page using enhanced headers to avoid bot detection
   const html = await fetchHtml(searchUrl);
-
-  const urlPattern = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
-  const urls = html.match(urlPattern) || [];
-
-  const queryWords = query
-    .toLowerCase()
-    .split(/\s+/)
-    .filter(word => word.length > 2);
-
-  const filteredUrls = new Set(
-    urls.filter(url => {
-      const urlLower = url.toLowerCase();
-      return !urlLower.includes('www.google') &&
-        !urlLower.includes('gstatic.com') &&
-        !urlLower.includes('googleapis.com') &&
-        !urlLower.includes('googleadservices') &&
-        queryWords.some(word => urlLower.includes(word));
-    })
-  );
-
-  const results = [...filteredUrls].slice(0, 3);
-
-  if (results.length === 0) {
-    throw new Error('No search results found');
+
+  // Check if Google is blocking us
+  if (
+    html.includes("If you're having trouble accessing Google Search") ||
+    html.includes("unusual traffic from your computer network")
+  ) {
+    throw new Error("Google blocked request - detected as bot");
   }
-
-  return results;
+
+  const cheerioDoc = Cheerio.load(html);
+  const urls: string[] = [];
+
+  // Google search result links are in <a> tags with href starting with /url?q=
+  cheerioDoc('a[href^="/url?q="]').each((_, element) => {
+    const href = cheerioDoc(element).attr("href");
+    if (href) {
+      // Extract the actual URL from Google's redirect format: /url?q=ACTUAL_URL&sa=...
+      const urlMatch = href.match(/\/url\?q=([^&]+)/);
+      if (urlMatch) {
+        try {
+          const decodedUrl = decodeURIComponent(urlMatch[1]);
+          // Filter out Google's own URLs and other unwanted domains
+          if (
+            !decodedUrl.includes("google.com") &&
+            !decodedUrl.includes("youtube.com") &&
+            (decodedUrl.startsWith("http://") ||
+              decodedUrl.startsWith("https://"))
+          ) {
+            urls.push(decodedUrl);
+          }
+        } catch (error) {
+          // Skip malformed URLs
+        }
+      }
+    }
+  });
+
+  // Also try direct links (sometimes Google shows direct links)
+  cheerioDoc('a[href^="http"]').each((_, element) => {
+    const href = cheerioDoc(element).attr("href");
+    if (
+      href &&
+      !href.includes("google.com") &&
+      !href.includes("youtube.com") &&
+      !urls.includes(href)
+    ) {
+      urls.push(href);
+    }
+  });
+
+  // Remove duplicates and limit to first 3 results
+  const uniqueUrls = [...new Set(urls)].slice(0, 3);
+
+  if (uniqueUrls.length === 0) {
+    throw new Error("No search results found in Google response");
+  }
+
+  return uniqueUrls;
 }
 
-async function fetchHtml(url: string): Promise<string> {
-  const response = await fetch(url, {
-    headers: {
-      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+async function getDuckDuckGoResults(query: string): Promise<string[]> {
+  const searchUrl = `https://api.duckduckgo.com/?q=${encodeURIComponent(
+    query
+  )}&format=json&no_html=1&skip_disambig=1`;
+
+  // DuckDuckGo API is more lenient but still benefits from browser-like headers
+  const response = await enhancedFetch(searchUrl);
+
+  if (!response.ok) {
+    throw new Error(`DuckDuckGo API error: ${response.status}`);
+  }
+
+  const data = await response.json();
+
+  const urls: string[] = [];
+
+  if (data.AbstractURL) {
+    urls.push(data.AbstractURL);
+  }
+
+  if (data.RelatedTopics) {
+    for (const topic of data.RelatedTopics.slice(0, 2)) {
+      if (topic.FirstURL) {
+        urls.push(topic.FirstURL);
+      }
     }
+  }
+
+  // If no direct URLs, try definition URL
+  if (urls.length === 0 && data.DefinitionURL) {
+    urls.push(data.DefinitionURL);
+  }
+
+  if (urls.length === 0) {
+    throw new Error("No search results found in DuckDuckGo response");
+  }
+
+  return urls;
+}
+
+/**
+ * Enhanced fetch function that mimics real browser behavior to avoid scraping detection
+ *
+ * Anti-detection techniques used:
+ * 1. Complete browser fingerprint with all expected headers
+ * 2. Client hints that modern browsers send automatically
+ * 3. Proper Sec-Fetch metadata for legitimate navigation
+ * 4. Cache control headers to prevent suspicious caching patterns
+ */
+async function enhancedFetch(
+  url: string,
+  options: RequestInit = {}
+): Promise<Response> {
+  const headers = {
+    // Randomized but realistic User-Agent from our pool
+    "User-Agent": getRandomUserAgent(),
+
+    // Standard browser Accept header - tells server what content types we can handle
+    Accept:
+      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+
+    // Language preferences - indicates we prefer English
+    "Accept-Language": "en-US,en;q=0.9",
+
+    // Compression support - modern browsers support these
+    "Accept-Encoding": "gzip, deflate, br",
+
+    // CLIENT HINTS - Modern browsers send these automatically
+    // Tells server we're Chrome 121 (matches our User-Agent)
+    "sec-ch-ua":
+      '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
+
+    // Indicates we're on desktop (not mobile)
+    "sec-ch-ua-mobile": "?0",
+
+    // Platform information (macOS in this case)
+    "sec-ch-ua-platform": '"macOS"',
+
+    // SEC-FETCH METADATA - Critical for avoiding detection
+    // Tells server this is a document request (not an API call)
+    "Sec-Fetch-Dest": "document",
+
+    // Indicates this is a navigation (user clicking a link)
+    "Sec-Fetch-Mode": "navigate",
+
+    // Cross-site navigation (coming from different domain)
+    "Sec-Fetch-Site": "cross-site",
+
+    // User-initiated request (not automatic/script)
+    "Sec-Fetch-User": "?1",
+
+    // Indicates we want HTTPS when possible
+    "Upgrade-Insecure-Requests": "1",
+
+    // CACHE CONTROL - Prevents suspicious caching patterns
+    // Don't use cached responses
+    "Cache-Control": "no-cache",
+
+    // Legacy cache control for older servers
+    Pragma: "no-cache",
+
+    // Allow caller to override any headers if needed
+    ...options.headers,
+  };
+
+  return fetch(url, {
+    ...options,
+    headers,
   });
+}
+
+async function fetchHtml(url: string): Promise<string> {
+  const response = await enhancedFetch(url);
   return response.text();
 }
 
+/**
+ * Returns a random User-Agent string from a pool of current, realistic browser strings
+ *
+ * Why this helps avoid detection:
+ * 1. Rotating User-Agents prevents fingerprinting based on consistent UA
+ * 2. All UAs are current versions (as of late 2024) - old versions are suspicious
+ * 3. Mix of browsers/platforms makes traffic look more natural
+ * 4. These exact strings are used by millions of real users
+ *
+ * Maintenance note: Update these every few months to stay current
+ */
+function getRandomUserAgent(): string {
+  const userAgents = [
+    // Latest Chrome on macOS (most common desktop browser)
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+
+    // Latest Chrome on Windows (largest user base)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+
+    // Latest Firefox on macOS (privacy-conscious users)
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0",
+
+    // Latest Firefox on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+
+    // Latest Safari on macOS (default Mac browser)
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
+
+    // Latest Edge on Windows (default Windows browser)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
+  ];
+
+  return userAgents[Math.floor(Math.random() * userAgents.length)];
+}
+
 function extractDataFromHtml(html: string): ScrapedData {
   const cheerioDoc = Cheerio.load(html);
   return {
-    title: cheerioDoc('title').text(),
-    content: cheerioDoc('body').text(),
-    url: cheerioDoc('link[rel="canonical"]').attr('href') || ''
+    title: cheerioDoc("title").text(),
+    content: cheerioDoc("body").text(),
+    url: cheerioDoc('link[rel="canonical"]').attr("href") || "",
   };
-}
+}
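
The heart of this file's change is the ordered fallback chain in `getSearchResults`: each engine is attempted at most once, the first success wins, and a failure falls through silently to the next engine. Below is a minimal TypeScript sketch of that pattern; the stub provider and the `firstSuccessfulSearch` name are illustrative, not exports of the package.

```ts
type SearchEngine = {
  name: string;
  fn: (query: string) => Promise<string[]>;
};

async function firstSuccessfulSearch(
  engines: SearchEngine[],
  query: string
): Promise<string[]> {
  for (const engine of engines) {
    try {
      // Each engine is attempted exactly once - no retries, so a
      // blocked or down engine never gets hammered.
      const urls = await engine.fn(query);
      console.log(`[${engine.name}]::✅`);
      return urls;
    } catch {
      // Failure is expected (blocking, rate limits); fall through.
      console.log(`[${engine.name}]::❌`);
    }
  }
  // Callers treat this as "no URLs to scrape" and switch strategies.
  throw new Error("No search results available");
}

// Hypothetical usage with a stub engine:
const stubEngine: SearchEngine = {
  name: "Stub",
  fn: async () => ["https://example.com"],
};
firstSuccessfulSearch([stubEngine], "bun runtime").then(console.log);
```

Returning on the first success keeps the chain to a single upstream request per user interaction, which matches the "respectful behavior" notes in the file header comment.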
package/src/summarizer.ts CHANGED
@@ -1,4 +1,4 @@
-import { openaiClient } from './openai';
+import { openaiClient } from "./openai";
 
 export type SummaryResult = Readonly<{
   textual: string;
@@ -6,7 +6,7 @@ export type SummaryResult = Readonly<{
     name: string;
     url: string;
   }>;
-}>
+}>;
 
 /**
  * Summarizes content and extracts relevant links using OpenAI
@@ -14,7 +14,7 @@ export type SummaryResult = Readonly<{
  * @param maxLength - Maximum length of the summary in words
  * @returns Promise containing the summary text and extracted links
  * @throws Will throw an error if OpenAI API call fails
- *
+ *
 * @example
 * ```ts
 * const result = await summarizeContent(longText, 100)
@@ -23,9 +23,12 @@ export type SummaryResult = Readonly<{
 * ```
 */
 
-export async function summarizeWebPage(content: string, openAIApiKey: string): Promise<SummaryResult> {
+export async function summarizeWebPage(
+  content: string,
+  openAIApiKey: string
+): Promise<SummaryResult> {
   const openai = openaiClient(openAIApiKey);
-
+
   const prompt = `Your are an expert educator. Analyze the following text and create a
   concise summary with the following guidelines:
   1. Always use bullet points, lists and tables over paragraphs.
@@ -49,32 +52,93 @@ export async function summarizeWebPage(content: string, openAIApiKey: string): P
 
   const schema = {
     textual: {
-      type: 'string',
-      description: 'Concise summary of the text'
+      type: "string",
+      description: "Concise summary of the text",
     },
     links: {
-      type: 'array',
+      type: "array",
      items: {
-        type: 'object',
+        type: "object",
         properties: {
           name: {
-            type: 'string',
-            description: 'Descriptive name or title of the link'
+            type: "string",
+            description: "Descriptive name or title of the link",
           },
           url: {
-            type: 'string',
-            description: 'The URL of the link'
-          }
+            type: "string",
+            description: "The URL of the link",
+          },
         },
-        required: ['name', 'url']
-      }
-    }
+        required: ["name", "url"],
+      },
+    },
   };
 
   const result = await openai.completeStructured<SummaryResult>(prompt, {
     temperature: 0.3,
-    responseSchema: schema
+    responseSchema: schema,
+  });
+
+  return result;
+}
+
+/**
+ * Uses OpenAI to directly answer a query when no scraped content is available
+ * @param query - The user's question or search query
+ * @param openAIApiKey - OpenAI API key
+ * @returns Promise containing the AI-generated response and any relevant links
+ */
+export async function summarizeQuery(
+  query: string,
+  openAIApiKey: string
+): Promise<SummaryResult> {
+  const openai = openaiClient(openAIApiKey);
+
+  const prompt = `You are an expert educator and researcher. Answer the following query with accurate, helpful information:
+
+  "${query}"
+
+  Guidelines:
+  1. Provide a comprehensive but concise answer
+  2. Use bullet points, lists, and tables when appropriate
+  3. Include relevant examples or step-by-step instructions if applicable
+  4. Format your response in valid markdown
+  5. Be factual and cite general knowledge sources when relevant
+  6. If you suggest external resources, format them as links in the response
+  7. Mark proper nouns as bold e.g. **OpenAI**
+  8. Use appropriate headings (##, ###) to structure your response
+  9. If the query is about current events beyond your knowledge cutoff, mention that limitation
+
+  Provide a thorough, educational response that directly addresses the user's query.`;
+
+  const schema = {
+    textual: {
+      type: "string",
+      description: "Comprehensive answer to the user query",
+    },
+    links: {
+      type: "array",
+      items: {
+        type: "object",
+        properties: {
+          name: {
+            type: "string",
+            description: "Descriptive name of the recommended resource",
+          },
+          url: {
+            type: "string",
+            description: "URL to the recommended resource",
+          },
+        },
+        required: ["name", "url"],
+      },
+    },
+  };
+
+  const result = await openai.completeStructured<SummaryResult>(prompt, {
+    temperature: 0.7,
+    responseSchema: schema,
   });
 
   return result;
-}
+}
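
Taken together with the scraper change, `summarizeQuery` completes the new fallback path: `scrape` now resolves to an empty array instead of throwing, so a caller can detect that and answer from the model directly. The glue code below is a hypothetical sketch of how the two exports might be combined; the actual CLI entry point that wires them up is not part of this diff.

```ts
// Hypothetical caller - this diff only shows scraper.ts and summarizer.ts.
import { scrape } from "./scraper";
import { summarizeQuery, summarizeWebPage } from "./summarizer";

async function answer(input: string, openAIApiKey: string) {
  const pages = await scrape(input);

  if (pages.length === 0) {
    // Every search engine failed or nothing could be fetched:
    // fall back to answering the query from the model's own knowledge.
    return summarizeQuery(input, openAIApiKey);
  }

  // Otherwise summarize the scraped page text as before.
  const content = pages.map((page) => page.content).join("\n\n");
  return summarizeWebPage(content, openAIApiKey);
}
```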
package/tsconfig.json CHANGED
@@ -1,15 +1,15 @@
 {
-	"compilerOptions": {
-		"target": "esnext",
-		"module": "esnext",
-		"moduleResolution": "bundler",
-		"types": ["bun-types"],
-		"outDir": "./dist",
-		"rootDir": "./src",
-		"strict": true,
-		"skipLibCheck": true,
-		"forceConsistentCasingInFileNames": true,
-		"resolveJsonModule": true
-	},
-	"include": ["src/**/*"]
-}
+  "compilerOptions": {
+    "target": "esnext",
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "types": ["bun-types"],
+    "outDir": "./dist",
+    "rootDir": "./src",
+    "strict": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "resolveJsonModule": true
+  },
+  "include": ["src/**/*"]
+}