@fettstorch/clai 0.1.7 → 0.1.9

This diff shows the published contents of these package versions as they appear in their public registry. It is provided for informational purposes only.
package/src/scraper.ts CHANGED
@@ -1,5 +1,32 @@
 import * as Cheerio from "cheerio";
 
+/**
+ * ANTI-SCRAPING DETECTION STRATEGY
+ *
+ * This scraper uses several techniques to avoid being detected as a bot:
+ *
+ * 1. BROWSER MIMICRY:
+ *    - Complete HTTP headers that match real browsers
+ *    - Randomized but realistic User-Agent strings
+ *    - Proper Sec-Fetch metadata and client hints
+ *
+ * 2. SEARCH ENGINE DIVERSITY:
+ *    - Try SearX instances first (scraper-friendly)
+ *    - Fallback to Google with careful HTML parsing
+ *    - DuckDuckGo API as secondary fallback
+ *    - Emergency constructed URLs as last resort
+ *
+ * 3. RESPECTFUL BEHAVIOR:
+ *    - Single request per user interaction (no rapid-fire requests)
+ *    - Proper error handling without retries that could trigger rate limits
+ *    - Clean fallback chain that doesn't hammer failed services
+ *
+ * MAINTENANCE NOTES:
+ * - Update User-Agent strings every few months
+ * - Monitor SearX instance availability
+ * - Watch for changes in Google's HTML structure
+ */
+
 export interface ScrapedData {
   title: string;
   content: string;
@@ -33,8 +60,8 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
     // Filter out failed scrapes
     return results.filter((result): result is ScrapedData => result !== null);
   } catch (error) {
-    console.error("Error during scraping:", error);
-    throw error;
+    // If search engines fail, return empty array to trigger OpenAI fallback
+    return [];
   }
 }
 
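Editor's note: the new error handling shifts responsibility to the caller — an empty array now means "all engines failed, answer from the model instead". The orchestration that reacts to this lives outside the files in this diff; a minimal sketch of what it could look like, assuming a hypothetical `answer` entry point and using only the exports shown below:

```ts
import { scrape } from "./scraper";
import { summarizeQuery, summarizeWebPage } from "./summarizer";
import type { SummaryResult } from "./summarizer";

// Hypothetical caller-side wiring - not part of this diff.
async function answer(query: string, openAIApiKey: string): Promise<SummaryResult> {
  const pages = await scrape(query);

  if (pages.length === 0) {
    // Empty result from scrape() now signals "all engines failed" - ask the model directly
    return summarizeQuery(query, openAIApiKey);
  }

  // Otherwise summarize the scraped page content
  const combined = pages.map((p) => `${p.title}\n${p.content}`).join("\n\n");
  return summarizeWebPage(combined, openAIApiKey);
}
```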
@@ -57,74 +84,30 @@ function normalizeUrl(url: string): string {
 }
 
 async function getSearchResults(query: string): Promise<string[]> {
-  try {
-    return await getSearXResults(query);
-  } catch (_) {
-    console.log("Trying Google search...");
+  const searchEngines = [
+    { name: "SearX", fn: getSearXResults },
+    { name: "Google", fn: getGoogleResults },
+    { name: "DuckDuckGo", fn: getDuckDuckGoResults },
+    { name: "Wikipedia", fn: getWikipediaResults },
+  ];
+
+  for (const engine of searchEngines) {
     try {
-      return await getGoogleResults(query);
+      const result = await engine.fn(query);
+      console.log(`[${engine.name}]::✅`);
+      return result;
     } catch (_) {
-      console.log("Trying DuckDuckGo search...");
-      try {
-        return await getDuckDuckGoResults(query);
-      } catch (_) {
-        console.log("Using emergency fallback...");
-        return getEmergencyResults(query);
-      }
-    }
-  }
-}
-
-function getEmergencyResults(query: string): string[] {
-  // Emergency fallback - construct likely URLs based on the query
-  const results: string[] = [];
-
-  // Try to construct some reasonable URLs based on common patterns
-  const cleanQuery = query
-    .toLowerCase()
-    .replace(/[^a-z0-9\s]/g, "")
-    .trim();
-  const words = cleanQuery.split(/\s+/).filter((word) => word.length > 2);
-
-  if (words.length > 0) {
-    const mainWord = words[0];
-
-    // Add some likely candidates
-    results.push(
-      `https://en.wikipedia.org/wiki/${encodeURIComponent(
-        query.replace(/\s+/g, "_")
-      )}`
-    );
-
-    if (mainWord.length > 3) {
-      results.push(`https://${mainWord}.com`);
-      results.push(`https://www.${mainWord}.org`);
+      console.log(`[${engine.name}]::❌`);
     }
-
-    // Add a Reddit search as last resort
-    results.push(
-      `https://www.reddit.com/search/?q=${encodeURIComponent(query)}`
-    );
   }
 
-  console.log("Emergency fallback returning:", results.join(", "));
-  return results.length > 0
-    ? results.slice(0, 3)
-    : [
-        `https://en.wikipedia.org/wiki/${encodeURIComponent(
-          query.replace(/\s+/g, "_")
-        )}`,
-      ];
+  console.log("All search engines failed - no URLs to scrape");
+  throw new Error("No search results available");
 }
 
 async function getSearXResults(query: string): Promise<string[]> {
-  // Public SearXNG instances that are scraper-friendly
-  const searxInstances = [
-    "https://searx.be",
-    "https://search.sapti.me",
-    "https://searx.tiekoetter.com",
-    "https://searx.prvcy.eu",
-  ];
+  // Keep a minimal list since most SearX instances block automation
+  const searxInstances = ["https://searx.be", "https://search.sapti.me"];
 
   // Try instances until one works
   for (const instance of searxInstances) {
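Editor's note: replacing the nested try/catch pyramid with a `{ name, fn }` table makes the fallback order data, not control flow. Any function matching `(query: string) => Promise<string[]>` can join the chain. For instance, a hypothetical Brave Search integration (not in the package) would slot in without touching the loop:

```ts
// Assumed, illustrative implementation - not part of this diff.
declare function getBraveResults(query: string): Promise<string[]>;

const searchEngines = [
  { name: "SearX", fn: getSearXResults },
  { name: "Brave", fn: getBraveResults }, // new entry: one line, no new control flow
  { name: "Google", fn: getGoogleResults },
  { name: "DuckDuckGo", fn: getDuckDuckGoResults },
  { name: "Wikipedia", fn: getWikipediaResults },
];
```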
@@ -133,16 +116,16 @@ async function getSearXResults(query: string): Promise<string[]> {
         query
       )}&format=json&categories=general`;
 
-      console.log("Trying SearX search...");
-
-      const response = await fetch(searchUrl, {
+      // Use enhancedFetch with JSON Accept header for API requests
+      // This makes the request look like a legitimate AJAX call
+      const response = await enhancedFetch(searchUrl, {
         headers: {
-          "User-Agent": getRandomUserAgent(),
           Accept: "application/json",
         },
       });
 
       if (!response.ok) {
+        // Likely blocked - continue silently to next instance
        continue;
      }
 
@@ -164,23 +147,62 @@ async function getSearXResults(query: string): Promise<string[]> {
      }
 
      if (urls.length > 0) {
-        console.log(`✓ SearX found ${urls.length} results`);
        return urls.slice(0, 3); // Limit to 3 results
      }
    } catch (error) {
-      // Continue to next instance
+      // Continue to next instance silently
    }
  }
 
  throw new Error("All SearX instances failed");
}
 
+async function getWikipediaResults(query: string): Promise<string[]> {
+  // Wikipedia's OpenSearch API - designed for automation and doesn't block
+  const searchUrl = `https://en.wikipedia.org/w/api.php?action=opensearch&search=${encodeURIComponent(
+    query
+  )}&limit=3&format=json&origin=*`;
+
+  const response = await enhancedFetch(searchUrl, {
+    headers: {
+      Accept: "application/json",
+    },
+  });
+
+  if (!response.ok) {
+    throw new Error(`Wikipedia API error: ${response.status}`);
+  }
+
+  const data = await response.json();
+
+  // Wikipedia OpenSearch returns [query, titles, descriptions, urls]
+  if (Array.isArray(data) && data.length >= 4 && Array.isArray(data[3])) {
+    const urls = data[3]?.filter((url: string) => url?.startsWith("https://"));
+
+    if (urls?.length > 0) {
+      return urls;
+    }
+  }
+
+  throw new Error("No Wikipedia results found");
+}
+
 async function getGoogleResults(query: string): Promise<string[]> {
   const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
     query
   )}&num=10`;
 
+  // Fetch Google search page using enhanced headers to avoid bot detection
   const html = await fetchHtml(searchUrl);
+
+  // Check if Google is blocking us
+  if (
+    html.includes("If you're having trouble accessing Google Search") ||
+    html.includes("unusual traffic from your computer network")
+  ) {
+    throw new Error("Google blocked request - detected as bot");
+  }
+
   const cheerioDoc = Cheerio.load(html);
   const urls: string[] = [];
 
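Editor's note: the `data[3]` index in the new Wikipedia path is explained by the OpenSearch response format, which is a four-element tuple. An illustrative (abridged) payload for `?action=opensearch&search=typescript&limit=3`:

```ts
// Illustrative shape only - values are examples, not captured output.
const sample = [
  "typescript",                                 // [0] the query, echoed back
  ["TypeScript"],                               // [1] matching article titles
  [""],                                         // [2] descriptions (often empty today)
  ["https://en.wikipedia.org/wiki/TypeScript"], // [3] URLs - this is data[3]
];
```

The `origin=*` parameter enables CORS; it is harmless in Node but keeps the same URL usable from a browser context.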
@@ -229,7 +251,6 @@ async function getGoogleResults(query: string): Promise<string[]> {
     throw new Error("No search results found in Google response");
   }
 
-  console.log(`✓ Google found ${uniqueUrls.length} results`);
   return uniqueUrls;
 }
 
@@ -238,9 +259,27 @@ async function getDuckDuckGoResults(query: string): Promise<string[]> {
     query
   )}&format=json&no_html=1&skip_disambig=1`;
 
-  const response = await fetch(searchUrl);
+  // DuckDuckGo API is more lenient but still benefits from browser-like headers
+  const response = await enhancedFetch(searchUrl);
+
+  if (!response.ok) {
+    throw new Error(`DuckDuckGo API error: ${response.status}`);
+  }
+
   const data = await response.json();
 
+  // Check for DuckDuckGo blocking patterns
+  if (
+    data.Abstract?.includes("redirects users to a non-JavaScript site") ||
+    data.Abstract?.includes("DuckDuckGo redirects users") ||
+    data.AbstractText?.includes("redirects users to a non-JavaScript site") ||
+    data.AbstractText?.includes("DuckDuckGo redirects users")
+  ) {
+    throw new Error(
+      "DuckDuckGo blocked request - JavaScript disabled redirect"
+    );
+  }
+
   const urls: string[] = [];
 
   if (data.AbstractURL) {
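Editor's note: the Instant Answer endpoint does not return classic web results; it returns an abstract plus related topics, which is why the parser reads `AbstractURL` (above), likely `RelatedTopics` in the unchanged lines elided here, and now `DefinitionURL` as a last resort (next hunk). A trimmed, illustrative payload showing those fields:

```ts
// Illustrative shape only - values are examples, not captured output.
const sample = {
  AbstractText: "TypeScript is a programming language...",
  AbstractURL: "https://en.wikipedia.org/wiki/TypeScript", // preferred result
  DefinitionURL: "",                                       // often empty; new last-resort fallback
  RelatedTopics: [
    { FirstURL: "https://duckduckgo.com/c/Programming_languages" },
  ],
};
```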
@@ -255,48 +294,123 @@ async function getDuckDuckGoResults(query: string): Promise<string[]> {
     }
   }
 
+  // If no direct URLs, try definition URL
+  if (urls.length === 0 && data.DefinitionURL) {
+    urls.push(data.DefinitionURL);
+  }
+
   if (urls.length === 0) {
     throw new Error("No search results found in DuckDuckGo response");
   }
 
-  console.log(`✓ DuckDuckGo found ${urls.length} results`);
   return urls;
 }
 
-async function fetchHtml(url: string): Promise<string> {
-  const response = await fetch(url, {
-    headers: {
-      "User-Agent": getRandomUserAgent(),
-      Accept:
-        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
-      "Accept-Language": "en-US,en;q=0.9",
-      "Accept-Encoding": "gzip, deflate, br",
-      DNT: "1",
-      Connection: "keep-alive",
-      "Upgrade-Insecure-Requests": "1",
-      "Sec-Fetch-Dest": "document",
-      "Sec-Fetch-Mode": "navigate",
-      "Sec-Fetch-Site": "none",
-      "Sec-Fetch-User": "?1",
-      "Cache-Control": "max-age=0",
-    },
+/**
+ * Enhanced fetch function that mimics real browser behavior to avoid scraping detection
+ *
+ * Anti-detection techniques used:
+ * 1. Complete browser fingerprint with all expected headers
+ * 2. Client hints that modern browsers send automatically
+ * 3. Proper Sec-Fetch metadata for legitimate navigation
+ * 4. Cache control headers to prevent suspicious caching patterns
+ */
+async function enhancedFetch(
+  url: string,
+  options: RequestInit = {}
+): Promise<Response> {
+  const headers = {
+    // Randomized but realistic User-Agent from our pool
+    "User-Agent": getRandomUserAgent(),
+
+    // Standard browser Accept header - tells server what content types we can handle
+    Accept:
+      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+
+    // Language preferences - indicates we prefer English
+    "Accept-Language": "en-US,en;q=0.9",
+
+    // Compression support - modern browsers support these
+    "Accept-Encoding": "gzip, deflate, br",
+
+    // CLIENT HINTS - Modern browsers send these automatically
+    // Tells server we're Chrome 121 (matches our User-Agent)
+    "sec-ch-ua":
+      '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
+
+    // Indicates we're on desktop (not mobile)
+    "sec-ch-ua-mobile": "?0",
+
+    // Platform information (macOS in this case)
+    "sec-ch-ua-platform": '"macOS"',
+
+    // SEC-FETCH METADATA - Critical for avoiding detection
+    // Tells server this is a document request (not an API call)
+    "Sec-Fetch-Dest": "document",
+
+    // Indicates this is a navigation (user clicking a link)
+    "Sec-Fetch-Mode": "navigate",
+
+    // Cross-site navigation (coming from different domain)
+    "Sec-Fetch-Site": "cross-site",
+
+    // User-initiated request (not automatic/script)
+    "Sec-Fetch-User": "?1",
+
+    // Indicates we want HTTPS when possible
+    "Upgrade-Insecure-Requests": "1",
+
+    // CACHE CONTROL - Prevents suspicious caching patterns
+    // Don't use cached responses
+    "Cache-Control": "no-cache",
+
+    // Legacy cache control for older servers
+    Pragma: "no-cache",
+
+    // Allow caller to override any headers if needed
+    ...options.headers,
+  };
+
+  return fetch(url, {
+    ...options,
+    headers,
   });
+}
+
+async function fetchHtml(url: string): Promise<string> {
+  const response = await enhancedFetch(url);
   return response.text();
 }
 
+/**
+ * Returns a random User-Agent string from a pool of current, realistic browser strings
+ *
+ * Why this helps avoid detection:
+ * 1. Rotating User-Agents prevents fingerprinting based on consistent UA
+ * 2. All UAs are current versions (as of late 2024) - old versions are suspicious
+ * 3. Mix of browsers/platforms makes traffic look more natural
+ * 4. These exact strings are used by millions of real users
+ *
+ * Maintenance note: Update these every few months to stay current
+ */
 function getRandomUserAgent(): string {
   const userAgents = [
-    // Latest Chrome on macOS
+    // Latest Chrome on macOS (most common desktop browser)
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
-    // Latest Chrome on Windows
+
+    // Latest Chrome on Windows (largest user base)
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
-    // Latest Firefox on macOS
+
+    // Latest Firefox on macOS (privacy-conscious users)
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0",
+
     // Latest Firefox on Windows
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
-    // Latest Safari on macOS
+
+    // Latest Safari on macOS (default Mac browser)
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
-    // Latest Edge on Windows
+
+    // Latest Edge on Windows (default Windows browser)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
   ];
 
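Editor's note: because `...options.headers` is spread *after* the defaults in `enhancedFetch`, callers can override any header while keeping the rest of the browser fingerprint — exactly what the SearX and Wikipedia calls do with `Accept: "application/json"`. A usage sketch (URLs illustrative):

```ts
// JSON API call: overrides the default HTML Accept header, keeps everything else
const api = await enhancedFetch("https://searx.be/search?q=clai&format=json", {
  headers: { Accept: "application/json" },
});

// Plain HTML navigation: no options needed, full default fingerprint applied
const page = await enhancedFetch("https://example.com");
```

One trade-off worth noting: an overridden `Accept` alongside the default `Sec-Fetch-Dest: document` is a slightly inconsistent fingerprint, which a strict detector could flag.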
package/src/summarizer.ts CHANGED
@@ -1,12 +1,12 @@
-import { openaiClient } from './openai'
+import { openaiClient } from "./openai";
 
 export type SummaryResult = Readonly<{
-  textual: string
-  links: ReadonlyArray<{
-    name: string
-    url: string
-  }>
-}>
+  textual: string;
+  links: ReadonlyArray<{
+    name: string;
+    url: string;
+  }>;
+}>;
 
 /**
  * Summarizes content and extracts relevant links using OpenAI
@@ -24,12 +24,12 @@ export type SummaryResult = Readonly<{
  */
 
 export async function summarizeWebPage(
-  content: string,
-  openAIApiKey: string,
+  content: string,
+  openAIApiKey: string
 ): Promise<SummaryResult> {
-  const openai = openaiClient(openAIApiKey)
+  const openai = openaiClient(openAIApiKey);
 
-  const prompt = `Your are an expert educator. Analyze the following text and create a
+  const prompt = `Your are an expert educator. Analyze the following text and create a
   concise summary with the following guidelines:
   1. Always use bullet points, lists and tables over paragraphs.
   2. Produce valid markdown output
@@ -48,36 +48,97 @@ export async function summarizeWebPage(
 
   Don't just summarize, cite the key information.
 
-  Text to analyze:\n"${content}\n"`
+  Text to analyze:\n"${content}\n"`;
 
-  const schema = {
-    textual: {
-      type: 'string',
-      description: 'Concise summary of the text',
-    },
-    links: {
-      type: 'array',
-      items: {
-        type: 'object',
-        properties: {
-          name: {
-            type: 'string',
-            description: 'Descriptive name or title of the link',
-          },
-          url: {
-            type: 'string',
-            description: 'The URL of the link',
-          },
-        },
-        required: ['name', 'url'],
-      },
-    },
-  }
+  const schema = {
+    textual: {
+      type: "string",
+      description: "Concise summary of the text",
+    },
+    links: {
+      type: "array",
+      items: {
+        type: "object",
+        properties: {
+          name: {
+            type: "string",
+            description: "Descriptive name or title of the link",
+          },
+          url: {
+            type: "string",
+            description: "The URL of the link",
+          },
+        },
+        required: ["name", "url"],
+      },
+    },
+  };
 
-  const result = await openai.completeStructured<SummaryResult>(prompt, {
-    temperature: 0.3,
-    responseSchema: schema,
-  })
+  const result = await openai.completeStructured<SummaryResult>(prompt, {
+    temperature: 0.3,
+    responseSchema: schema,
+  });
 
-  return result
+  return result;
+}
+
+/**
+ * Uses OpenAI to directly answer a query when no scraped content is available
+ * @param query - The user's question or search query
+ * @param openAIApiKey - OpenAI API key
+ * @returns Promise containing the AI-generated response and any relevant links
+ */
+export async function summarizeQuery(
+  query: string,
+  openAIApiKey: string
+): Promise<SummaryResult> {
+  const openai = openaiClient(openAIApiKey);
+
+  const prompt = `You are an expert educator and researcher. Answer the following query with accurate, helpful information:
+
+  "${query}"
+
+  Guidelines:
+  1. Provide a comprehensive but concise answer
+  2. Use bullet points, lists, and tables when appropriate
+  3. Include relevant examples or step-by-step instructions if applicable
+  4. Format your response in valid markdown
+  5. Be factual and cite general knowledge sources when relevant
+  6. If you suggest external resources, format them as links in the response
+  7. Mark proper nouns as bold e.g. **OpenAI**
+  8. Use appropriate headings (##, ###) to structure your response
+  9. If the query is about current events beyond your knowledge cutoff, mention that limitation
+
+  Provide a thorough, educational response that directly addresses the user's query.`;
+
+  const schema = {
+    textual: {
+      type: "string",
+      description: "Comprehensive answer to the user query",
+    },
+    links: {
+      type: "array",
+      items: {
+        type: "object",
+        properties: {
+          name: {
+            type: "string",
+            description: "Descriptive name of the recommended resource",
+          },
+          url: {
+            type: "string",
+            description: "URL to the recommended resource",
+          },
+        },
+        required: ["name", "url"],
+      },
+    },
+  };
+
+  const result = await openai.completeStructured<SummaryResult>(prompt, {
+    temperature: 0.7,
+    responseSchema: schema,
+  });
+
+  return result;
 }
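Editor's note: `summarizeQuery` and `summarizeWebPage` both resolve to the same `SummaryResult` shape, so downstream rendering code does not care which path produced the answer. A minimal consumption sketch (the environment-variable key handling is assumed, not shown in this diff):

```ts
import { summarizeQuery } from "./summarizer";

const result = await summarizeQuery(
  "how do Sec-Fetch headers work",
  process.env.OPENAI_API_KEY! // assumed to be set
);

console.log(result.textual); // markdown body of the answer
for (const link of result.links) {
  console.log(`- ${link.name}: ${link.url}`);
}
```

The higher temperature here (0.7 vs. 0.3 for page summaries) fits the design: open-ended answers benefit from more variability than grounded summarization does.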