enterprise-ai-recursive-web-scraper 1.0.0

@@ -0,0 +1,1303 @@
1
+ import { Browser } from 'puppeteer';
2
+ import { GoogleGenerativeAI, HarmCategory, HarmBlockThreshold } from '@google/generative-ai';
3
+
4
+ /**
5
+ * @fileoverview Enhanced web scraping and content filtering functions for detecting and filtering inappropriate content
6
+ * @file scraper.ts
7
+ * @module scraper
8
+ * @description This module provides functionality for web scraping with content filtering capabilities.
9
+ * It includes classes for managing browser operations, text processing, and content filtering using
10
+ * Trie data structures. The module is designed to detect and filter NSFW content, slurs, and other
11
+ * inappropriate content from web pages.
12
+ *
13
+ * Key features:
14
+ * - Web scraping using Puppeteer with stealth and ad-blocking capabilities
15
+ * - Content filtering using Trie data structures for efficient pattern matching
16
+ * - Text processing with duplicate detection and removal
17
+ * - NSFW domain detection and filtering
18
+ * - Configurable content replacement
19
+ *
20
+ * Classes:
21
+ * - ContentTrie: Trie data structure for efficient string matching
22
+ * - ContentFilterManager: Singleton manager for content filtering operations
23
+ * - TextProcessor: Text cleaning and duplicate detection utility
24
+ * - BrowserManager: Browser and page management for scraping
25
+ *
26
+ * @example
27
+ * ```typescript
28
+ * // Initialize the filtering system
29
+ * initializeFilterWords();
30
+ *
31
+ * // Scrape and filter content from a URL
32
+ * const result = await scrape('https://example.com');
33
+ * if ('error' in result) {
34
+ * console.error(result.error);
35
+ * } else {
36
+ * console.log(result.filteredTexts);
37
+ * }
38
+ *
39
+ * // Filter individual text
40
+ * const filtered = filterText('text to filter');
41
+ * ```
42
+ *
43
+ * @requires puppeteer-extra - Enhanced version of Puppeteer with plugin support
44
+ * @requires puppeteer-extra-plugin-adblocker - Plugin for blocking ads and trackers
45
+ * @requires puppeteer-extra-plugin-stealth - Plugin for avoiding bot detection
46
+ *
47
+ * @license MIT
48
+ * @author Original author and contributors
49
+ * @version 1.0.0
50
+ * @since 1.0.0
51
+ *
52
+ * @see {@link https://github.com/berstend/puppeteer-extra|puppeteer-extra} - Enhanced version of Puppeteer
53
+ * @see {@link https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth|puppeteer-extra-plugin-stealth} - Stealth plugin for avoiding detection
54
+ * @see {@link https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-adblocker|puppeteer-extra-plugin-adblocker} - Ad blocking plugin
55
+ *
56
+ * @todo Add support for custom filtering rules
57
+ * @todo Improve error handling and recovery
58
+ * @todo Add rate limiting and request throttling
59
+ * @todo Implement caching for frequently accessed content
60
+ * @todo Add support for proxy rotation
61
+ * @todo Improve duplicate detection algorithms
62
+ * @todo Add support for custom content processors
63
+ * @todo Implement better logging and monitoring
64
+ * @todo Add support for distributed scraping
65
+ * @todo Improve memory management for large-scale scraping
66
+ *
67
+ * @throws {Error} When filter initialization fails
68
+ * @throws {Error} When browser operations fail
69
+ * @throws {Error} When content processing fails
70
+ * @throws {Error} When network operations fail
71
+ *
72
+ * @property {ContentFilterManager} filterManager - Singleton instance for content filtering
73
+ * @property {BrowserManager} browserManager - Static class for browser operations
74
+ * @property {TextProcessor} textProcessor - Static class for text processing
75
+ *
76
+ * @borrows ContentFilterManager.filterText as filterText
77
+ * @borrows ContentFilterManager.getInstance as getFilterManager
78
+ * @borrows BrowserManager.launch as launchBrowser
79
+ * @borrows TextProcessor.processText as processText
80
+ *
81
+ * @exports scrape - Main scraping function
82
+ * @exports initializeFilterWords - Filter initialization function
83
+ * @exports filterText - Text filtering function
84
+ * @exports ContentFilterManager - Content filtering manager class
85
+ *
86
+ * @typedef {Object} ScrapingResult
87
+ * @property {boolean} [flaggedDomain] - Whether the domain is flagged as NSFW
88
+ * @property {boolean} [containsCensored] - Whether censored content was found
89
+ * @property {string[]} [filteredTexts] - Array of filtered text content
90
+ * @property {string} [error] - Error message if scraping failed
91
+ *
92
+ * @typedef {Object} CodeBlock
93
+ * @property {string} language - Programming language of the code block
94
+ * @property {string} code - The actual code content
95
+ * @property {boolean} lineNumbers - Whether line numbers should be displayed
96
+ *
97
+ * @typedef {Object} TrieNode
98
+ * @property {Object.<string, TrieNode>} children - Child nodes in the Trie
99
+ * @property {boolean} isEndOfWord - Whether this node represents end of word
100
+ *
101
+ * @typedef {Object} ContentExtractionResult
102
+ * @property {string[]} texts - Array of extracted text content
103
+ * @property {CodeBlock[]} codeBlocks - Array of extracted code blocks
104
+ *
105
+ * @typedef {Object} FilterOptions
106
+ * @property {string} [replacement="***"] - Replacement string for filtered content
107
+ * @property {boolean} [caseSensitive=false] - Whether filtering is case sensitive
108
+ * @property {number} [minLength=1] - Minimum length for content to be filtered
109
+ *
110
+ * @typedef {Object} BrowserOptions
111
+ * @property {boolean} [headless=true] - Whether to run browser in headless mode
112
+ * @property {string[]} [args] - Additional browser launch arguments
113
+ * @property {number} [timeout=30000] - Navigation timeout in milliseconds
114
+ *
115
+ * @typedef {Object} ProcessingOptions
116
+ * @property {number} [similarityThreshold=0.85] - Threshold for duplicate detection
117
+ * @property {number} [maxLength=50000] - Maximum content length for processing
118
+ * @property {boolean} [preserveFormatting=false] - Whether to preserve text formatting
119
+ */
120
+
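+ /**
+ * The header above describes Trie-based filtering; the sketch below is an
+ * illustrative (non-exported) outline of how such a trie can insert and match
+ * terms. The class and method names here are hypothetical and not part of the API.
+ *
+ * @example
+ * ```typescript
+ * interface TrieNodeSketch {
+ *   children: { [key: string]: TrieNodeSketch };
+ *   isEndOfWord: boolean;
+ * }
+ *
+ * class ContentTrieSketch {
+ *   private root: TrieNodeSketch = { children: {}, isEndOfWord: false };
+ *
+ *   // Insert a term character by character.
+ *   insert(word: string): void {
+ *     let node = this.root;
+ *     for (const ch of word.toLowerCase()) {
+ *       if (!node.children[ch]) {
+ *         node.children[ch] = { children: {}, isEndOfWord: false };
+ *       }
+ *       node = node.children[ch];
+ *     }
+ *     node.isEndOfWord = true;
+ *   }
+ *
+ *   // Exact-match lookup used when scanning words in scraped text.
+ *   contains(word: string): boolean {
+ *     let node = this.root;
+ *     for (const ch of word.toLowerCase()) {
+ *       if (!node.children[ch]) return false;
+ *       node = node.children[ch];
+ *     }
+ *     return node.isEndOfWord;
+ *   }
+ * }
+ * ```
+ */
+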
121
+ /**
122
+ * Singleton class managing content filtering operations.
123
+ * @class
124
+ * @description Provides centralized content filtering functionality using multiple
125
+ * filtering mechanisms including Tries and Sets. Implements the Singleton pattern
126
+ * to ensure consistent filtering across the application.
127
+ *
128
+ * Key features:
129
+ * - Singleton pattern ensures consistent filtering state
130
+ * - Multiple filtering mechanisms (Tries, Sets)
131
+ * - Configurable content replacement
132
+ * - Efficient text chunk processing
133
+ * - NSFW domain detection
134
+ *
135
+ * @example
136
+ * ```typescript
137
+ * const filterManager = ContentFilterManager.getInstance();
138
+ *
139
+ * // Check domain
140
+ * const isNSFW = filterManager.isNSFWDomain('example.com');
141
+ *
142
+ * // Filter text
143
+ * const filtered = filterManager.filterText('text to filter');
144
+ * ```
145
+ */
146
+ declare class ContentFilterManager {
147
+ /**
148
+ * Singleton instance
149
+ * @private
150
+ * @static
151
+ * @type {ContentFilterManager}
152
+ */
153
+ private static instance;
154
+ /**
155
+ * Trie for storing and matching filtered words
156
+ * @private
157
+ * @type {ContentTrie}
158
+ */
159
+ private filterTrie;
160
+ /**
161
+ * Set of NSFW domains
162
+ * @private
163
+ * @type {Set<string>}
164
+ */
165
+ private nsfwDomains;
166
+ /**
167
+ * Trie for storing and matching NSFW terms
168
+ * @private
169
+ * @type {ContentTrie}
170
+ */
171
+ private nsfwNamesTrie;
172
+ /**
173
+ * Set of filtered dictionary words
174
+ * @private
175
+ * @type {Set<string>}
176
+ */
177
+ private filterDict;
178
+ /**
179
+ * Maximum content length for processing
180
+ * @private
181
+ * @readonly
182
+ * @type {number}
183
+ */
184
+ private readonly MAX_CONTENT_LENGTH;
185
+ /**
186
+ * Private constructor to prevent direct instantiation.
187
+ * @private
188
+ * @description Initializes all filtering data structures and loads initial data.
189
+ * This constructor is private to enforce the singleton pattern.
190
+ *
191
+ * @throws {Error} If filter initialization fails
192
+ */
193
+ private constructor();
194
+ /**
195
+ * Gets or creates the singleton instance of ContentFilterManager.
196
+ * @returns {ContentFilterManager} The singleton instance
197
+ * @description Ensures only one instance of ContentFilterManager exists.
198
+ * Creates the instance if it doesn't exist, otherwise returns the existing instance.
199
+ *
200
+ * @example
201
+ * ```typescript
202
+ * const filterManager = ContentFilterManager.getInstance();
203
+ * ```
204
+ */
205
+ static getInstance(): ContentFilterManager;
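+
+ /**
+ * Illustrative sketch of the lazy singleton accessor described above (not the
+ * actual implementation, which also loads filter data in its constructor):
+ *
+ * @example
+ * ```typescript
+ * class FilterManagerSketch {
+ *   private static instance: FilterManagerSketch;
+ *   private constructor() {} // filter word lists are loaded here
+ *   static getInstance(): FilterManagerSketch {
+ *     if (!FilterManagerSketch.instance) {
+ *       FilterManagerSketch.instance = new FilterManagerSketch();
+ *     }
+ *     return FilterManagerSketch.instance;
+ *   }
+ * }
+ * ```
+ */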
206
+ /**
207
+ * Loads filter data from configuration files.
208
+ * @private
209
+ * @returns {Promise<void>}
210
+ * @description Asynchronously loads filtering data from configuration files,
211
+ * including NSFW domains, NSFW terms, and slurs. Initializes all filtering
212
+ * data structures with the loaded data.
213
+ *
214
+ * @throws {Error} If filter initialization fails or data files cannot be loaded
215
+ */
216
+ private loadFilters;
217
+ /**
218
+ * Checks if a URL contains or belongs to an NSFW domain.
219
+ * @param {string} url - The URL to check
220
+ * @returns {boolean} True if the URL matches any NSFW domain patterns
221
+ * @description Performs case-sensitive matching against known NSFW domains.
222
+ * Checks if the URL contains any known NSFW domain patterns.
223
+ *
224
+ * @example
225
+ * ```typescript
226
+ * const filterManager = ContentFilterManager.getInstance();
227
+ * const isNSFW = filterManager.isNSFWDomain('example.com');
228
+ * ```
229
+ *
230
+ * @throws {TypeError} If url is not a string
231
+ */
232
+ isNSFWDomain(url: string): boolean;
233
+ /**
234
+ * Splits text into manageable chunks while preserving context.
235
+ * @private
236
+ * @param {string} text - Text to split
237
+ * @returns {string[]} Array of text chunks
238
+ * @description Splits long text into smaller chunks while trying to maintain
239
+ * sentence boundaries and context. This ensures efficient processing of large
240
+ * text content.
241
+ *
242
+ * @throws {TypeError} If text is not a string
243
+ */
244
+ private splitIntoChunks;
245
+ /**
246
+ * Filters text content using content filtering rules.
247
+ * @param {string} text - Text to filter
248
+ * @param {string} [replacement="***"] - Replacement string for filtered content
249
+ * @returns {string} Filtered text with inappropriate content replaced
250
+ * @description Processes text content in chunks, applying filtering rules
251
+ * to detect and replace inappropriate content. Handles large text efficiently
252
+ * by breaking it into manageable chunks.
253
+ *
254
+ * @example
255
+ * ```typescript
256
+ * const filterManager = ContentFilterManager.getInstance();
257
+ * const filtered = filterManager.filterText('text to filter', '***');
258
+ * ```
259
+ *
260
+ * @throws {TypeError} If text is not a string
261
+ */
262
+ filterText(text: string, replacement?: string): string;
263
+ /**
264
+ * Applies the actual filtering logic to a single chunk.
265
+ * @private
266
+ * @param {string} chunk - Text chunk to filter
267
+ * @param {string} replacement - Replacement string for filtered content
268
+ * @returns {string} Filtered text chunk
269
+ * @description Applies filtering rules to a single chunk of text,
270
+ * replacing inappropriate content with the specified replacement string.
271
+ *
272
+ * @throws {TypeError} If chunk is not a string
273
+ */
274
+ private applyFilters;
275
+ }
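+
+ /**
+ * Illustrative sketch of the chunk-then-filter flow described for
+ * splitIntoChunks, filterText, and applyFilters above. A Set stands in for the
+ * internal Trie purely for brevity; the helper name is hypothetical.
+ *
+ * @example
+ * ```typescript
+ * function filterTextSketch(text: string, blocked: Set<string>, replacement = "***"): string {
+ *   // Split on sentence boundaries so each chunk keeps local context.
+ *   const chunks = text.split(/(?<=[.!?])\s+/);
+ *   return chunks
+ *     .map(chunk =>
+ *       chunk.replace(/\b\w+\b/g, word =>
+ *         blocked.has(word.toLowerCase()) ? replacement : word
+ *       )
+ *     )
+ *     .join(" ");
+ * }
+ * ```
+ */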
276
+ /**
277
+ * Main scraping function that processes and filters web content.
278
+ * @param {string} url - The URL to scrape
279
+ * @returns {Promise<{flaggedDomain?: boolean, containsCensored?: boolean, filteredTexts?: string[], error?: string}>}
+ * Object containing scraping results (all fields optional) or an error message
281
+ * @description Coordinates the entire scraping process including:
282
+ * - URL validation
283
+ * - Browser management
284
+ * - Content extraction
285
+ * - Text processing
286
+ * - Content filtering
287
+ * @throws {Error} Various errors related to browser operations or content processing
288
+ */
289
+ declare function scrape(url: string, browser?: Browser | null): Promise<{
290
+ flaggedDomain?: boolean;
291
+ containsCensored?: boolean;
292
+ filteredTexts?: string[];
293
+ error?: string;
294
+ }>;
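+
+ /**
+ * Every field on the resolved object is optional, so callers should branch on
+ * `error` before reading the content fields. A minimal usage sketch (the exact
+ * branch semantics for flagged domains are an assumption based on the field names):
+ *
+ * @example
+ * ```typescript
+ * const result = await scrape("https://example.com");
+ * if (result.error) {
+ *   console.error("Scrape failed:", result.error);
+ * } else if (result.flaggedDomain) {
+ *   console.warn("Domain flagged as NSFW");
+ * } else {
+ *   console.log("Contains censored content:", result.containsCensored);
+ *   console.log("Extracted blocks:", result.filteredTexts?.length ?? 0);
+ * }
+ * ```
+ */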
295
+ /**
296
+ * Initializes the content filtering system.
297
+ * @description Creates the singleton instance of ContentFilterManager and
298
+ * loads all filtering data structures
299
+ */
300
+ declare const initializeFilterWords: () => void;
301
+ /**
302
+ * Filters text content using the ContentFilterManager.
303
+ * @param {string} text - The text to filter
304
+ * @param {string} [replace="***"] - The replacement string for filtered content
305
+ * @returns {string} The filtered text with inappropriate content replaced
306
+ * @description Provides a convenient wrapper around ContentFilterManager's filterText method
307
+ */
308
+ declare const filterText: (text: string, replace?: string) => string;
309
+
310
+ /**
311
+ * @fileoverview Advanced web scraping and content processing system that provides comprehensive functionality
312
+ * for recursive web crawling, content extraction, screenshot capture, and AI-powered content analysis.
313
+ *
314
+ * @module web
315
+ * @requires playwright - For browser automation and screenshot capture
316
+ * @requires node:path - For file path handling
317
+ * @requires node:fs/promises - For async file operations
318
+ * @requires ./scraper.js - Content extraction and filtering logic
319
+ * @requires ./content-analyzer.js - Content analysis and prompt generation
320
+ * @requires ../constants/gemini-settings.js - Configuration for Gemini LLM
321
+ * @requires ./content-filter.js - Content filtering and moderation
322
+ *
323
+ * @description
324
+ * This module implements a sophisticated web scraping and content processing system with the following key capabilities:
325
+ *
326
+ * - Multi-threaded web scraping using a thread pool for concurrent processing
327
+ * - Recursive crawling of websites while respecting domain boundaries
328
+ * - Automated screenshot capture at different scroll positions
329
+ * - Content extraction and filtering using custom scraping logic
330
+ * - AI-powered content analysis and structuring using Google's Gemini LLM
331
+ * - File-based storage of raw and processed content with organized directory structure
332
+ * - Error handling and recovery mechanisms
333
+ * - Content moderation and NSFW filtering
334
+ * - Dynamic prompt generation based on content analysis
335
+ *
336
+ * The system is designed to be highly scalable and configurable while maintaining clean separation of concerns
337
+ * between different processing stages. It uses a modular architecture with specialized components for:
338
+ *
339
+ * - Browser automation (Playwright)
340
+ * - Content extraction (Scraper)
341
+ * - Content filtering (ContentFilterManager)
342
+ * - Content analysis (ContentAnalyzer)
343
+ * - Prompt generation (PromptGenerator)
344
+ * - File system operations
345
+ *
346
+ * Key Features:
347
+ * - Configurable output directory structure
348
+ * - Automatic handling of relative/absolute URLs
349
+ * - Intelligent URL deduplication
350
+ * - Robust error handling and recovery
351
+ * - Modular design for easy extension
352
+ * - Comprehensive logging and debugging
353
+ * - Memory efficient processing
354
+ * - Rate limiting and throttling support
355
+ * - Configurable content filtering
356
+ * - AI-powered content analysis
357
+ *
358
+ * Processing Flow:
359
+ * 1. URL validation and normalization
360
+ * 2. Browser initialization with optimized settings
361
+ * 3. Page load and screenshot capture
362
+ * 4. Content extraction and initial filtering
363
+ * 5. NSFW/content moderation checks
364
+ * 6. AI-powered content analysis
365
+ * 7. File storage with organized structure
366
+ * 8. Link discovery and recursive processing
367
+ * 9. Error handling and recovery
368
+ * 10. Resource cleanup
369
+ *
370
+ * Configuration Options:
371
+ * - Output directory structure
372
+ * - Browser launch parameters
373
+ * - Content filtering rules
374
+ * - AI model settings
375
+ * - Rate limiting parameters
376
+ * - Domain boundaries
377
+ * - File naming conventions
378
+ * - Screenshot settings
379
+ *
380
+ * Error Handling:
381
+ * - Network failures
382
+ * - Invalid URLs
383
+ * - Content extraction errors
384
+ * - AI processing failures
385
+ * - File system errors
386
+ * - Memory constraints
387
+ * - Timeout conditions
388
+ *
389
+ * Performance Considerations:
390
+ * - Memory usage optimization
391
+ * - Concurrent processing limits
392
+ * - Resource cleanup
393
+ * - Caching strategies
394
+ * - Network efficiency
395
+ * - Storage optimization
396
+ *
397
+ * Security Features:
398
+ * - NSFW content filtering
399
+ * - Domain validation
400
+ * - Content sanitization
401
+ * - Resource limits
402
+ * - Safe file handling
403
+ *
404
+ * @example
405
+ * ```typescript
406
+ * // Initialize scraper with custom output directory
407
+ * const scraper = new WebScraper("custom_output");
408
+ *
409
+ * // Configure content filter
410
+ * scraper.contentFilter.setRules({
411
+ * maxLength: 10000,
412
+ * allowedDomains: ['example.com'],
413
+ * blockedKeywords: ['spam', 'adult']
414
+ * });
415
+ *
416
+ * try {
417
+ * // Start recursive scraping
418
+ * const results = await scraper.scrapeWebsite("https://example.com");
419
+ *
420
+ * // Process results
421
+ * for (const [url, result] of results) {
422
+ * if (result.error) {
423
+ * console.error(`Error processing ${url}:`, result.error);
424
+ * continue;
425
+ * }
426
+ *
427
+ * // Access processed content
428
+ * const content = await fs.readFile(result.processedContentPath, 'utf-8');
429
+ * console.log(`Processed ${url}:`, {
430
+ * rawContent: result.contentPath,
431
+ * processedContent: result.processedContentPath,
432
+ * screenshot: result.screenshot,
433
+ * timestamp: new Date(result.timestamp)
434
+ * });
435
+ * }
436
+ * } catch (error) {
437
+ * console.error("Scraping failed:", error);
438
+ * }
439
+ * ```
440
+ *
441
+ * @see {@link PageResult} for details on processing results
442
+ * @see {@link ContentFilterManager} for content filtering capabilities
443
+ * @see {@link ContentAnalyzer} for AI analysis features
444
+ * @see {@link PromptGenerator} for dynamic prompt generation
445
+ *
446
+ * @license MIT
447
+ * @author Original author and contributors
448
+ * @version 1.0.0
449
+ * @since 1.0.0
450
+ * @copyright 2024
451
+ */
452
+
453
+ /**
454
+ * Represents the complete result of processing a single web page, including all generated artifacts
455
+ * and metadata.
456
+ *
457
+ * @interface PageResult
458
+ * @property {string} url - The fully qualified URL of the processed web page
459
+ * @property {string} contentPath - Filesystem path to the raw scraped content file
460
+ * @property {string} processedContentPath - Filesystem path to the AI-processed and structured content file
461
+ * @property {string} screenshot - Filesystem path to the captured page screenshot
462
+ * @property {string} [error] - Optional error message if any stage of processing failed
463
+ * @property {number} timestamp - Unix timestamp (in milliseconds) when processing completed
464
+ *
465
+ * The PageResult interface provides a comprehensive record of all artifacts and metadata
466
+ * generated during the processing of a single web page. This includes:
467
+ *
468
+ * - Original URL for reference and deduplication
469
+ * - Paths to both raw and processed content files
470
+ * - Screenshot location for visual reference
471
+ * - Error information if processing failed
472
+ * - Timestamp for tracking and ordering
473
+ *
474
+ * Use Cases:
475
+ * - Tracking processing status and results
476
+ * - Error handling and recovery
477
+ * - Content access and retrieval
478
+ * - Processing verification
479
+ * - Audit trail
480
+ *
481
+ * @example
482
+ * ```typescript
483
+ * // Successful processing result
484
+ * const successResult: PageResult = {
485
+ * url: 'https://example.com/page',
486
+ * contentPath: 'output/content/example_com_page_1234567890.txt',
487
+ * processedContentPath: 'output/processed/example_com_page_1234567890.txt',
488
+ * screenshot: 'output/screenshots/example_com_page_0.png',
489
+ * timestamp: Date.now()
490
+ * };
491
+ *
492
+ * // Failed processing result
493
+ * const errorResult: PageResult = {
494
+ * url: 'https://example.com/invalid',
495
+ * contentPath: '',
496
+ * processedContentPath: '',
497
+ * screenshot: '',
498
+ * error: 'Failed to load page: 404 Not Found',
499
+ * timestamp: Date.now()
500
+ * };
501
+ * ```
502
+ */
503
+ interface PageResult {
504
+ url: string;
505
+ contentPath: string;
506
+ processedContentPath: string;
507
+ screenshot: string;
508
+ error?: string;
509
+ timestamp: number;
510
+ }
511
+ /**
512
+ * Core class implementing the web scraping and content processing system. Handles all aspects
513
+ * of the scraping process from URL discovery to content storage.
514
+ *
515
+ * @class WebScraper
516
+ *
517
+ * @property {Browser | null} browser - Playwright browser instance used for automation
518
+ * @property {Map<string, PageResult>} results - Map storing processing results for each URL
519
+ * @property {Set<string>} processedUrls - Set of URLs that have been processed to prevent duplicates
520
+ * @property {string} outputDir - Root directory for storing all generated files and artifacts
521
+ * @property {ContentFilterManager} contentFilter - Instance of content filtering manager
522
+ * @property {string} baseUrl - Base URL/domain for the current scraping session
523
+ *
524
+ * Key Responsibilities:
525
+ * 1. Browser Management
526
+ * - Initialization with optimized settings
527
+ * - Resource cleanup
528
+ * - Error handling
529
+ *
530
+ * 2. Content Processing
531
+ * - URL validation and normalization
532
+ * - Content extraction
533
+ * - Screenshot capture
534
+ * - AI analysis
535
+ * - Content filtering
536
+ *
537
+ * 3. File Management
538
+ * - Directory structure creation
539
+ * - File naming and organization
540
+ * - Content storage
541
+ * - Resource cleanup
542
+ *
543
+ * 4. URL Management
544
+ * - Deduplication
545
+ * - Domain boundary enforcement
546
+ * - Link discovery
547
+ * - Queue management
548
+ *
549
+ * 5. Error Handling
550
+ * - Network failures
551
+ * - Content processing errors
552
+ * - Resource constraints
553
+ * - Recovery mechanisms
554
+ *
555
+ * Processing Stages:
556
+ * 1. Initialization
557
+ * - Directory setup
558
+ * - Browser launch
559
+ * - Filter configuration
560
+ *
561
+ * 2. URL Processing
562
+ * - Validation
563
+ * - Deduplication
564
+ * - Domain checking
565
+ *
566
+ * 3. Content Extraction
567
+ * - Page loading
568
+ * - Screenshot capture
569
+ * - Content scraping
570
+ *
571
+ * 4. Content Processing
572
+ * - Filtering
573
+ * - AI analysis
574
+ * - Structure generation
575
+ *
576
+ * 5. Storage
577
+ * - File organization
578
+ * - Content saving
579
+ * - Metadata tracking
580
+ *
581
+ * 6. Link Discovery
582
+ * - URL extraction
583
+ * - Validation
584
+ * - Queue management
585
+ *
586
+ * 7. Cleanup
587
+ * - Resource release
588
+ * - Error handling
589
+ * - Status reporting
590
+ *
591
+ * @example
592
+ * ```typescript
593
+ * // Initialize scraper with custom settings
594
+ * const scraper = new WebScraper("output_dir");
595
+ *
596
+ * try {
597
+ * // Configure content filter
598
+ * scraper.contentFilter.setRules({
599
+ * maxLength: 50000,
600
+ * allowedDomains: ['example.com']
601
+ * });
602
+ *
603
+ * // Start recursive scraping
604
+ * const results = await scraper.scrapeWebsite("https://example.com");
605
+ *
606
+ * // Process results
607
+ * for (const [url, result] of results) {
608
+ * if (result.error) {
609
+ * console.error(`Error processing ${url}:`, result.error);
610
+ * continue;
611
+ * }
612
+ *
613
+ * // Access processed content
614
+ * const content = await fs.readFile(result.processedContentPath, 'utf-8');
615
+ * console.log(`Successfully processed ${url}`);
616
+ * }
617
+ * } catch (error) {
618
+ * console.error("Scraping failed:", error);
619
+ * }
620
+ * ```
621
+ *
622
+ * @throws {Error} Invalid URL provided
623
+ * @throws {Error} Browser initialization failed
624
+ * @throws {Error} Content processing failed
625
+ * @throws {Error} File system operation failed
626
+ */
627
+ declare class WebScraper {
628
+ private browser;
629
+ private results;
630
+ private processedUrls;
631
+ private outputDir;
632
+ readonly contentFilter: ContentFilterManager;
633
+ private baseUrl;
634
+ private sentimentAnalyzer;
635
+ /**
636
+ * Creates a new WebScraper instance.
637
+ *
638
+ * @param {string} outputDir - Directory where scraped content and artifacts will be stored
639
+ * @default "scraping_output"
640
+ *
641
+ * The constructor initializes a new WebScraper instance with the following setup:
642
+ *
643
+ * 1. Output Directory
644
+ * - Creates base directory for all artifacts
645
+ * - Organizes subdirectories for different content types
646
+ * - Handles path normalization
647
+ *
648
+ * 2. Content Filter
649
+ * - Initializes content filtering system
650
+ * - Sets up default filtering rules
651
+ * - Prepares moderation capabilities
652
+ *
653
+ * Directory Structure:
654
+ * ```
655
+ * outputDir/
656
+ * ├── content/ # Raw scraped content
657
+ * │ └── [domain]/ # Organized by domain
658
+ * ├── processed/ # AI-processed content
659
+ * │ └── [domain]/ # Organized by domain
660
+ * └── screenshots/ # Page screenshots
661
+ * └── [domain]/ # Organized by domain
662
+ * ```
663
+ *
664
+ * @example
665
+ * ```typescript
666
+ * // Basic initialization
667
+ * const scraper = new WebScraper();
668
+ *
669
+ * // Custom output directory
670
+ * const customScraper = new WebScraper("custom/output/path");
671
+ * ```
672
+ *
673
+ * @throws {Error} If directory creation fails
674
+ * @throws {Error} If content filter initialization fails
675
+ */
676
+ constructor(outputDir?: string);
677
+ /**
678
+ * Main entry point for scraping a website. Initializes the browser, processes the starting URL,
679
+ * and recursively crawls linked pages within the same domain.
680
+ *
681
+ * Processing Flow:
682
+ * 1. URL Validation
683
+ * - Format checking
684
+ * - Domain extraction
685
+ * - Protocol verification
686
+ *
687
+ * 2. Environment Setup
688
+ * - Directory initialization
689
+ * - Browser launch
690
+ * - Resource allocation
691
+ *
692
+ * 3. Content Processing
693
+ * - Page loading
694
+ * - Content extraction
695
+ * - Screenshot capture
696
+ * - AI analysis
697
+ *
698
+ * 4. Link Discovery
699
+ * - URL extraction
700
+ * - Domain filtering
701
+ * - Queue management
702
+ *
703
+ * 5. Resource Management
704
+ * - Memory monitoring
705
+ * - Connection handling
706
+ * - Cleanup operations
707
+ *
708
+ * Error Handling:
709
+ * - Invalid URLs
710
+ * - Network failures
711
+ * - Browser crashes
712
+ * - Memory constraints
713
+ * - Timeout conditions
714
+ *
715
+ * @param {string} url - Starting URL to begin scraping from
716
+ * @returns {Promise<Map<string, PageResult>>} Map of results for all processed URLs
717
+ * @throws {Error} If URL is invalid or scraping fails
718
+ *
719
+ * @example
720
+ * ```typescript
721
+ * const scraper = new WebScraper("output");
722
+ *
723
+ * try {
724
+ * // Start scraping
725
+ * const results = await scraper.scrapeWebsite("https://example.com");
726
+ *
727
+ * // Process successful results
728
+ * for (const [url, result] of results) {
729
+ * if (!result.error) {
730
+ * console.log(`Successfully processed ${url}`);
731
+ * console.log(`Content saved to: ${result.processedContentPath}`);
732
+ * console.log(`Screenshot saved to: ${result.screenshot}`);
733
+ * }
734
+ * }
735
+ *
736
+ * // Handle errors
737
+ * const errors = Array.from(results.entries())
738
+ * .filter(([_, result]) => result.error)
739
+ * .map(([url, result]) => ({url, error: result.error}));
740
+ *
741
+ * if (errors.length > 0) {
742
+ * console.error("Encountered errors:", errors);
743
+ * }
744
+ * } catch (error) {
745
+ * console.error("Fatal error during scraping:", error);
746
+ * }
747
+ * ```
748
+ */
749
+ scrapeWebsite(url: string): Promise<Map<string, PageResult>>;
750
+ /**
751
+ * Creates required output directories if they don't exist.
752
+ *
753
+ * Directory Structure:
754
+ * ```
755
+ * outputDir/
756
+ * ├── content/ # Raw scraped content
757
+ * │ └── [domain]/ # Organized by domain
758
+ * ├── processed/ # AI-processed content
759
+ * │ └── [domain]/ # Organized by domain
760
+ * └── screenshots/ # Page screenshots
761
+ * └── [domain]/ # Organized by domain
762
+ * ```
763
+ *
764
+ * @private
765
+ * @returns {Promise<void>}
766
+ *
767
+ * @throws {Error} If directory creation fails
768
+ * @throws {Error} If permissions are insufficient
769
+ * @throws {Error} If disk space is insufficient
770
+ */
771
+ private initializeDirectories;
772
+ /**
773
+ * Processes a single web page, extracting content, capturing a screenshot, and analyzing content.
774
+ * Also discovers and processes linked pages within the same domain.
775
+ *
776
+ * Processing Stages:
777
+ * 1. URL Validation
778
+ * - Format checking
779
+ * - Deduplication
780
+ * - Content type verification
781
+ *
782
+ * 2. Content Safety
783
+ * - Domain checking
784
+ * - NSFW detection
785
+ * - Content moderation
786
+ *
787
+ * 3. Page Processing
788
+ * - Loading and rendering
789
+ * - Screenshot capture
790
+ * - Content extraction
791
+ *
792
+ * 4. Content Analysis
793
+ * - Text filtering
794
+ * - AI processing
795
+ * - Structure generation
796
+ *
797
+ * 5. Link Discovery
798
+ * - URL extraction
799
+ * - Domain filtering
800
+ * - Queue management
801
+ *
802
+ * Error Handling:
803
+ * - Network failures
804
+ * - Timeout conditions
805
+ * - Content extraction errors
806
+ * - Processing failures
807
+ * - Resource constraints
808
+ *
809
+ * @private
810
+ * @param {string} url - URL of the page to process
811
+ * @returns {Promise<PageResult>} Processing result for the page
812
+ *
813
+ * @example
814
+ * ```typescript
815
+ * try {
816
+ * const result = await scraper.processSinglePage("https://example.com/page");
817
+ *
818
+ * if (result.error) {
819
+ * console.error(`Processing failed: ${result.error}`);
820
+ * return;
821
+ * }
822
+ *
823
+ * // Access results
824
+ * console.log("Raw content:", result.contentPath);
825
+ * console.log("Processed content:", result.processedContentPath);
826
+ * console.log("Screenshot:", result.screenshot);
827
+ * console.log("Processed at:", new Date(result.timestamp));
828
+ * } catch (error) {
829
+ * console.error("Fatal error:", error);
830
+ * }
831
+ * ```
832
+ *
833
+ * @throws {Error} If page loading fails
834
+ * @throws {Error} If content extraction fails
835
+ * @throws {Error} If processing fails
836
+ */
837
+ private processSinglePage;
838
+ /**
839
+ * Processes extracted content using Google's Gemini LLM for analysis and structuring.
840
+ *
841
+ * Processing Steps:
842
+ * 1. Content Preparation
843
+ * - Text filtering
844
+ * - Format validation
845
+ * - Length checking
846
+ *
847
+ * 2. Context Analysis
848
+ * - URL analysis
849
+ * - Content type detection
850
+ * - Structure identification
851
+ *
852
+ * 3. Prompt Generation
853
+ * - Dynamic template selection
854
+ * - Context integration
855
+ * - Parameter optimization
856
+ *
857
+ * 4. AI Processing
858
+ * - Model selection
859
+ * - Safety settings
860
+ * - Response handling
861
+ *
862
+ * Error Handling:
863
+ * - Content validation
864
+ * - Model errors
865
+ * - Timeout conditions
866
+ * - Response validation
867
+ *
868
+ * @private
869
+ * @param {string} content - Raw content to process
870
+ * @param {string} url - URL of the content source
871
+ * @returns {Promise<string>} Processed and structured content
872
+ *
873
+ * @throws {Error} If content is invalid
874
+ * @throws {Error} If LLM processing fails
875
+ * @throws {Error} If response is invalid
876
+ *
877
+ * @example
878
+ * ```typescript
879
+ * try {
880
+ * const rawContent = "Example raw content...";
881
+ * const url = "https://example.com";
882
+ *
883
+ * const processed = await scraper.processWithLLM(rawContent, url);
884
+ * console.log("Processed content:", processed);
885
+ * } catch (error) {
886
+ * console.error("Processing failed:", error);
887
+ * }
888
+ * ```
889
+ */
890
+ private processWithLLM;
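+
+ /**
+ * Illustrative sketch of the Gemini call outlined above, wired to the `genAI`,
+ * `model`, and `safetySettings` constants declared later in this file and to
+ * ContentAnalyzer/PromptGenerator. Error handling and the response-validation
+ * steps described above are omitted; this is not the actual private implementation.
+ *
+ * @example
+ * ```typescript
+ * async function processWithLLMSketch(content: string, url: string): Promise<string> {
+ *   const generativeModel = genAI.getGenerativeModel({ model, safetySettings });
+ *   const context = ContentAnalyzer.analyzeContent(url, content);
+ *   const prompt = PromptGenerator.generatePrompt(context, content);
+ *   const result = await generativeModel.generateContent(prompt);
+ *   return result.response.text();
+ * }
+ * ```
+ */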
891
+ /**
892
+ * Takes a full page screenshot of the current page
893
+ *
894
+ * Screenshot Process:
895
+ * 1. Page Preparation
896
+ * - Viewport setup
897
+ * - Content loading
898
+ * - Animation completion
899
+ *
900
+ * 2. Capture Settings
901
+ * - Full page mode
902
+ * - Resolution configuration
903
+ * - Format selection
904
+ *
905
+ * 3. File Management
906
+ * - Path generation
907
+ * - Directory creation
908
+ * - File saving
909
+ *
910
+ * Error Handling:
911
+ * - Page loading issues
912
+ * - Screenshot failures
913
+ * - Storage errors
914
+ *
915
+ * @private
916
+ * @param {Page} page - Playwright page instance
917
+ * @param {string} url - URL being captured
918
+ * @returns {Promise<string>} Path to saved screenshot
919
+ *
920
+ * @throws {Error} If screenshot capture fails
921
+ * @throws {Error} If file saving fails
922
+ *
923
+ * @example
924
+ * ```typescript
925
+ * const page = await browser.newPage();
926
+ * await page.goto(url);
927
+ *
928
+ * try {
929
+ * const screenshotPath = await scraper.takeScreenshot(page, url);
930
+ * console.log("Screenshot saved to:", screenshotPath);
931
+ * } catch (error) {
932
+ * console.error("Screenshot capture failed:", error);
933
+ * }
934
+ * ```
935
+ */
936
+ private takeScreenshot;
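+
+ /**
+ * Sketch of the full-page capture described above using Playwright's
+ * `page.screenshot`; the URL and output path here are placeholders.
+ *
+ * @example
+ * ```typescript
+ * import { chromium } from "playwright";
+ *
+ * const browser = await chromium.launch({ headless: true });
+ * const page = await browser.newPage();
+ * await page.goto("https://example.com", { waitUntil: "networkidle" });
+ * await page.screenshot({ path: "output/screenshots/example_com.png", fullPage: true });
+ * await browser.close();
+ * ```
+ */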
937
+ /**
938
+ * Saves content to a file with organized directory structure based on URL path.
939
+ *
940
+ * File Organization:
941
+ * 1. Path Generation
942
+ * - URL parsing
943
+ * - Path cleaning
944
+ * - Directory structure
945
+ *
946
+ * 2. Content Validation
947
+ * - File type checking
948
+ * - Content verification
949
+ * - Size limits
950
+ *
951
+ * 3. Directory Management
952
+ * - Path creation
953
+ * - Permissions
954
+ * - Existing files
955
+ *
956
+ * 4. File Operations
957
+ * - Content writing
958
+ * - Atomic saves
959
+ * - Cleanup
960
+ *
961
+ * Directory Structure:
962
+ * ```
963
+ * outputDir/
964
+ * └── [domain]/
965
+ * ├── content/
966
+ * │ └── [path]/
967
+ * │ └── content-[timestamp].txt
968
+ * ├── processed/
969
+ * │ └── [path]/
970
+ * │ └── processed-[timestamp].txt
971
+ * └── screenshots/
972
+ * └── [path]/
973
+ * └── screenshot-[timestamp].png
974
+ * ```
975
+ *
976
+ * @private
977
+ * @param {string} content - Content to save
978
+ * @param {'content' | 'processed' | 'screenshots'} type - Type of content being saved
979
+ * @param {string} url - Source URL
980
+ * @param {string} [fileExtension='.txt'] - File extension to use
981
+ * @returns {Promise<string>} Path to saved file
982
+ *
983
+ * @throws {Error} If file is non-textual
984
+ * @throws {Error} If saving fails
985
+ * @throws {Error} If directory creation fails
986
+ *
987
+ * @example
988
+ * ```typescript
989
+ * try {
990
+ * // Save raw content
991
+ * const contentPath = await scraper.saveToFile(
992
+ * "Raw content...",
993
+ * "content",
994
+ * "https://example.com/page"
995
+ * );
996
+ *
997
+ * // Save processed content
998
+ * const processedPath = await scraper.saveToFile(
999
+ * "Processed content...",
1000
+ * "processed",
1001
+ * "https://example.com/page"
1002
+ * );
1003
+ *
1004
+ * console.log("Content saved to:", contentPath);
1005
+ * console.log("Processed content saved to:", processedPath);
1006
+ * } catch (error) {
+ * console.error("Saving failed:", error);
+ * }
+ * ```
1008
+ */
1009
+ private saveToFile;
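+
+ /**
+ * Sketch of the domain/path based layout documented above. The exact sanitisation
+ * and file-naming scheme is an assumption derived from the directory diagram; the
+ * helper name is hypothetical.
+ *
+ * @example
+ * ```typescript
+ * import path from "node:path";
+ * import { mkdir, writeFile } from "node:fs/promises";
+ *
+ * async function saveToFileSketch(
+ *   content: string,
+ *   type: "content" | "processed" | "screenshots",
+ *   url: string,
+ *   outputDir = "scraping_output",
+ *   ext = ".txt"
+ * ): Promise<string> {
+ *   const { hostname, pathname } = new URL(url);
+ *   const safePath = pathname.replace(/[^a-z0-9/]/gi, "_");
+ *   const dir = path.join(outputDir, hostname, type, safePath);
+ *   await mkdir(dir, { recursive: true });
+ *   const filePath = path.join(dir, `${type}-${Date.now()}${ext}`);
+ *   await writeFile(filePath, content, "utf-8");
+ *   return filePath;
+ * }
+ * ```
+ */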
1010
+ /**
1011
+ * Validates AI generated content for safety and sentiment
1012
+ * @private
1013
+ * @param {string} content - AI generated content to validate
1014
+ * @returns {Promise<{isValid: boolean, reason?: string}>}
1015
+ */
1016
+ private validateAIResponse;
1017
+ /**
1018
+ * Process AI response with safety checks
1019
+ * @private
1020
+ * @param {string} aiResponse - Response from AI model
1021
+ * @returns {Promise<string>} Validated and processed response
1022
+ * @throws {Error} If content validation fails
1023
+ */
1024
+ private processAIResponse;
1025
+ }
1026
+
1027
+ /**
1028
+ * @fileoverview Content analysis and prompt generation system for web content processing
1029
+ * @module content-analyzer
1030
+ * @description Provides comprehensive functionality for analyzing web content structure and generating
1031
+ * context-aware prompts for LLM processing. The module includes two main classes:
1032
+ * - ContentAnalyzer: Analyzes web content to determine its context and characteristics
1033
+ * - PromptGenerator: Generates tailored prompts based on the analyzed content context
1034
+ *
1035
+ * Key features:
1036
+ * - URL pattern matching and content signal detection
1037
+ * - Content type classification (article, product, profile, etc.)
1038
+ * - Structure analysis (narrative, analytical, technical, etc.)
1039
+ * - Context-aware prompt generation
1040
+ * - Flexible template system for different content types
1041
+ *
1042
+ * @example
1043
+ * ```typescript
1044
+ * // Analyze content
1045
+ * const context = ContentAnalyzer.analyzeContent(url, htmlContent);
1046
+ *
1047
+ * // Generate appropriate prompt
1048
+ * const prompt = PromptGenerator.generatePrompt(context, content);
1049
+ * ```
1050
+ */
1051
+ /**
1052
+ * Represents the context and characteristics of analyzed content
1053
+ * @interface ContentContext
1054
+ * @property {('article'|'product'|'category'|'profile'|'general')} pageType - The type of page being analyzed:
1055
+ * - article: Blog posts, news articles, editorial content
1056
+ * - product: Product pages, item listings, shop entries
1057
+ * - category: Category pages, department listings, sections
1058
+ * - profile: User profiles, about pages, portfolios
1059
+ * - general: Default type for unclassified content
1060
+ * @property {('brief'|'standard'|'detailed')} contentLength - The relative length/depth of the content:
1061
+ * - brief: Short-form content, summaries
1062
+ * - standard: Medium-length content
1063
+ * - detailed: Long-form, in-depth content
1064
+ * @property {('narrative'|'analytical'|'technical'|'descriptive')} structureType - The structural style of the content:
1065
+ * - narrative: Story-based, chronological flow
1066
+ * - analytical: Data-driven, research-oriented
1067
+ * - technical: Specification-focused, procedural
1068
+ * - descriptive: Feature-focused, explanatory
1069
+ * @property {('general'|'technical'|'business'|'academic')} targetAudience - The intended audience for the content:
1070
+ * - general: General public, non-specialized readers
1071
+ * - technical: Technical professionals, developers
1072
+ * - business: Business professionals, stakeholders
1073
+ * - academic: Researchers, students, educators
1074
+ */
1075
+ interface ContentContext$1 {
1076
+ pageType: 'article' | 'product' | 'category' | 'profile' | 'general';
1077
+ contentLength: 'brief' | 'standard' | 'detailed';
1078
+ structureType: 'narrative' | 'analytical' | 'technical' | 'descriptive';
1079
+ targetAudience: 'general' | 'technical' | 'business' | 'academic';
1080
+ }
1081
+ /**
1082
+ * Analyzes web content to determine its context and characteristics
1083
+ * @class ContentAnalyzer
1084
+ * @static
1085
+ * @description Provides static methods for analyzing web content and determining its context.
1086
+ * Uses pattern matching and content signals to classify content and determine appropriate
1087
+ * processing strategies. The analysis considers:
1088
+ * - URL patterns and structure
1089
+ * - Content keywords and signals
1090
+ * - Document structure and elements
1091
+ * - Content indicators and metadata
1092
+ */
1093
+ declare class ContentAnalyzer {
1094
+ /**
1095
+ * Predefined patterns for analyzing different types of content routes
1096
+ * @private
1097
+ * @static
1098
+ * @readonly
1099
+ * @type {RouteAnalysis[]}
1100
+ * @description Array of route analysis configurations that define:
1101
+ * - URL patterns to match different content types
1102
+ * - Content signals associated with each type
1103
+ * - Default context settings for matched content
1104
+ * Each configuration targets a specific content category (article, product, etc.)
1105
+ * and provides the basis for content classification.
1106
+ */
1107
+ private static readonly routePatterns;
1108
+ /**
1109
+ * Extracts content signals from HTML content by analyzing structure and keywords
1110
+ * @private
1111
+ * @static
1112
+ * @param {string} content - The HTML content to analyze
1113
+ * @returns {Set<string>} Set of identified content signals
1114
+ * @description Analyzes HTML content to identify structural and keyword signals:
1115
+ * - Checks for presence of headers, lists, and tables
1116
+ * - Identifies content-specific keywords and patterns
1117
+ * - Detects pricing information and author attribution
1118
+ * - Recognizes profile and biographical content
1119
+ * The signals are used to help classify and contextualize the content.
1120
+ */
1121
+ private static getContentSignals;
1122
+ /**
1123
+ * Analyzes content and URL to determine the appropriate content context
1124
+ * @public
1125
+ * @static
1126
+ * @param {string} url - The URL of the content being analyzed
1127
+ * @param {string} content - The HTML content to analyze
1128
+ * @returns {ContentContext} The determined content context
1129
+ * @description Performs comprehensive content analysis by:
1130
+ * 1. Extracting and analyzing the URL path
1131
+ * 2. Identifying content signals from the HTML
1132
+ * 3. Matching against predefined route patterns
1133
+ * 4. Determining the most appropriate content context
1134
+ * If no specific matches are found, returns a default general context.
1135
+ *
1136
+ * @example
1137
+ * ```typescript
1138
+ * const context = ContentAnalyzer.analyzeContent(
1139
+ * 'https://example.com/blog/article-1',
1140
+ * '<html>...</html>'
1141
+ * );
1142
+ * ```
1143
+ */
1144
+ static analyzeContent(url: string, content: string): ContentContext$1;
1145
+ }
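+
+ /**
+ * Illustrative sketch of the route-pattern matching this class description
+ * outlines, using the RouteAnalysis and ContentContext shapes declared later in
+ * this file. The concrete patterns and signals are examples, not the real tables.
+ *
+ * @example
+ * ```typescript
+ * const articleRoute: RouteAnalysis = {
+ *   patterns: [/\/blog\//, /\/news\//, /\/article\//],
+ *   signals: ["author", "published", "min read"],
+ *   context: {
+ *     pageType: "article",
+ *     contentLength: "detailed",
+ *     structureType: "narrative",
+ *     targetAudience: "general",
+ *   },
+ * };
+ *
+ * function matchRouteSketch(url: string, signals: Set<string>): ContentContext | null {
+ *   const byUrl = articleRoute.patterns.some(p => p.test(new URL(url).pathname));
+ *   const bySignal = articleRoute.signals.some(s => signals.has(s));
+ *   return byUrl || bySignal ? articleRoute.context : null;
+ * }
+ * ```
+ */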
1146
+ /**
1147
+ * Generates context-aware prompts for LLM content processing
1148
+ * @class PromptGenerator
1149
+ * @static
1150
+ * @description Provides functionality for generating tailored prompts based on content context.
1151
+ * Features:
1152
+ * - Template-based prompt generation
1153
+ * - Context-aware prompt customization
1154
+ * - Support for multiple content types and structures
1155
+ * - Fallback to default prompts when needed
1156
+ */
1157
+ declare class PromptGenerator {
1158
+ /**
1159
+ * Template definitions for different content types and structures
1160
+ * @private
1161
+ * @static
1162
+ * @readonly
1163
+ * @type {Object}
1164
+ * @description Comprehensive template system that provides:
1165
+ * - Content type-specific templates (article, product, profile)
1166
+ * - Structure-specific variations (narrative, analytical, technical)
1167
+ * - Detailed processing instructions
1168
+ * - Placeholder support for content insertion
1169
+ * Templates are organized hierarchically by content type and structure.
1170
+ */
1171
+ private static readonly promptTemplates;
1172
+ /**
1173
+ * Generates an appropriate prompt based on content context
1174
+ * @public
1175
+ * @static
1176
+ * @param {ContentContext} context - The analyzed content context
1177
+ * @param {string} content - The content to be processed
1178
+ * @returns {string} Generated prompt for LLM processing
1179
+ * @description Generates a context-appropriate prompt by:
1180
+ * 1. Selecting appropriate template based on content type
1181
+ * 2. Choosing structure-specific variation
1182
+ * 3. Inserting content into template
1183
+ * 4. Falling back to default prompt if no specific template exists
1184
+ *
1185
+ * @example
1186
+ * ```typescript
1187
+ * const prompt = PromptGenerator.generatePrompt(
1188
+ * { pageType: 'article', structureType: 'narrative', ... },
1189
+ * 'Article content...'
1190
+ * );
1191
+ * ```
1192
+ */
1193
+ static generatePrompt(context: ContentContext$1, content: string): string;
1194
+ /**
1195
+ * Provides a default prompt when no specific template matches
1196
+ * @private
1197
+ * @static
1198
+ * @param {string} content - The content to be processed
1199
+ * @returns {string} Default analysis prompt
1200
+ * @description Generates a generic but comprehensive prompt for content analysis
1201
+ * when no specific template matches the content context. The default prompt
1202
+ * focuses on:
1203
+ * - Topic extraction
1204
+ * - Content organization
1205
+ * - Redundancy removal
1206
+ * - Clarity and readability
1207
+ */
1208
+ private static getDefaultPrompt;
1209
+ }
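+
+ /**
+ * Sketch of the hierarchical template lookup with fallback described above.
+ * Template wording and the placeholder syntax are illustrative only.
+ *
+ * @example
+ * ```typescript
+ * const templates: Record<string, Record<string, string>> = {
+ *   article: {
+ *     narrative: "Summarize this article as a chronological narrative:\n\n{{content}}",
+ *     analytical: "Extract the key findings and supporting data:\n\n{{content}}",
+ *   },
+ * };
+ *
+ * function generatePromptSketch(context: ContentContext, content: string): string {
+ *   const template =
+ *     templates[context.pageType]?.[context.structureType] ??
+ *     "Extract the main topics and organize the content clearly:\n\n{{content}}";
+ *   return template.replace("{{content}}", content);
+ * }
+ * ```
+ */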
1210
+
1211
+ interface WorkerMessage<T = any> {
1212
+ id: string;
1213
+ type: "TASK" | "RESULT" | "ERROR" | "STATUS" | "READY" | "INIT";
1214
+ payload: T;
1215
+ timestamp: number;
1216
+ }
1217
+ interface WorkerTask {
1218
+ id: string;
1219
+ type: "TASK" | "RESULT" | "ERROR" | "STATUS" | "READY" | "INIT";
1220
+ url?: string;
1221
+ data?: any;
1222
+ }
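+
+ /**
+ * WorkerMessage and WorkerTask describe the thread-pool protocol mentioned in the
+ * module overview. A minimal dispatch sketch with node:worker_threads follows; the
+ * worker script path is hypothetical.
+ *
+ * @example
+ * ```typescript
+ * import { Worker } from "node:worker_threads";
+ *
+ * const worker = new Worker("./scrape-worker.js");
+ * const task: WorkerTask = { id: "task-1", type: "TASK", url: "https://example.com" };
+ * const message: WorkerMessage<WorkerTask> = {
+ *   id: task.id,
+ *   type: "TASK",
+ *   payload: task,
+ *   timestamp: Date.now(),
+ * };
+ * worker.postMessage(message);
+ *
+ * worker.on("message", (msg: WorkerMessage) => {
+ *   if (msg.type === "RESULT") console.log("Worker finished:", msg.payload);
+ *   if (msg.type === "ERROR") console.error("Worker error:", msg.payload);
+ * });
+ * ```
+ */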
1223
+ interface TrieNode {
1224
+ children: {
1225
+ [key: string]: TrieNode;
1226
+ };
1227
+ isEndOfWord: boolean;
1228
+ }
1229
+ interface ContentContext {
1230
+ pageType: 'article' | 'product' | 'category' | 'profile' | 'general';
1231
+ contentLength: 'brief' | 'standard' | 'detailed';
1232
+ structureType: 'narrative' | 'analytical' | 'technical' | 'descriptive';
1233
+ targetAudience: 'general' | 'technical' | 'business' | 'academic';
1234
+ }
1235
+ interface RouteAnalysis {
1236
+ patterns: RegExp[];
1237
+ signals: string[];
1238
+ context: ContentContext;
1239
+ }
1240
+ interface CodeBlock {
1241
+ language: string;
1242
+ code: string;
1243
+ lineNumbers?: boolean;
1244
+ }
1245
+
1246
+ declare const nsfwNames: Readonly<Record<string, string>>;
1247
+ type NsfwName = keyof typeof nsfwNames;
1248
+
1249
+ declare const nsfw: Readonly<Record<string, string>>;
1250
+
1251
+ declare const robots: Readonly<Record<string, string>>;
1252
+ type Robot = keyof typeof robots;
1253
+
1254
+ declare const slurs: Readonly<Record<string, string>>;
1255
+ type Slur = keyof typeof slurs;
1256
+
1257
+ /**
1258
+ * @fileoverview Configuration settings for Google's Gemini AI model integration
1259
+ * @module gemini-settings
1260
+ * @description Provides configuration constants and settings for interacting with the Gemini AI API,
1261
+ * including model selection, API authentication, and content safety thresholds
1262
+ */
1263
+
1264
+ /**
1265
+ * The specific Gemini model version to use
1266
+ * @constant {string}
1267
+ * @description Specifies the Gemini 1.5 Flash model, optimized for fast inference
1268
+ */
1269
+ declare const model = "gemini-1.5-flash";
1270
+ /**
1271
+ * Google AI API key loaded from environment variables
1272
+ * @constant {string}
1273
+ * @description API key for authenticating with Google's AI services. Falls back to empty string if not configured
1274
+ */
1275
+ declare const API_KEY: string;
1276
+ /**
1277
+ * Initialized Google Generative AI client
1278
+ * @constant {GoogleGenerativeAI}
1279
+ * @description Main client instance for interacting with Gemini AI services
1280
+ */
1281
+ declare const genAI: GoogleGenerativeAI;
1282
+ /**
1283
+ * Generation configuration settings
1284
+ * @constant {undefined}
1285
+ * @description Currently undefined, can be used to specify generation parameters like temperature, top-k, etc.
1286
+ */
1287
+ declare const generationConfig: undefined;
1288
+ /**
1289
+ * Content safety threshold settings
1290
+ * @constant {Array<{category: HarmCategory, threshold: HarmBlockThreshold}>}
1291
+ * @description Configures content filtering thresholds for different harm categories:
1292
+ * - Harassment
1293
+ * - Hate Speech
1294
+ * - Sexually Explicit Content
1295
+ * - Dangerous Content
1296
+ * All thresholds are currently set to BLOCK_NONE for maximum permissiveness
1297
+ */
1298
+ declare const safetySettings: {
1299
+ category: HarmCategory;
1300
+ threshold: HarmBlockThreshold;
1301
+ }[];
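+
+ /**
+ * Sketch of how a BLOCK_NONE safety configuration like the one described above is
+ * typically assembled with the HarmCategory and HarmBlockThreshold enums imported
+ * at the top of this file:
+ *
+ * @example
+ * ```typescript
+ * const permissiveSafetySettings = [
+ *   { category: HarmCategory.HARM_CATEGORY_HARASSMENT, threshold: HarmBlockThreshold.BLOCK_NONE },
+ *   { category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold: HarmBlockThreshold.BLOCK_NONE },
+ *   { category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold: HarmBlockThreshold.BLOCK_NONE },
+ *   { category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold: HarmBlockThreshold.BLOCK_NONE },
+ * ];
+ * ```
+ */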
1302
+
1303
+ export { API_KEY, type CodeBlock, ContentAnalyzer, type ContentContext, ContentFilterManager, type NsfwName, PromptGenerator, type Robot, type RouteAnalysis, type Slur, type TrieNode, WebScraper, type WorkerMessage, type WorkerTask, filterText, genAI, generationConfig, initializeFilterWords, model, nsfw, nsfwNames, robots, safetySettings, scrape, slurs };