enterprise-ai-recursive-web-scraper 1.0.0
- package/LICENSE.md +20 -0
- package/README.md +122 -0
- package/lib/cli.cjs +45382 -0
- package/lib/cli.cjs.map +1 -0
- package/lib/cli.d.cts +1 -0
- package/lib/cli.d.ts +1 -0
- package/lib/cli.js +45364 -0
- package/lib/cli.js.map +1 -0
- package/lib/index.cjs +45402 -0
- package/lib/index.cjs.map +1 -0
- package/lib/index.d.cts +1303 -0
- package/lib/index.d.ts +1303 -0
- package/lib/index.js +45373 -0
- package/lib/index.js.map +1 -0
- package/package.json +66 -0
package/lib/index.d.cts
ADDED
@@ -0,0 +1,1303 @@
import { Browser } from 'puppeteer';
import { GoogleGenerativeAI, HarmCategory, HarmBlockThreshold } from '@google/generative-ai';

/**
 * @fileoverview Enhanced web scraping and content filtering functions for detecting and filtering inappropriate content
 * @file scraper.ts
 * @module scraper
 * @description This module provides functionality for web scraping with content filtering capabilities.
 * It includes classes for managing browser operations, text processing, and content filtering using
 * Trie data structures. The module is designed to detect and filter NSFW content, slurs, and other
 * inappropriate content from web pages.
 *
 * Key features:
 * - Web scraping using Puppeteer with stealth and ad-blocking capabilities
 * - Content filtering using Trie data structures for efficient pattern matching
 * - Text processing with duplicate detection and removal
 * - NSFW domain detection and filtering
 * - Configurable content replacement
 *
 * Classes:
 * - ContentTrie: Trie data structure for efficient string matching
 * - ContentFilterManager: Singleton manager for content filtering operations
 * - TextProcessor: Text cleaning and duplicate detection utility
 * - BrowserManager: Browser and page management for scraping
 *
 * @example
 * ```typescript
 * // Initialize the filtering system
 * initializeFilterWords();
 *
 * // Scrape and filter content from a URL
 * const result = await scrape('https://example.com');
 * if ('error' in result) {
 *   console.error(result.error);
 * } else {
 *   console.log(result.filteredTexts);
 * }
 *
 * // Filter individual text
 * const filtered = filterText('text to filter');
 * ```
 *
 * @requires puppeteer-extra - Enhanced version of Puppeteer with plugin support
 * @requires puppeteer-extra-plugin-adblocker - Plugin for blocking ads and trackers
 * @requires puppeteer-extra-plugin-stealth - Plugin for avoiding bot detection
 *
 * @license MIT
 * @author Original author and contributors
 * @version 1.0.0
 * @since 1.0.0
 *
 * @see {@link https://github.com/berstend/puppeteer-extra|puppeteer-extra} - Enhanced version of Puppeteer
 * @see {@link https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth|puppeteer-extra-plugin-stealth} - Stealth plugin for avoiding detection
 * @see {@link https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-adblocker|puppeteer-extra-plugin-adblocker} - Ad blocking plugin
 *
 * @todo Add support for custom filtering rules
 * @todo Improve error handling and recovery
 * @todo Add rate limiting and request throttling
 * @todo Implement caching for frequently accessed content
 * @todo Add support for proxy rotation
 * @todo Improve duplicate detection algorithms
 * @todo Add support for custom content processors
 * @todo Implement better logging and monitoring
 * @todo Add support for distributed scraping
 * @todo Improve memory management for large-scale scraping
 *
 * @throws {Error} When filter initialization fails
 * @throws {Error} When browser operations fail
 * @throws {Error} When content processing fails
 * @throws {Error} When network operations fail
 *
 * @property {ContentFilterManager} filterManager - Singleton instance for content filtering
 * @property {BrowserManager} browserManager - Static class for browser operations
 * @property {TextProcessor} textProcessor - Static class for text processing
 *
 * @borrows ContentFilterManager.filterText as filterText
 * @borrows ContentFilterManager.getInstance as getFilterManager
 * @borrows BrowserManager.launch as launchBrowser
 * @borrows TextProcessor.processText as processText
 *
 * @exports scrape - Main scraping function
 * @exports initializeFilterWords - Filter initialization function
 * @exports filterText - Text filtering function
 * @exports ContentFilterManager - Content filtering manager class
 *
 * @typedef {Object} ScrapingResult
 * @property {boolean} [flaggedDomain] - Whether the domain is flagged as NSFW
 * @property {boolean} [containsCensored] - Whether censored content was found
 * @property {string[]} [filteredTexts] - Array of filtered text content
 * @property {string} [error] - Error message if scraping failed
 *
 * @typedef {Object} CodeBlock
 * @property {string} language - Programming language of the code block
 * @property {string} code - The actual code content
 * @property {boolean} lineNumbers - Whether line numbers should be displayed
 *
 * @typedef {Object} TrieNode
 * @property {Object.<string, TrieNode>} children - Child nodes in the Trie
 * @property {boolean} isEndOfWord - Whether this node represents end of word
 *
 * @typedef {Object} ContentExtractionResult
 * @property {string[]} texts - Array of extracted text content
 * @property {CodeBlock[]} codeBlocks - Array of extracted code blocks
 *
 * @typedef {Object} FilterOptions
 * @property {string} [replacement="***"] - Replacement string for filtered content
 * @property {boolean} [caseSensitive=false] - Whether filtering is case sensitive
 * @property {number} [minLength=1] - Minimum length for content to be filtered
 *
 * @typedef {Object} BrowserOptions
 * @property {boolean} [headless=true] - Whether to run browser in headless mode
 * @property {string[]} [args] - Additional browser launch arguments
 * @property {number} [timeout=30000] - Navigation timeout in milliseconds
 *
 * @typedef {Object} ProcessingOptions
 * @property {number} [similarityThreshold=0.85] - Threshold for duplicate detection
 * @property {number} [maxLength=50000] - Maximum content length for processing
 * @property {boolean} [preserveFormatting=false] - Whether to preserve text formatting
 */

/**
 * Singleton class managing content filtering operations.
 * @class
 * @description Provides centralized content filtering functionality using multiple
 * filtering mechanisms including Tries and Sets. Implements the Singleton pattern
 * to ensure consistent filtering across the application.
 *
 * Key features:
 * - Singleton pattern ensures consistent filtering state
 * - Multiple filtering mechanisms (Tries, Sets)
 * - Configurable content replacement
 * - Efficient text chunk processing
 * - NSFW domain detection
 *
 * @example
 * ```typescript
 * const filterManager = ContentFilterManager.getInstance();
 *
 * // Check domain
 * const isNSFW = filterManager.isNSFWDomain('example.com');
 *
 * // Filter text
 * const filtered = filterManager.filterText('text to filter');
 * ```
 */
declare class ContentFilterManager {
    /**
     * Singleton instance
     * @private
     * @static
     * @type {ContentFilterManager}
     */
    private static instance;
    /**
     * Trie for storing and matching filtered words
     * @private
     * @type {ContentTrie}
     */
    private filterTrie;
    /**
     * Set of NSFW domains
     * @private
     * @type {Set<string>}
     */
    private nsfwDomains;
    /**
     * Trie for storing and matching NSFW terms
     * @private
     * @type {ContentTrie}
     */
    private nsfwNamesTrie;
    /**
     * Set of filtered dictionary words
     * @private
     * @type {Set<string>}
     */
    private filterDict;
    /**
     * Maximum content length for processing
     * @private
     * @readonly
     * @type {number}
     */
    private readonly MAX_CONTENT_LENGTH;
    /**
     * Private constructor to prevent direct instantiation.
     * @private
     * @description Initializes all filtering data structures and loads initial data.
     * This constructor is private to enforce the singleton pattern.
     *
     * @throws {Error} If filter initialization fails
     */
    private constructor();
    /**
     * Gets or creates the singleton instance of ContentFilterManager.
     * @returns {ContentFilterManager} The singleton instance
     * @description Ensures only one instance of ContentFilterManager exists.
     * Creates the instance if it doesn't exist, otherwise returns the existing instance.
     *
     * @example
     * ```typescript
     * const filterManager = ContentFilterManager.getInstance();
     * ```
     */
    static getInstance(): ContentFilterManager;
    /**
     * Loads filter data from configuration files.
     * @private
     * @returns {Promise<void>}
     * @description Asynchronously loads filtering data from configuration files,
     * including NSFW domains, NSFW terms, and slurs. Initializes all filtering
     * data structures with the loaded data.
     *
     * @throws {Error} If filter initialization fails or data files cannot be loaded
     */
    private loadFilters;
    /**
     * Checks if a URL contains or belongs to an NSFW domain.
     * @param {string} url - The URL to check
     * @returns {boolean} True if the URL matches any NSFW domain patterns
     * @description Performs case-sensitive matching against known NSFW domains.
     * Checks if the URL contains any known NSFW domain patterns.
     *
     * @example
     * ```typescript
     * const filterManager = ContentFilterManager.getInstance();
     * const isNSFW = filterManager.isNSFWDomain('example.com');
     * ```
     *
     * @throws {TypeError} If url is not a string
     */
    isNSFWDomain(url: string): boolean;
    /**
     * Splits text into manageable chunks while preserving context.
     * @private
     * @param {string} text - Text to split
     * @returns {string[]} Array of text chunks
     * @description Splits long text into smaller chunks while trying to maintain
     * sentence boundaries and context. This ensures efficient processing of large
     * text content.
     *
     * @throws {TypeError} If text is not a string
     */
    private splitIntoChunks;
    /**
     * Filters text content using content filtering rules.
     * @param {string} text - Text to filter
     * @param {string} [replacement="***"] - Replacement string for filtered content
     * @returns {string} Filtered text with inappropriate content replaced
     * @description Processes text content in chunks, applying filtering rules
     * to detect and replace inappropriate content. Handles large text efficiently
     * by breaking it into manageable chunks.
     *
     * @example
     * ```typescript
     * const filterManager = ContentFilterManager.getInstance();
     * const filtered = filterManager.filterText('text to filter', '***');
     * ```
     *
     * @throws {TypeError} If text is not a string
     */
    filterText(text: string, replacement?: string): string;
    /**
     * Applies the actual filtering logic to a single chunk.
     * @private
     * @param {string} chunk - Text chunk to filter
     * @param {string} replacement - Replacement string for filtered content
     * @returns {string} Filtered text chunk
     * @description Applies filtering rules to a single chunk of text,
     * replacing inappropriate content with the specified replacement string.
     *
     * @throws {TypeError} If chunk is not a string
     */
    private applyFilters;
}
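/*
 * Usage sketch (illustrative, not part of the declarations): guarding a page
 * render with the ContentFilterManager API declared above. The control flow
 * shown is an assumption about typical use, not behavior prescribed by the
 * package.
 *
 *   const filter = ContentFilterManager.getInstance();
 *   if (filter.isNSFWDomain(pageUrl)) {
 *     // Skip flagged domains entirely rather than filtering their text.
 *     return;
 *   }
 *   const safeText = filter.filterText(rawText, '[removed]');
 */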
/**
 * Main scraping function that processes and filters web content.
 * @param {string} url - The URL to scrape
 * @param {Browser | null} [browser] - Optional existing Puppeteer browser instance to reuse
 * @returns {Promise<{flaggedDomain: boolean, containsCensored: boolean, filteredTexts: string[]} | {error: string}>}
 * Object containing scraping results or error information
 * @description Coordinates the entire scraping process including:
 * - URL validation
 * - Browser management
 * - Content extraction
 * - Text processing
 * - Content filtering
 * @throws {Error} Various errors related to browser operations or content processing
 */
declare function scrape(url: string, browser?: Browser | null): Promise<{
    flaggedDomain?: boolean;
    containsCensored?: boolean;
    filteredTexts?: string[];
    error?: string;
}>;
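/*
 * Usage sketch (illustrative): calling `scrape` with a shared Puppeteer
 * browser through the optional second parameter. Reusing one Browser across
 * several URLs is an assumption inferred from the signature above, not a
 * documented requirement.
 *
 *   import puppeteer from 'puppeteer';
 *
 *   const browser = await puppeteer.launch({ headless: true });
 *   const result = await scrape('https://example.com', browser);
 *   if (result.error) {
 *     console.error(result.error);
 *   } else if (!result.flaggedDomain && !result.containsCensored) {
 *     console.log(result.filteredTexts);
 *   }
 *   await browser.close();
 */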
/**
 * Initializes the content filtering system.
 * @description Creates the singleton instance of ContentFilterManager and
 * loads all filtering data structures
 */
declare const initializeFilterWords: () => void;
/**
 * Filters text content using the ContentFilterManager.
 * @param {string} text - The text to filter
 * @param {string} [replace="***"] - The replacement string for filtered content
 * @returns {string} The filtered text with inappropriate content replaced
 * @description Provides a convenient wrapper around ContentFilterManager's filterText method
 */
declare const filterText: (text: string, replace?: string) => string;
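/*
 * Usage sketch (illustrative): the convenience wrappers above with a custom
 * replacement string. Calling initializeFilterWords() first mirrors the
 * module-level example; whether it is strictly required before filterText is
 * an assumption.
 *
 *   initializeFilterWords();
 *   const clean = filterText('some user-generated text', '[redacted]');
 */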

/**
 * @fileoverview Advanced web scraping and content processing system that provides comprehensive functionality
 * for recursive web crawling, content extraction, screenshot capture, and AI-powered content analysis.
 *
 * @module web
 * @requires playwright - For browser automation and screenshot capture
 * @requires node:path - For file path handling
 * @requires node:fs/promises - For async file operations
 * @requires ./scraper.js - Content extraction and filtering logic
 * @requires ./content-analyzer.js - Content analysis and prompt generation
 * @requires ../constants/gemini-settings.js - Configuration for Gemini LLM
 * @requires ./content-filter.js - Content filtering and moderation
 *
 * @description
 * This module implements a sophisticated web scraping and content processing system with the following key capabilities:
 *
 * - Multi-threaded web scraping using a thread pool for concurrent processing
 * - Recursive crawling of websites while respecting domain boundaries
 * - Automated screenshot capture at different scroll positions
 * - Content extraction and filtering using custom scraping logic
 * - AI-powered content analysis and structuring using Google's Gemini LLM
 * - File-based storage of raw and processed content with organized directory structure
 * - Error handling and recovery mechanisms
 * - Content moderation and NSFW filtering
 * - Dynamic prompt generation based on content analysis
 *
 * The system is designed to be highly scalable and configurable while maintaining clean separation of concerns
 * between different processing stages. It uses a modular architecture with specialized components for:
 *
 * - Browser automation (Playwright)
 * - Content extraction (Scraper)
 * - Content filtering (ContentFilterManager)
 * - Content analysis (ContentAnalyzer)
 * - Prompt generation (PromptGenerator)
 * - File system operations
 *
 * Key Features:
 * - Configurable output directory structure
 * - Automatic handling of relative/absolute URLs
 * - Intelligent URL deduplication
 * - Robust error handling and recovery
 * - Modular design for easy extension
 * - Comprehensive logging and debugging
 * - Memory-efficient processing
 * - Rate limiting and throttling support
 * - Configurable content filtering
 * - AI-powered content analysis
 *
 * Processing Flow:
 * 1. URL validation and normalization
 * 2. Browser initialization with optimized settings
 * 3. Page load and screenshot capture
 * 4. Content extraction and initial filtering
 * 5. NSFW/content moderation checks
 * 6. AI-powered content analysis
 * 7. File storage with organized structure
 * 8. Link discovery and recursive processing
 * 9. Error handling and recovery
 * 10. Resource cleanup
 *
 * Configuration Options:
 * - Output directory structure
 * - Browser launch parameters
 * - Content filtering rules
 * - AI model settings
 * - Rate limiting parameters
 * - Domain boundaries
 * - File naming conventions
 * - Screenshot settings
 *
 * Error Handling:
 * - Network failures
 * - Invalid URLs
 * - Content extraction errors
 * - AI processing failures
 * - File system errors
 * - Memory constraints
 * - Timeout conditions
 *
 * Performance Considerations:
 * - Memory usage optimization
 * - Concurrent processing limits
 * - Resource cleanup
 * - Caching strategies
 * - Network efficiency
 * - Storage optimization
 *
 * Security Features:
 * - NSFW content filtering
 * - Domain validation
 * - Content sanitization
 * - Resource limits
 * - Safe file handling
 *
 * @example
 * ```typescript
 * // Initialize scraper with custom output directory
 * const scraper = new WebScraper("custom_output");
 *
 * // Configure content filter
 * scraper.contentFilter.setRules({
 *   maxLength: 10000,
 *   allowedDomains: ['example.com'],
 *   blockedKeywords: ['spam', 'adult']
 * });
 *
 * try {
 *   // Start recursive scraping
 *   const results = await scraper.scrapeWebsite("https://example.com");
 *
 *   // Process results
 *   for (const [url, result] of results) {
 *     if (result.error) {
 *       console.error(`Error processing ${url}:`, result.error);
 *       continue;
 *     }
 *
 *     // Access processed content
 *     const content = await fs.readFile(result.processedContentPath, 'utf-8');
 *     console.log(`Processed ${url}:`, {
 *       rawContent: result.contentPath,
 *       processedContent: result.processedContentPath,
 *       screenshot: result.screenshot,
 *       timestamp: new Date(result.timestamp)
 *     });
 *   }
 * } catch (error) {
 *   console.error("Scraping failed:", error);
 * }
 * ```
 *
 * @see {@link PageResult} for details on processing results
 * @see {@link ContentFilterManager} for content filtering capabilities
 * @see {@link ContentAnalyzer} for AI analysis features
 * @see {@link PromptGenerator} for dynamic prompt generation
 *
 * @license MIT
 * @author Original author and contributors
 * @version 1.0.0
 * @since 1.0.0
 * @copyright 2024
 */

/**
 * Represents the complete result of processing a single web page, including all generated artifacts
 * and metadata.
 *
 * @interface PageResult
 * @property {string} url - The fully qualified URL of the processed web page
 * @property {string} contentPath - Filesystem path to the raw scraped content file
 * @property {string} processedContentPath - Filesystem path to the AI-processed and structured content file
 * @property {string} screenshot - Filesystem path to the captured page screenshot
 * @property {string} [error] - Optional error message if any stage of processing failed
 * @property {number} timestamp - Unix timestamp (in milliseconds) when processing completed
 *
 * The PageResult interface provides a comprehensive record of all artifacts and metadata
 * generated during the processing of a single web page. This includes:
 *
 * - Original URL for reference and deduplication
 * - Paths to both raw and processed content files
 * - Screenshot location for visual reference
 * - Error information if processing failed
 * - Timestamp for tracking and ordering
 *
 * Use Cases:
 * - Tracking processing status and results
 * - Error handling and recovery
 * - Content access and retrieval
 * - Processing verification
 * - Audit trail
 *
 * @example
 * ```typescript
 * // Successful processing result
 * const successResult: PageResult = {
 *   url: 'https://example.com/page',
 *   contentPath: 'output/content/example_com_page_1234567890.txt',
 *   processedContentPath: 'output/processed/example_com_page_1234567890.txt',
 *   screenshot: 'output/screenshots/example_com_page_0.png',
 *   timestamp: Date.now()
 * };
 *
 * // Failed processing result
 * const errorResult: PageResult = {
 *   url: 'https://example.com/invalid',
 *   contentPath: '',
 *   processedContentPath: '',
 *   screenshot: '',
 *   error: 'Failed to load page: 404 Not Found',
 *   timestamp: Date.now()
 * };
 * ```
 */
interface PageResult {
    url: string;
    contentPath: string;
    processedContentPath: string;
    screenshot: string;
    error?: string;
    timestamp: number;
}
/**
 * Core class implementing the web scraping and content processing system. Handles all aspects
 * of the scraping process from URL discovery to content storage.
 *
 * @class WebScraper
 *
 * @property {Browser | null} browser - Playwright browser instance used for automation
 * @property {Map<string, PageResult>} results - Map storing processing results for each URL
 * @property {Set<string>} processedUrls - Set of URLs that have been processed to prevent duplicates
 * @property {string} outputDir - Root directory for storing all generated files and artifacts
 * @property {ContentFilterManager} contentFilter - Instance of content filtering manager
 * @property {string} baseUrl - Base URL/domain for the current scraping session
 *
 * Key Responsibilities:
 * 1. Browser Management
 *    - Initialization with optimized settings
 *    - Resource cleanup
 *    - Error handling
 *
 * 2. Content Processing
 *    - URL validation and normalization
 *    - Content extraction
 *    - Screenshot capture
 *    - AI analysis
 *    - Content filtering
 *
 * 3. File Management
 *    - Directory structure creation
 *    - File naming and organization
 *    - Content storage
 *    - Resource cleanup
 *
 * 4. URL Management
 *    - Deduplication
 *    - Domain boundary enforcement
 *    - Link discovery
 *    - Queue management
 *
 * 5. Error Handling
 *    - Network failures
 *    - Content processing errors
 *    - Resource constraints
 *    - Recovery mechanisms
 *
 * Processing Stages:
 * 1. Initialization
 *    - Directory setup
 *    - Browser launch
 *    - Filter configuration
 *
 * 2. URL Processing
 *    - Validation
 *    - Deduplication
 *    - Domain checking
 *
 * 3. Content Extraction
 *    - Page loading
 *    - Screenshot capture
 *    - Content scraping
 *
 * 4. Content Processing
 *    - Filtering
 *    - AI analysis
 *    - Structure generation
 *
 * 5. Storage
 *    - File organization
 *    - Content saving
 *    - Metadata tracking
 *
 * 6. Link Discovery
 *    - URL extraction
 *    - Validation
 *    - Queue management
 *
 * 7. Cleanup
 *    - Resource release
 *    - Error handling
 *    - Status reporting
 *
 * @example
 * ```typescript
 * // Initialize scraper with custom settings
 * const scraper = new WebScraper("output_dir");
 *
 * try {
 *   // Configure content filter
 *   scraper.contentFilter.setRules({
 *     maxLength: 50000,
 *     allowedDomains: ['example.com']
 *   });
 *
 *   // Start recursive scraping
 *   const results = await scraper.scrapeWebsite("https://example.com");
 *
 *   // Process results
 *   for (const [url, result] of results) {
 *     if (result.error) {
 *       console.error(`Error processing ${url}:`, result.error);
 *       continue;
 *     }
 *
 *     // Access processed content
 *     const content = await fs.readFile(result.processedContentPath, 'utf-8');
 *     console.log(`Successfully processed ${url}`);
 *   }
 * } catch (error) {
 *   console.error("Scraping failed:", error);
 * }
 * ```
 *
 * @throws {Error} Invalid URL provided
 * @throws {Error} Browser initialization failed
 * @throws {Error} Content processing failed
 * @throws {Error} File system operation failed
 */
declare class WebScraper {
    private browser;
    private results;
    private processedUrls;
    private outputDir;
    readonly contentFilter: ContentFilterManager;
    private baseUrl;
    private sentimentAnalyzer;
    /**
     * Creates a new WebScraper instance.
     *
     * @param {string} outputDir - Directory where scraped content and artifacts will be stored
     * @default "scraping_output"
     *
     * The constructor initializes a new WebScraper instance with the following setup:
     *
     * 1. Output Directory
     *    - Creates base directory for all artifacts
     *    - Organizes subdirectories for different content types
     *    - Handles path normalization
     *
     * 2. Content Filter
     *    - Initializes content filtering system
     *    - Sets up default filtering rules
     *    - Prepares moderation capabilities
     *
     * Directory Structure:
     * ```
     * outputDir/
     * ├── content/       # Raw scraped content
     * │   └── [domain]/  # Organized by domain
     * ├── processed/     # AI-processed content
     * │   └── [domain]/  # Organized by domain
     * └── screenshots/   # Page screenshots
     *     └── [domain]/  # Organized by domain
     * ```
     *
     * @example
     * ```typescript
     * // Basic initialization
     * const scraper = new WebScraper();
     *
     * // Custom output directory
     * const customScraper = new WebScraper("custom/output/path");
     * ```
     *
     * @throws {Error} If directory creation fails
     * @throws {Error} If content filter initialization fails
     */
    constructor(outputDir?: string);
    /**
     * Main entry point for scraping a website. Initializes the browser, processes the starting URL,
     * and recursively crawls linked pages within the same domain.
     *
     * Processing Flow:
     * 1. URL Validation
     *    - Format checking
     *    - Domain extraction
     *    - Protocol verification
     *
     * 2. Environment Setup
     *    - Directory initialization
     *    - Browser launch
     *    - Resource allocation
     *
     * 3. Content Processing
     *    - Page loading
     *    - Content extraction
     *    - Screenshot capture
     *    - AI analysis
     *
     * 4. Link Discovery
     *    - URL extraction
     *    - Domain filtering
     *    - Queue management
     *
     * 5. Resource Management
     *    - Memory monitoring
     *    - Connection handling
     *    - Cleanup operations
     *
     * Error Handling:
     * - Invalid URLs
     * - Network failures
     * - Browser crashes
     * - Memory constraints
     * - Timeout conditions
     *
     * @param {string} url - Starting URL to begin scraping from
     * @returns {Promise<Map<string, PageResult>>} Map of results for all processed URLs
     * @throws {Error} If URL is invalid or scraping fails
     *
     * @example
     * ```typescript
     * const scraper = new WebScraper("output");
     *
     * try {
     *   // Start scraping
     *   const results = await scraper.scrapeWebsite("https://example.com");
     *
     *   // Process successful results
     *   for (const [url, result] of results) {
     *     if (!result.error) {
     *       console.log(`Successfully processed ${url}`);
     *       console.log(`Content saved to: ${result.processedContentPath}`);
     *       console.log(`Screenshot saved to: ${result.screenshot}`);
     *     }
     *   }
     *
     *   // Handle errors
     *   const errors = Array.from(results.entries())
     *     .filter(([_, result]) => result.error)
     *     .map(([url, result]) => ({url, error: result.error}));
     *
     *   if (errors.length > 0) {
     *     console.error("Encountered errors:", errors);
     *   }
     * } catch (error) {
     *   console.error("Fatal error during scraping:", error);
     * }
     * ```
     */
    scrapeWebsite(url: string): Promise<Map<string, PageResult>>;
    /**
     * Creates required output directories if they don't exist.
     *
     * Directory Structure:
     * ```
     * outputDir/
     * ├── content/       # Raw scraped content
     * │   └── [domain]/  # Organized by domain
     * ├── processed/     # AI-processed content
     * │   └── [domain]/  # Organized by domain
     * └── screenshots/   # Page screenshots
     *     └── [domain]/  # Organized by domain
     * ```
     *
     * @private
     * @returns {Promise<void>}
     *
     * @throws {Error} If directory creation fails
     * @throws {Error} If permissions are insufficient
     * @throws {Error} If disk space is insufficient
     */
    private initializeDirectories;
    /**
     * Processes a single web page, extracting content, capturing a screenshot, and analyzing content.
     * Also discovers and processes linked pages within the same domain.
     *
     * Processing Stages:
     * 1. URL Validation
     *    - Format checking
     *    - Deduplication
     *    - Content type verification
     *
     * 2. Content Safety
     *    - Domain checking
     *    - NSFW detection
     *    - Content moderation
     *
     * 3. Page Processing
     *    - Loading and rendering
     *    - Screenshot capture
     *    - Content extraction
     *
     * 4. Content Analysis
     *    - Text filtering
     *    - AI processing
     *    - Structure generation
     *
     * 5. Link Discovery
     *    - URL extraction
     *    - Domain filtering
     *    - Queue management
     *
     * Error Handling:
     * - Network failures
     * - Timeout conditions
     * - Content extraction errors
     * - Processing failures
     * - Resource constraints
     *
     * @private
     * @param {string} url - URL of the page to process
     * @returns {Promise<PageResult>} Processing result for the page
     *
     * @example
     * ```typescript
     * try {
     *   const result = await scraper.processSinglePage("https://example.com/page");
     *
     *   if (result.error) {
     *     console.error(`Processing failed: ${result.error}`);
     *     return;
     *   }
     *
     *   // Access results
     *   console.log("Raw content:", result.contentPath);
     *   console.log("Processed content:", result.processedContentPath);
     *   console.log("Screenshot:", result.screenshot);
     *   console.log("Processed at:", new Date(result.timestamp));
     * } catch (error) {
     *   console.error("Fatal error:", error);
     * }
     * ```
     *
     * @throws {Error} If page loading fails
     * @throws {Error} If content extraction fails
     * @throws {Error} If processing fails
     */
    private processSinglePage;
    /**
     * Processes extracted content using Google's Gemini LLM for analysis and structuring.
     *
     * Processing Steps:
     * 1. Content Preparation
     *    - Text filtering
     *    - Format validation
     *    - Length checking
     *
     * 2. Context Analysis
     *    - URL analysis
     *    - Content type detection
     *    - Structure identification
     *
     * 3. Prompt Generation
     *    - Dynamic template selection
     *    - Context integration
     *    - Parameter optimization
     *
     * 4. AI Processing
     *    - Model selection
     *    - Safety settings
     *    - Response handling
     *
     * Error Handling:
     * - Content validation
     * - Model errors
     * - Timeout conditions
     * - Response validation
     *
     * @private
     * @param {string} content - Raw content to process
     * @param {string} url - URL of the content source
     * @returns {Promise<string>} Processed and structured content
     *
     * @throws {Error} If content is invalid
     * @throws {Error} If LLM processing fails
     * @throws {Error} If response is invalid
     *
     * @example
     * ```typescript
     * try {
     *   const rawContent = "Example raw content...";
     *   const url = "https://example.com";
     *
     *   const processed = await scraper.processWithLLM(rawContent, url);
     *   console.log("Processed content:", processed);
     * } catch (error) {
     *   console.error("Processing failed:", error);
     * }
     * ```
     */
    private processWithLLM;
    /**
     * Takes a full page screenshot of the current page.
     *
     * Screenshot Process:
     * 1. Page Preparation
     *    - Viewport setup
     *    - Content loading
     *    - Animation completion
     *
     * 2. Capture Settings
     *    - Full page mode
     *    - Resolution configuration
     *    - Format selection
     *
     * 3. File Management
     *    - Path generation
     *    - Directory creation
     *    - File saving
     *
     * Error Handling:
     * - Page loading issues
     * - Screenshot failures
     * - Storage errors
     *
     * @private
     * @param {Page} page - Playwright page instance
     * @param {string} url - URL being captured
     * @returns {Promise<string>} Path to saved screenshot
     *
     * @throws {Error} If screenshot capture fails
     * @throws {Error} If file saving fails
     *
     * @example
     * ```typescript
     * const page = await browser.newPage();
     * await page.goto(url);
     *
     * try {
     *   const screenshotPath = await scraper.takeScreenshot(page, url);
     *   console.log("Screenshot saved to:", screenshotPath);
     * } catch (error) {
     *   console.error("Screenshot capture failed:", error);
     * }
     * ```
     */
    private takeScreenshot;
    /**
     * Saves content to a file with organized directory structure based on URL path.
     *
     * File Organization:
     * 1. Path Generation
     *    - URL parsing
     *    - Path cleaning
     *    - Directory structure
     *
     * 2. Content Validation
     *    - File type checking
     *    - Content verification
     *    - Size limits
     *
     * 3. Directory Management
     *    - Path creation
     *    - Permissions
     *    - Existing files
     *
     * 4. File Operations
     *    - Content writing
     *    - Atomic saves
     *    - Cleanup
     *
     * Directory Structure:
     * ```
     * outputDir/
     * └── [domain]/
     *     ├── content/
     *     │   └── [path]/
     *     │       └── content-[timestamp].txt
     *     ├── processed/
     *     │   └── [path]/
     *     │       └── processed-[timestamp].txt
     *     └── screenshots/
     *         └── [path]/
     *             └── screenshot-[timestamp].png
     * ```
     *
     * @private
     * @param {string} content - Content to save
     * @param {'content' | 'processed' | 'screenshots'} type - Type of content being saved
     * @param {string} url - Source URL
     * @param {string} [fileExtension='.txt'] - File extension to use
     * @returns {Promise<string>} Path to saved file
     *
     * @throws {Error} If file is non-textual
     * @throws {Error} If saving fails
     * @throws {Error} If directory creation fails
     *
     * @example
     * ```typescript
     * try {
     *   // Save raw content
     *   const contentPath = await scraper.saveToFile(
     *     "Raw content...",
     *     "content",
     *     "https://example.com/page"
     *   );
     *
     *   // Save processed content
     *   const processedPath = await scraper.saveToFile(
     *     "Processed content...",
     *     "processed",
     *     "https://example.com/page"
     *   );
     *
     *   console.log("Content saved to:", contentPath);
     *   console.log("Processed content saved to:", processedPath);
     * } catch (error) {
     *   console.error("Saving failed:", error);
     * }
     * ```
     */
    private saveToFile;
    /**
     * Validates AI generated content for safety and sentiment.
     * @private
     * @param {string} content - AI generated content to validate
     * @returns {Promise<{isValid: boolean, reason?: string}>}
     */
    private validateAIResponse;
    /**
     * Processes an AI response with safety checks.
     * @private
     * @param {string} aiResponse - Response from AI model
     * @returns {Promise<string>} Validated and processed response
     * @throws {Error} If content validation fails
     */
    private processAIResponse;
}
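/*
 * Usage sketch (illustrative): summarizing a finished crawl into a JSON index
 * built from the PageResult fields declared above. The index file name and
 * shape are hypothetical, not part of the package's output contract.
 *
 *   import { writeFile } from 'node:fs/promises';
 *
 *   const results = await new WebScraper('output').scrapeWebsite('https://example.com');
 *   const index = [...results.values()].map(({ url, screenshot, error, timestamp }) =>
 *     ({ url, screenshot, ok: !error, timestamp }));
 *   await writeFile('output/index.json', JSON.stringify(index, null, 2));
 */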

/**
 * @fileoverview Content analysis and prompt generation system for web content processing
 * @module content-analyzer
 * @description Provides comprehensive functionality for analyzing web content structure and generating
 * context-aware prompts for LLM processing. The module includes two main classes:
 * - ContentAnalyzer: Analyzes web content to determine its context and characteristics
 * - PromptGenerator: Generates tailored prompts based on the analyzed content context
 *
 * Key features:
 * - URL pattern matching and content signal detection
 * - Content type classification (article, product, profile, etc.)
 * - Structure analysis (narrative, analytical, technical, etc.)
 * - Context-aware prompt generation
 * - Flexible template system for different content types
 *
 * @example
 * ```typescript
 * // Analyze content
 * const context = ContentAnalyzer.analyzeContent(url, htmlContent);
 *
 * // Generate appropriate prompt
 * const prompt = PromptGenerator.generatePrompt(context, content);
 * ```
 */
/**
 * Represents the context and characteristics of analyzed content
 * @interface ContentContext
 * @property {('article'|'product'|'category'|'profile'|'general')} pageType - The type of page being analyzed:
 * - article: Blog posts, news articles, editorial content
 * - product: Product pages, item listings, shop entries
 * - category: Category pages, department listings, sections
 * - profile: User profiles, about pages, portfolios
 * - general: Default type for unclassified content
 * @property {('brief'|'standard'|'detailed')} contentLength - The relative length/depth of the content:
 * - brief: Short-form content, summaries
 * - standard: Medium-length content
 * - detailed: Long-form, in-depth content
 * @property {('narrative'|'analytical'|'technical'|'descriptive')} structureType - The structural style of the content:
 * - narrative: Story-based, chronological flow
 * - analytical: Data-driven, research-oriented
 * - technical: Specification-focused, procedural
 * - descriptive: Feature-focused, explanatory
 * @property {('general'|'technical'|'business'|'academic')} targetAudience - The intended audience for the content:
 * - general: General public, non-specialized readers
 * - technical: Technical professionals, developers
 * - business: Business professionals, stakeholders
 * - academic: Researchers, students, educators
 */
interface ContentContext$1 {
    pageType: 'article' | 'product' | 'category' | 'profile' | 'general';
    contentLength: 'brief' | 'standard' | 'detailed';
    structureType: 'narrative' | 'analytical' | 'technical' | 'descriptive';
    targetAudience: 'general' | 'technical' | 'business' | 'academic';
}
/**
 * Analyzes web content to determine its context and characteristics
 * @class ContentAnalyzer
 * @static
 * @description Provides static methods for analyzing web content and determining its context.
 * Uses pattern matching and content signals to classify content and determine appropriate
 * processing strategies. The analysis considers:
 * - URL patterns and structure
 * - Content keywords and signals
 * - Document structure and elements
 * - Content indicators and metadata
 */
declare class ContentAnalyzer {
    /**
     * Predefined patterns for analyzing different types of content routes
     * @private
     * @static
     * @readonly
     * @type {RouteAnalysis[]}
     * @description Array of route analysis configurations that define:
     * - URL patterns to match different content types
     * - Content signals associated with each type
     * - Default context settings for matched content
     * Each configuration targets a specific content category (article, product, etc.)
     * and provides the basis for content classification.
     */
    private static readonly routePatterns;
    /**
     * Extracts content signals from HTML content by analyzing structure and keywords
     * @private
     * @static
     * @param {string} content - The HTML content to analyze
     * @returns {Set<string>} Set of identified content signals
     * @description Analyzes HTML content to identify structural and keyword signals:
     * - Checks for presence of headers, lists, and tables
     * - Identifies content-specific keywords and patterns
     * - Detects pricing information and author attribution
     * - Recognizes profile and biographical content
     * The signals are used to help classify and contextualize the content.
     */
    private static getContentSignals;
    /**
     * Analyzes content and URL to determine the appropriate content context
     * @public
     * @static
     * @param {string} url - The URL of the content being analyzed
     * @param {string} content - The HTML content to analyze
     * @returns {ContentContext} The determined content context
     * @description Performs comprehensive content analysis by:
     * 1. Extracting and analyzing the URL path
     * 2. Identifying content signals from the HTML
     * 3. Matching against predefined route patterns
     * 4. Determining the most appropriate content context
     * If no specific matches are found, returns a default general context.
     *
     * @example
     * ```typescript
     * const context = ContentAnalyzer.analyzeContent(
     *   'https://example.com/blog/article-1',
     *   '<html>...</html>'
     * );
     * ```
     */
    static analyzeContent(url: string, content: string): ContentContext$1;
}
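/*
 * Usage sketch (illustrative): branching on the ContentContext returned by
 * analyzeContent. The URL and branch body are placeholders, not part of the
 * package's documented behavior.
 *
 *   const context = ContentAnalyzer.analyzeContent('https://example.com/shop/item-42', html);
 *   if (context.pageType === 'product' && context.structureType === 'technical') {
 *     // e.g. route this page to a spec-extraction pipeline
 *   }
 */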
/**
 * Generates context-aware prompts for LLM content processing
 * @class PromptGenerator
 * @static
 * @description Provides functionality for generating tailored prompts based on content context.
 * Features:
 * - Template-based prompt generation
 * - Context-aware prompt customization
 * - Support for multiple content types and structures
 * - Fallback to default prompts when needed
 */
declare class PromptGenerator {
    /**
     * Template definitions for different content types and structures
     * @private
     * @static
     * @readonly
     * @type {Object}
     * @description Comprehensive template system that provides:
     * - Content type-specific templates (article, product, profile)
     * - Structure-specific variations (narrative, analytical, technical)
     * - Detailed processing instructions
     * - Placeholder support for content insertion
     * Templates are organized hierarchically by content type and structure.
     */
    private static readonly promptTemplates;
    /**
     * Generates an appropriate prompt based on content context
     * @public
     * @static
     * @param {ContentContext} context - The analyzed content context
     * @param {string} content - The content to be processed
     * @returns {string} Generated prompt for LLM processing
     * @description Generates a context-appropriate prompt by:
     * 1. Selecting an appropriate template based on content type
     * 2. Choosing a structure-specific variation
     * 3. Inserting content into the template
     * 4. Falling back to a default prompt if no specific template exists
     *
     * @example
     * ```typescript
     * const prompt = PromptGenerator.generatePrompt(
     *   { pageType: 'article', structureType: 'narrative', ... },
     *   'Article content...'
     * );
     * ```
     */
    static generatePrompt(context: ContentContext$1, content: string): string;
    /**
     * Provides a default prompt when no specific template matches
     * @private
     * @static
     * @param {string} content - The content to be processed
     * @returns {string} Default analysis prompt
     * @description Generates a generic but comprehensive prompt for content analysis
     * when no specific template matches the content context. The default prompt
     * focuses on:
     * - Topic extraction
     * - Content organization
     * - Redundancy removal
     * - Clarity and readability
     */
    private static getDefaultPrompt;
}

interface WorkerMessage<T = any> {
    id: string;
    type: "TASK" | "RESULT" | "ERROR" | "STATUS" | "READY" | "INIT";
    payload: T;
    timestamp: number;
}
interface WorkerTask {
    id: string;
    type: "TASK" | "RESULT" | "ERROR" | "STATUS" | "READY" | "INIT";
    url?: string;
    data?: any;
}
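/*
 * Usage sketch (illustrative): a TASK/RESULT round trip using the
 * WorkerMessage and WorkerTask shapes above with node:worker_threads. The
 * protocol shown (post a TASK, await a RESULT carrying the same id) and the
 * worker script path are assumptions inferred from the message types.
 *
 *   import { Worker } from 'node:worker_threads';
 *
 *   const task: WorkerTask = { id: 'task-1', type: 'TASK', url: 'https://example.com' };
 *   const worker = new Worker('./scraper-worker.js'); // hypothetical worker script
 *   worker.postMessage(task);
 *   worker.on('message', (msg: WorkerMessage<string[]>) => {
 *     if (msg.type === 'RESULT' && msg.id === task.id) console.log(msg.payload);
 *   });
 */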
interface TrieNode {
    children: {
        [key: string]: TrieNode;
    };
    isEndOfWord: boolean;
}
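/*
 * Sketch (illustrative): minimal insert/lookup helpers over the TrieNode
 * shape above. The package's internal ContentTrie is not exported, so these
 * helpers are assumptions about how such a structure is typically traversed,
 * not the library's implementation.
 *
 *   const insert = (root: TrieNode, word: string): void => {
 *     let node = root;
 *     for (const ch of word) {
 *       // Create the child node on first visit, then descend.
 *       node = node.children[ch] ??= { children: {}, isEndOfWord: false };
 *     }
 *     node.isEndOfWord = true;
 *   };
 *
 *   const contains = (root: TrieNode, word: string): boolean => {
 *     let node: TrieNode | undefined = root;
 *     for (const ch of word) node = node?.children[ch];
 *     return node?.isEndOfWord ?? false;
 *   };
 */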
interface ContentContext {
    pageType: 'article' | 'product' | 'category' | 'profile' | 'general';
    contentLength: 'brief' | 'standard' | 'detailed';
    structureType: 'narrative' | 'analytical' | 'technical' | 'descriptive';
    targetAudience: 'general' | 'technical' | 'business' | 'academic';
}
interface RouteAnalysis {
    patterns: RegExp[];
    signals: string[];
    context: ContentContext;
}
interface CodeBlock {
    language: string;
    code: string;
    lineNumbers?: boolean;
}

declare const nsfwNames: Readonly<Record<string, string>>;
type NsfwName = keyof typeof nsfwNames;

declare const nsfw: Readonly<Record<string, string>>;

declare const robots: Readonly<Record<string, string>>;
type Robot = keyof typeof robots;

declare const slurs: Readonly<Record<string, string>>;
type Slur = keyof typeof slurs;

/**
 * @fileoverview Configuration settings for Google's Gemini AI model integration
 * @module gemini-settings
 * @description Provides configuration constants and settings for interacting with the Gemini AI API,
 * including model selection, API authentication, and content safety thresholds
 */

/**
 * The specific Gemini model version to use
 * @constant {string}
 * @description Specifies the Gemini 1.5 Flash model, optimized for fast inference
 */
declare const model = "gemini-1.5-flash";
/**
 * Google AI API key loaded from environment variables
 * @constant {string}
 * @description API key for authenticating with Google's AI services. Falls back to an empty string if not configured
 */
declare const API_KEY: string;
/**
 * Initialized Google Generative AI client
 * @constant {GoogleGenerativeAI}
 * @description Main client instance for interacting with Gemini AI services
 */
declare const genAI: GoogleGenerativeAI;
/**
 * Generation configuration settings
 * @constant {undefined}
 * @description Currently undefined; can be used to specify generation parameters such as temperature, top-k, etc.
 */
declare const generationConfig: undefined;
/**
 * Content safety threshold settings
 * @constant {Array<{category: HarmCategory, threshold: HarmBlockThreshold}>}
 * @description Configures content filtering thresholds for different harm categories:
 * - Harassment
 * - Hate Speech
 * - Sexually Explicit Content
 * - Dangerous Content
 * All thresholds are currently set to BLOCK_NONE for maximum permissiveness
 */
declare const safetySettings: {
    category: HarmCategory;
    threshold: HarmBlockThreshold;
}[];
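/*
 * Usage sketch (illustrative): wiring the exported constants into the
 * @google/generative-ai client. getGenerativeModel({ model, safetySettings,
 * generationConfig }) is the library's documented entry point; routing this
 * package's exported constants through it is an assumption about intended use.
 *
 *   const gemini = genAI.getGenerativeModel({ model, safetySettings, generationConfig });
 *   const result = await gemini.generateContent('Summarize this page...');
 *   console.log(result.response.text());
 */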

export { API_KEY, type CodeBlock, ContentAnalyzer, type ContentContext, ContentFilterManager, type NsfwName, PromptGenerator, type Robot, type RouteAnalysis, type Slur, type TrieNode, WebScraper, type WorkerMessage, type WorkerTask, filterText, genAI, generationConfig, initializeFilterWords, model, nsfw, nsfwNames, robots, safetySettings, scrape, slurs };