@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,531 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * TypeScript interop helper: exposes property `key` of `source` on `target`
 * under the name `alias` (defaults to `key`).
 *
 * Where `Object.create` exists the property is installed with
 * `Object.defineProperty`; the source's own descriptor is reused unless it is
 * missing, is an accessor on a non-ES-module object, or is a writable or
 * configurable data property, in which case a forwarding getter is used
 * instead. On legacy runtimes the value is simply copied.
 *
 * An implementation already present on `this` takes precedence.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy runtime: no descriptors available, copy the current value.
        return function (target, source, key, alias) {
            if (alias === undefined) alias = key;
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exposedName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var useForwardingGetter;
        if (!descriptor) {
            useForwardingGetter = true;
        }
        else if ("get" in descriptor) {
            useForwardingGetter = !source.__esModule;
        }
        else {
            useForwardingGetter = descriptor.writable || descriptor.configurable;
        }
        if (useForwardingGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, exposedName, descriptor);
    };
})();
|
|
13
|
+
/**
 * TypeScript interop helper: attaches `moduleValue` to `namespace` as its
 * `default` member. Uses an enumerable defined property where `Object.create`
 * exists and a plain assignment otherwise.
 *
 * An implementation already present on `this` takes precedence.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    function defineDefault(namespace, moduleValue) {
        Object.defineProperty(namespace, "default", { enumerable: true, value: moduleValue });
    }
    function assignDefault(namespace, moduleValue) {
        namespace["default"] = moduleValue;
    }
    return Object.create ? defineDefault : assignDefault;
})();
|
|
18
|
+
/**
 * TypeScript interop helper backing `import * as ns from "mod"`.
 *
 * ES modules (objects flagged with `__esModule`) are returned untouched.
 * Anything else is wrapped in a fresh namespace object: every own property
 * except "default" is bound onto it via `__createBinding`, and the original
 * module value is attached as `default` via `__setModuleDefault`.
 *
 * An implementation already present on `this` takes precedence.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Key-listing strategy, chosen the first time it is needed.
    var listOwnKeys = null;
    function enumerateOwnKeys(obj) {
        var names = [];
        for (var name in obj) {
            if (Object.prototype.hasOwnProperty.call(obj, name)) {
                names[names.length] = name;
            }
        }
        return names;
    }
    function ownKeys(obj) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || enumerateOwnKeys;
        }
        return listOwnKeys(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = ownKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] !== "default") {
                    __createBinding(namespace, mod, names[idx]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
+
/**
 * TypeScript interop helper backing `import x from "mod"`.
 *
 * Returns ES modules (objects flagged with `__esModule`) unchanged and wraps
 * any other value as `{ default: value }`.
 *
 * An implementation already present on `this` takes precedence.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.globalContentExtractor = exports.ContentExtractor = void 0;
|
|
40
|
+
const readability_1 = require("@mozilla/readability");
|
|
41
|
+
const jsdom_1 = require("jsdom");
|
|
42
|
+
const cheerio = __importStar(require("cheerio"));
|
|
43
|
+
const p_limit_1 = __importDefault(require("p-limit"));
|
|
44
|
+
const scraping_rate_limiter_1 = require("../scraping-rate-limiter");
|
|
45
|
+
const robots_checker_1 = require("./robots-checker");
|
|
46
|
+
/**
 * Downloads article pages and extracts their readable content.
 *
 * Extraction tries Mozilla Readability first and falls back to a
 * selector-based heuristic built on cheerio. Before any request is made the
 * URL is screened (protocol, private/loopback hosts) and checked against
 * robots.txt; the download itself goes through the shared rate limiter.
 */
class ContentExtractor {
    constructor() {
        this.userAgent = 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)';
        this.timeout = 15000; // 15 seconds
        this.maxContentSize = 10 * 1024 * 1024; // 10MB max
        this.minContentLength = 200; // Minimum 200 characters
        this.wordsPerMinute = 200; // Average reading speed
        // `URL.hostname` serializes IPv4-mapped IPv6 addresses as two hex groups.
        const mappedIPv4Pattern = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/;
        const privateRanges = [
            /^0\./, // 0.0.0.0/8 ("this" network, incl. 0.0.0.0)
            /^127\./, // 127.0.0.0/8 (loopback)
            /^10\./, // 10.0.0.0/8 (private)
            /^172\.(1[6-9]|2[0-9]|3[01])\./, // 172.16.0.0/12 (private)
            /^192\.168\./, // 192.168.0.0/16 (private)
            /^169\.254\./, // 169.254.0.0/16 (link-local)
            /^::1?$/, // IPv6 unspecified (::) and loopback (::1)
            /^fe[89ab][0-9a-f]:/, // fe80::/10 IPv6 link-local
            /^f[cd][0-9a-f]{2}:/ // fc00::/7 IPv6 unique local
        ];
        /**
         * Returns the host of `url` in a form the range checks can match:
         * lower-cased, without the brackets `URL.hostname` puts around IPv6
         * literals, without a trailing root dot, and with IPv4-mapped IPv6
         * addresses rewritten as dotted-quad IPv4. Throws if `url` is unparsable.
         */
        const normalizeHost = (url) => {
            let host = new URL(url).hostname.toLowerCase();
            if (host.startsWith('[') && host.endsWith(']')) {
                host = host.slice(1, -1);
            }
            if (host.endsWith('.')) {
                host = host.slice(0, -1);
            }
            const mapped = mappedIPv4Pattern.exec(host);
            if (mapped) {
                const high = Number.parseInt(mapped[1], 16);
                const low = Number.parseInt(mapped[2], 16);
                host = `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
            }
            return host;
        };
        this.ssrfProtection = {
            /** True when the URL's host is a private, loopback or link-local address (or the URL is unparsable). */
            isPrivateIP: (url) => {
                try {
                    const host = normalizeHost(url);
                    return privateRanges.some(range => range.test(host));
                }
                catch {
                    return true; // If we can't parse, block it
                }
            },
            /** True when the URL's host names the local machine (or the URL is unparsable). */
            isLocalhost: (url) => {
                try {
                    const host = normalizeHost(url);
                    return host === 'localhost' ||
                        host.endsWith('.localhost') ||
                        host === '127.0.0.1' ||
                        host === '::1';
                }
                catch {
                    return true;
                }
            },
            /** True only for http: and https: URLs. */
            isAllowedProtocol: (url) => {
                try {
                    const { protocol } = new URL(url);
                    return protocol === 'http:' || protocol === 'https:';
                }
                catch {
                    return false;
                }
            }
        };
    }
    /**
     * Extract content from a URL.
     *
     * @param {string} url - Page to fetch and extract.
     * @returns {Promise<object|null>} The extracted content, or null when the
     *   URL is rejected (protocol, private host, robots.txt), the download
     *   fails, or no sufficiently long content is found. Never rejects; all
     *   errors are logged and turned into null.
     */
    async extractContent(url) {
        console.log(`📖 [ContentExtractor] Starting content extraction from ${url}`);
        try {
            // SSRF protection
            if (!this.ssrfProtection.isAllowedProtocol(url)) {
                throw new Error(`Disallowed protocol: ${url}`);
            }
            if (this.ssrfProtection.isPrivateIP(url) || this.ssrfProtection.isLocalhost(url)) {
                throw new Error(`Private/local IP not allowed: ${url}`);
            }
            // Check robots.txt compliance
            const robotsCheck = await robots_checker_1.globalRobotsChecker.isAllowed(url);
            if (!robotsCheck.allowed) {
                console.warn(`🤖 [ContentExtractor] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
                return null;
            }
            const html = await this.fetchContent(url);
            if (!html) {
                return null;
            }
            // Extract content using multiple methods
            const extracted = await this.extractFromHTML(html, url);
            if (!extracted) {
                console.warn(`⚠️ [ContentExtractor] No content extracted from ${url}`);
                return null;
            }
            // Validate extracted content
            if (extracted.textContent.length < this.minContentLength) {
                console.warn(`⚠️ [ContentExtractor] Content too short (${extracted.textContent.length} chars): ${url}`);
                return null;
            }
            console.log(`✅ [ContentExtractor] Successfully extracted ${extracted.wordCount} words from ${url}`);
            return extracted;
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Error extracting content from ${url}:`, error);
            return null;
        }
    }
    /**
     * Extract content from multiple URLs with configurable concurrency.
     *
     * @param {string[]} urls - Pages to process.
     * @param {object} [options]
     * @param {number} [options.concurrency=5] - Maximum simultaneous extractions.
     * @param {Function} [options.onProgress] - Called as
     *   `(completed, total, url)` after each URL finishes. An exception thrown
     *   by the callback is logged and does not affect the batch.
     * @returns {Promise<Array<object|null>>} One entry per input URL, in input
     *   order; null marks a URL that could not be extracted.
     */
    async extractBatch(urls, options = {}) {
        const concurrency = options.concurrency || 5;
        console.log(`📖 [ContentExtractor] Starting parallel batch extraction of ${urls.length} URLs (concurrency: ${concurrency})`);
        const limit = (0, p_limit_1.default)(concurrency);
        let completed = 0;
        const results = await Promise.all(urls.map(url => limit(async () => {
            let result = null;
            try {
                result = await this.extractContent(url);
            }
            catch (error) {
                console.error(`❌ [ContentExtractor] Error in batch extraction for ${url}:`, error);
            }
            completed++;
            // Report progress outside the extraction try-block so a faulty
            // callback can neither discard the result nor be counted twice.
            try {
                options.onProgress?.(completed, urls.length, url);
            }
            catch (error) {
                console.warn(`⚠️ [ContentExtractor] onProgress callback threw for ${url}:`, error);
            }
            return result;
        })));
        const successful = results.filter(Boolean).length;
        console.log(`📖 [ContentExtractor] Batch complete: ${successful}/${urls.length} successful`);
        return results;
    }
    /**
     * Downloads `url` through the shared rate limiter.
     *
     * The abort timer stays armed until the body has been fully read, so a
     * server that sends headers and then stalls is still cut off after
     * `this.timeout` milliseconds.
     *
     * NOTE(review): `fetch` follows redirects by default and the redirect
     * targets are not re-checked against `ssrfProtection` here.
     *
     * @param {string} url
     * @returns {Promise<string|null>} The response body, or null on any
     *   failure (non-2xx status, timeout, oversized body, network error).
     */
    async fetchContent(url) {
        try {
            return await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
                const controller = new AbortController();
                const timeoutId = setTimeout(() => controller.abort(), this.timeout);
                try {
                    const response = await fetch(url, {
                        headers: {
                            'User-Agent': this.userAgent,
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                            'Accept-Language': 'en-US,en;q=0.9'
                        },
                        signal: controller.signal,
                    });
                    if (!response.ok) {
                        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
                    }
                    // A missing or malformed header parses to NaN, which fails the comparison.
                    const contentLength = response.headers.get('content-length');
                    const declaredSize = Number.parseInt(contentLength ?? '', 10);
                    if (declaredSize > this.maxContentSize) {
                        throw new Error(`Content too large: ${contentLength} bytes`);
                    }
                    const html = await response.text();
                    if (html.length > this.maxContentSize) {
                        throw new Error(`Content too large: ${html.length} bytes`);
                    }
                    return html;
                }
                finally {
                    clearTimeout(timeoutId);
                }
            });
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Error fetching content from ${url}:`, error);
            return null;
        }
    }
    /**
     * Runs the extraction strategies in order of reliability.
     *
     * @param {string} html - Raw page markup.
     * @param {string} url - Page URL (used as the document base and in logs).
     * @returns {Promise<object|null>} The first result whose text reaches
     *   `minContentLength`; fallback results carry the accumulated `errors`.
     *   Null when every strategy fails.
     */
    async extractFromHTML(html, url) {
        const errors = [];
        try {
            // Try Readability first (most reliable)
            const readabilityResult = this.extractWithReadability(html, url);
            if (readabilityResult && readabilityResult.textContent.length >= this.minContentLength) {
                return {
                    ...readabilityResult,
                    extractionMethod: 'readability',
                    confidence: 0.9
                };
            }
            else {
                errors.push('Readability extraction failed or content too short');
            }
        }
        catch (error) {
            errors.push(`Readability error: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        try {
            // Fallback to manual extraction
            const fallbackResult = this.extractWithFallback(html, url);
            if (fallbackResult && fallbackResult.textContent.length >= this.minContentLength) {
                return {
                    ...fallbackResult,
                    extractionMethod: 'fallback',
                    confidence: 0.6,
                    errors
                };
            }
            else {
                errors.push('Fallback extraction failed or content too short');
            }
        }
        catch (error) {
            errors.push(`Fallback error: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        // If both methods fail, return null
        console.error(`❌ [ContentExtractor] All extraction methods failed for ${url}:`, errors);
        return null;
    }
    /**
     * Extracts the article with Mozilla Readability on a JSDOM document.
     *
     * @param {string} html
     * @param {string} url
     * @returns {object|null} Extracted content, or null when Readability finds
     *   no article or throws.
     */
    extractWithReadability(html, url) {
        try {
            const dom = new jsdom_1.JSDOM(html, { url });
            const document = dom.window.document;
            const reader = new readability_1.Readability(document);
            const article = reader.parse();
            if (!article) {
                return null;
            }
            // Extract structured data
            const structured = this.extractStructuredData(html, url);
            // Calculate metrics
            const wordCount = this.countWords(article.textContent ?? '');
            const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
            return {
                url,
                title: article.title || '',
                content: article.content || '',
                textContent: article.textContent || '',
                excerpt: article.excerpt || undefined,
                byline: article.byline || undefined,
                publishedTime: this.extractPublishedTime(html),
                siteName: article.siteName || this.extractSiteName(html),
                lang: this.extractLanguage(html),
                structured,
                wordCount,
                readingTime,
                confidence: 0.9,
                extractionMethod: 'readability',
                extractedAt: new Date()
            };
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Readability extraction failed:`, error);
            return null;
        }
    }
    /**
     * Heuristic extraction: strips page chrome, then takes the first element
     * matching a list of common article containers (falling back to <body>).
     *
     * @param {string} html
     * @param {string} url
     * @returns {object|null} Extracted content, or null when no container
     *   holds at least `minContentLength` characters of markup or on error.
     */
    extractWithFallback(html, url) {
        try {
            const $ = cheerio.load(html);
            // Remove unwanted elements
            const unwantedSelectors = [
                'script', 'style', 'nav', 'header', 'footer',
                '.advertisement', '.ads', '.social-share', '.comments',
                '.sidebar', '.navigation', '.menu', '.popup', '.modal'
            ];
            unwantedSelectors.forEach(selector => $(selector).remove());
            // Extract title
            const title = $('h1').first().text().trim() ||
                $('title').text().trim() ||
                $('meta[property="og:title"]').attr('content') || '';
            // Try different content selectors
            const contentSelectors = [
                '.blog-post__body', // Arista blogs, HubSpot blogs
                'article',
                '.article-content',
                '.post-content',
                '.entry-content',
                '.content',
                'main',
                '#content',
                '.story-body'
            ];
            let content = '';
            let contentNode = null;
            for (const selector of contentSelectors) {
                const element = $(selector).first();
                if (element.length > 0) {
                    content = element.html() || '';
                    contentNode = element;
                    if (content.length > this.minContentLength) {
                        break;
                    }
                }
            }
            // If no specific content area found, try to extract from body
            if (content.length < this.minContentLength) {
                contentNode = $('body');
                content = contentNode.html() || '';
            }
            if (!content || content.length < this.minContentLength) {
                return null;
            }
            // Read the text from the selected node itself. Re-wrapping the
            // serialized markup with $() would treat tag-less content as a
            // CSS selector instead of as text.
            const textContent = contentNode.text().trim();
            const wordCount = this.countWords(textContent);
            const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
            // Extract structured data
            const structured = this.extractStructuredData(html, url);
            // Only mark the excerpt as truncated when it actually is.
            const excerpt = textContent.length > 300
                ? textContent.substring(0, 300) + '...'
                : textContent;
            return {
                url,
                title,
                content,
                textContent,
                excerpt,
                publishedTime: this.extractPublishedTime(html),
                siteName: this.extractSiteName(html),
                lang: this.extractLanguage(html),
                structured,
                wordCount,
                readingTime,
                confidence: 0.6,
                extractionMethod: 'fallback',
                extractedAt: new Date()
            };
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Fallback extraction failed:`, error);
            return null;
        }
    }
    /**
     * Collects machine-readable metadata embedded in the page: JSON-LD
     * scripts, OpenGraph and Twitter Card meta tags, and (basic) microdata.
     *
     * @param {string} html
     * @param {string} url - Currently unused; kept for interface stability.
     * @returns {object|undefined} Object with any of `jsonLd`, `openGraph`,
     *   `twitterCard`, `microdata`; undefined when nothing was found.
     */
    extractStructuredData(html, url) {
        const structured = {};
        try {
            const $ = cheerio.load(html);
            // Extract JSON-LD
            const jsonLdScripts = [];
            $('script[type="application/ld+json"]').each((_, element) => {
                try {
                    const jsonText = $(element).html();
                    if (jsonText) {
                        const data = JSON.parse(jsonText);
                        jsonLdScripts.push(data);
                    }
                }
                catch {
                    // Skip malformed JSON-LD
                }
            });
            if (jsonLdScripts.length > 0) {
                structured.jsonLd = jsonLdScripts;
            }
            // Extract OpenGraph tags
            const openGraph = {};
            $('meta[property^="og:"]').each((_, element) => {
                const property = $(element).attr('property');
                const content = $(element).attr('content');
                if (property && content) {
                    openGraph[property] = content;
                }
            });
            if (Object.keys(openGraph).length > 0) {
                structured.openGraph = openGraph;
            }
            // Extract Twitter Card tags
            const twitterCard = {};
            $('meta[name^="twitter:"]').each((_, element) => {
                const name = $(element).attr('name');
                const content = $(element).attr('content');
                if (name && content) {
                    twitterCard[name] = content;
                }
            });
            if (Object.keys(twitterCard).length > 0) {
                structured.twitterCard = twitterCard;
            }
            // Extract microdata (basic support)
            const microdata = [];
            $('[itemscope]').each((_, element) => {
                const $item = $(element);
                const itemType = $item.attr('itemtype');
                if (itemType) {
                    const item = { '@type': itemType };
                    $item.find('[itemprop]').each((_, propElement) => {
                        const $prop = $(propElement);
                        const propName = $prop.attr('itemprop');
                        const propValue = $prop.attr('content') || $prop.text().trim();
                        if (propName && propValue) {
                            item[propName] = propValue;
                        }
                    });
                    microdata.push(item);
                }
            });
            if (microdata.length > 0) {
                structured.microdata = microdata;
            }
        }
        catch (error) {
            console.warn(`⚠️ [ContentExtractor] Error extracting structured data:`, error);
        }
        return Object.keys(structured).length > 0 ? structured : undefined;
    }
    /**
     * Looks for a publication date in common meta tags, <time> elements and
     * date-like class names.
     *
     * @param {string} html
     * @returns {Date|undefined} The first candidate that parses to a valid
     *   Date; undefined when none does or on error.
     */
    extractPublishedTime(html) {
        try {
            const $ = cheerio.load(html);
            // Try different selectors for published time
            const timeSelectors = [
                'meta[property="article:published_time"]',
                'meta[name="datePublished"]',
                'meta[name="publishdate"]',
                'time[datetime]',
                '.published-date',
                '.publish-date',
                '.article-date'
            ];
            for (const selector of timeSelectors) {
                const element = $(selector).first();
                if (element.length > 0) {
                    const timeStr = element.attr('content') || element.attr('datetime') || element.text().trim();
                    if (timeStr) {
                        const date = new Date(timeStr);
                        if (!Number.isNaN(date.getTime())) {
                            return date;
                        }
                    }
                }
            }
            return undefined;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Reads the site name from `og:site_name` or `application-name` meta tags.
     *
     * @param {string} html
     * @returns {string|undefined}
     */
    extractSiteName(html) {
        try {
            const $ = cheerio.load(html);
            return $('meta[property="og:site_name"]').attr('content') ||
                $('meta[name="application-name"]').attr('content') ||
                undefined;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Reads the page language from `<html lang>` or language meta tags.
     *
     * @param {string} html
     * @returns {string|undefined}
     */
    extractLanguage(html) {
        try {
            const $ = cheerio.load(html);
            return $('html').attr('lang') ||
                $('meta[name="language"]').attr('content') ||
                $('meta[http-equiv="content-language"]').attr('content') ||
                undefined;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Counts whitespace-separated words.
     *
     * @param {string} text
     * @returns {number} 0 for empty or missing text.
     */
    countWords(text) {
        if (!text)
            return 0;
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
    }
    /**
     * Validate extracted content quality.
     *
     * Starts from a score of 1.0 and subtracts a penalty for each problem:
     * short text (0.5), missing/short title (0.2) or overlong title (0.1),
     * text-to-HTML ratio below 0.1 (0.2), and more than 30% repeated
     * sentences (0.3).
     *
     * @param {object} content - Object with `title`, `content` (HTML) and
     *   `textContent` strings.
     * @returns {{isValid: boolean, issues: string[], score: number}} `isValid`
     *   is true only when no issue was found; `score` is clamped at 0.
     */
    validateContent(content) {
        const issues = [];
        let score = 1.0;
        // Check minimum content length
        if (content.textContent.length < this.minContentLength) {
            issues.push(`Content too short: ${content.textContent.length} characters`);
            score -= 0.5;
        }
        // Check title quality
        if (!content.title || content.title.length < 10) {
            issues.push('Missing or too short title');
            score -= 0.2;
        }
        else if (content.title.length > 200) {
            issues.push('Title too long');
            score -= 0.1;
        }
        // Check content-to-HTML ratio (detect pages with too much markup).
        // Skipped for empty markup, where the ratio is undefined.
        const htmlLength = content.content.length;
        const textLength = content.textContent.length;
        if (htmlLength > 0 && textLength / htmlLength < 0.1) {
            issues.push('Low text-to-HTML ratio - may be poorly extracted');
            score -= 0.2;
        }
        // Check for duplicate content indicators. Sentences are trimmed before
        // comparison so the leading space left by splitting on '.' does not
        // make otherwise identical sentences look different.
        const sentences = content.textContent
            .split('.')
            .map(sentence => sentence.trim())
            .filter(sentence => sentence.length > 10);
        if (sentences.length > 0) {
            const uniqueSentences = new Set(sentences);
            const duplicateRatio = (sentences.length - uniqueSentences.size) / sentences.length;
            if (duplicateRatio > 0.3) {
                issues.push('High duplicate content detected');
                score -= 0.3;
            }
        }
        return {
            isValid: issues.length === 0 && score >= 0.5,
            issues,
            score: Math.max(0, score)
        };
    }
}
|
|
528
|
+
exports.ContentExtractor = ContentExtractor;
|
|
529
|
+
// Default global instance
|
|
530
|
+
exports.globalContentExtractor = new ContentExtractor();
|
|
531
|
+
//# sourceMappingURL=content-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../../lib/web-scrapers/content-extractor.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAmD;AACnD,iCAA8B;AAC9B,iDAAmC;AACnC,sDAA6B;AAC7B,oEAA6D;AAC7D,qDAAuD;AAsCvD,MAAa,gBAAgB;IAQ3B;QAPiB,cAAS,GAAG,6EAA6E,CAAC;QAC1F,YAAO,GAAG,KAAK,CAAC,CAAC,aAAa;QAC9B,mBAAc,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,WAAW;QAC9C,qBAAgB,GAAG,GAAG,CAAC,CAAC,yBAAyB;QACjD,mBAAc,GAAG,GAAG,CAAC,CAAC,wBAAwB;QAI7D,IAAI,CAAC,cAAc,GAAG;YACpB,WAAW,EAAE,CAAC,GAAW,EAAW,EAAE;gBACpC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;oBAEjC,8BAA8B;oBAC9B,MAAM,aAAa,GAAG;wBACpB,QAAQ,EAAqB,yBAAyB;wBACtD,OAAO,EAAsB,uBAAuB;wBACpD,+BAA+B,EAAG,0BAA0B;wBAC5D,aAAa,EAAgB,2BAA2B;wBACxD,aAAa,EAAgB,8BAA8B;wBAC3D,OAAO,EAAsB,gBAAgB;wBAC7C,QAAQ,EAAqB,kBAAkB;wBAC/C,QAAQ,EAAqB,oBAAoB;wBACjD,QAAQ,CAAqB,oBAAoB;qBAClD,CAAC;oBAEF,OAAO,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;gBAC3D,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,IAAI,CAAC,CAAC,8BAA8B;gBAC7C,CAAC;YACH,CAAC;YAED,WAAW,EAAE,CAAC,GAAW,EAAW,EAAE;gBACpC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;oBAC/C,OAAO,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,KAAK,CAAC;gBACpF,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YAED,iBAAiB,EAAE,CAAC,GAAW,EAAW,EAAE;gBAC1C,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC5B,OAAO,MAAM,CAAC,QAAQ,KAAK,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,QAAQ,CAAC;gBACrE,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,KAAK,CAAC;gBACf,CAAC;YACH,CAAC;SACF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAAC,GAAW;QAC9B,OAAO,CAAC,GAAG,CAAC,0DAA0D,GAAG,EAAE,CAAC,CAAC;QAE7E,IAAI,CAAC;YACH,kBAAkB;YAClB,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,iBAAiB,CAAC,GAAG,CAAC,EAAE,CAAC;gBAChD,MAAM,IAAI,KAAK,CAAC,wBAAwB,GAAG,EAAE,CAAC,CAAC;YACjD,CAAC;YAED,IAAI,IAAI,CAAC,cAAc,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,cAAc,CA
AC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC;gBACjF,MAAM,IAAI,KAAK,CAAC,iCAAiC,GAAG,EAAE,CAAC,CAAC;YAC1D,CAAC;YAED,8BAA8B;YAC9B,MAAM,WAAW,GAAG,MAAM,oCAAmB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC7D,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,IAAI,CAAC,oDAAoD,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;gBAChG,OAAO,IAAI,CAAC;YACd,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YAC1C,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,OAAO,IAAI,CAAC;YACd,CAAC;YAED,yCAAyC;YACzC,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAExD,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;gBACvE,OAAO,IAAI,CAAC;YACd,CAAC;YAED,6BAA6B;YAC7B,IAAI,SAAS,CAAC,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACzD,OAAO,CAAC,IAAI,CAAC,4CAA4C,SAAS,CAAC,WAAW,CAAC,MAAM,YAAY,GAAG,EAAE,CAAC,CAAC;gBACxG,OAAO,IAAI,CAAC;YACd,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,+CAA+C,SAAS,CAAC,SAAS,eAAe,GAAG,EAAE,CAAC,CAAC;YACpG,OAAO,SAAS,CAAC;QAEnB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,sDAAsD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACnF,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY,CAChB,IAAc,EACd,UAGI,EAAE;QAEN,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,+DAA+D,IAAI,CAAC,MAAM,uBAAuB,WAAW,GAAG,CAAC,CAAC;QAE7H,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;QAClC,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CACb,KAAK,CAAC,KAAK,IAAI,EAAE;YACf,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;gBAC9C,SAAS,EAAE,CAAC;gBACZ,OAAO,CAAC,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;gBAClD,OAAO,MAAM,CAAC;YAChB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,KAAK,CAAC,sDAAsD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;gBACnF,SAAS,EAAE,CAAC;gBACZ,OAAO,CAAC,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;gBAClD,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC,CACH,CACF,CAAC;QAEF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAClD,OAAO,CAAC,GAAG,CAAC,yCAAyC,U
AAU,IAAI,IAAI,CAAC,MAAM,aAAa,CAAC,CAAC;QAC7F,OAAO,OAAO,CAAC;IACjB,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,GAAW;QACpC,IAAI,CAAC;YACH,OAAO,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAErE,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAChC,OAAO,EAAE;4BACP,YAAY,EAAE,IAAI,CAAC,SAAS;4BAC5B,QAAQ,EAAE,iEAAiE;4BAC3E,iBAAiB,EAAE,gBAAgB;yBACpC;wBACD,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;oBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;wBACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;oBACrE,CAAC;oBAED,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;oBAC7D,IAAI,aAAa,IAAI,QAAQ,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;wBACnE,MAAM,IAAI,KAAK,CAAC,sBAAsB,aAAa,QAAQ,CAAC,CAAC;oBAC/D,CAAC;oBAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;oBAEnC,IAAI,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;wBACtC,MAAM,IAAI,KAAK,CAAC,sBAAsB,IAAI,CAAC,MAAM,QAAQ,CAAC,CAAC;oBAC7D,CAAC;oBAED,OAAO,IAAI,CAAC;gBAEd,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,oDAAoD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACjF,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,GAAW;QACrD,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,IAAI,CAAC;YACH,wCAAwC;YACxC,MAAM,iBAAiB,GAAG,IAAI,CAAC,sBAAsB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YACjE,IAAI,iBAAiB,IAAI,iBAAiB,CAAC,WAAW,CAAC,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACvF,OAAO;oBACL,GAAG,iBAAiB;oBACpB,gBAAgB,EAAE,aAAa;oBAC/B,UAAU,EAAE,GAAG;iBAChB,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,oDAAoD,CAAC,CAAC;YACpE,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,IAAI,CAAC,sBAAsB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,IAAI,CAAC;YACH,gCAAgC;YAChC,MAAM,cAAc,GAAG,IAAI,CAAC
,mBAAmB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAC3D,IAAI,cAAc,IAAI,cAAc,CAAC,WAAW,CAAC,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACjF,OAAO;oBACL,GAAG,cAAc;oBACjB,gBAAgB,EAAE,UAAU;oBAC5B,UAAU,EAAE,GAAG;oBACf,MAAM;iBACP,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,iDAAiD,CAAC,CAAC;YACjE,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,IAAI,CAAC,mBAAmB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QAC7F,CAAC;QAED,oCAAoC;QACpC,OAAO,CAAC,KAAK,CAAC,0DAA0D,GAAG,GAAG,EAAE,MAAM,CAAC,CAAC;QACxF,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,sBAAsB,CAAC,IAAY,EAAE,GAAW;QACtD,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,QAAQ,CAAC,CAAC;YACzC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,IAAI,CAAC;YACd,CAAC;YAED,0BAA0B;YAC1B,MAAM,UAAU,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAEzD,oBAAoB;YACpB,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC;YAC7D,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;YAE/D,OAAO;gBACL,GAAG;gBACH,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;gBAC1B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;gBAC9B,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;gBACtC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;gBACrC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,aAAa,EAAE,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC;gBAC9C,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBACxD,IAAI,EAAE,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBAChC,UAAU;gBACV,SAAS;gBACT,WAAW;gBACX,UAAU,EAAE,GAAG;gBACf,gBAAgB,EAAE,aAAa;gBAC/B,WAAW,EAAE,IAAI,IAAI,EAAE;aACxB,CAAC;QAEJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,qDAAqD,EAAE,KAAK,CAAC,CAAC;YAC5E,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,mBAAmB,CAAC,IAAY,EAAE,GAAW;QACnD,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,2BAA2B;YAC3B,MAAM,iBAAiB,GAAG;gBACxB,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,QAAQ;gBAC5C,gBAAgB,EAAE,MAAM,EAAE,eAAe,EAAE,WA
AW;gBACtD,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ;aACvD,CAAC;YAEF,iBAAiB,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAE5D,+BAA+B;YAC/B,IAAI,OAAO,GAAG,EAAE,CAAC;YACjB,IAAI,KAAK,GAAG,EAAE,CAAC;YAEf,gBAAgB;YAChB,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;gBACnC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;gBACxB,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;YAEvD,kCAAkC;YAClC,MAAM,gBAAgB,GAAG;gBACvB,kBAAkB,EAAO,8BAA8B;gBACvD,SAAS;gBACT,kBAAkB;gBAClB,eAAe;gBACf,gBAAgB;gBAChB,UAAU;gBACV,MAAM;gBACN,UAAU;gBACV,aAAa;aACd,CAAC;YAEF,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;gBACxC,MAAM,OAAO,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;gBACpC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;oBAC/B,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;wBAC3C,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAED,8DAA8D;YAC9D,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBAC3C,OAAO,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;YACnC,CAAC;YAED,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACvD,OAAO,IAAI,CAAC;YACd,CAAC;YAED,MAAM,WAAW,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,CAAC;YAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;YAE/D,0BAA0B;YAC1B,MAAM,UAAU,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAEzD,OAAO;gBACL,GAAG;gBACH,KAAK;gBACL,OAAO;gBACP,WAAW;gBACX,OAAO,EAAE,WAAW,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK;gBAC9C,aAAa,EAAE,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC;gBAC9C,QAAQ,EAAE,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBACpC,IAAI,EAAE,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBAChC,UAAU;gBACV,SAAS;gBACT,WAAW;gBACX,UAAU,EAAE,GAAG;gBACf,gBAAgB,EAAE,UAAU;gBAC5B,WAAW,EAAE,IAAI,IAAI,EAAE;aACxB,CAAC;QAEJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,kDAAkD,EAAE,KAAK,CAAC,CAAC;YACzE,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,qBAAqB,CAAC,IAAY,
EAAE,GAAW;QACrD,MAAM,UAAU,GAAmC,EAAE,CAAC;QAEtD,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,kBAAkB;YAClB,MAAM,aAAa,GAAU,EAAE,CAAC;YAChC,CAAC,CAAC,oCAAoC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC1D,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;oBACnC,IAAI,QAAQ,EAAE,CAAC;wBACb,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBAClC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,yBAAyB;gBAC3B,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,UAAU,CAAC,MAAM,GAAG,aAAa,CAAC;YACpC,CAAC;YAED,yBAAyB;YACzB,MAAM,SAAS,GAA2B,EAAE,CAAC;YAC7C,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC7C,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBAC7C,MAAM,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC3C,IAAI,QAAQ,IAAI,OAAO,EAAE,CAAC;oBACxB,SAAS,CAAC,QAAQ,CAAC,GAAG,OAAO,CAAC;gBAChC,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtC,UAAU,CAAC,SAAS,GAAG,SAAS,CAAC;YACnC,CAAC;YAED,4BAA4B;YAC5B,MAAM,WAAW,GAA2B,EAAE,CAAC;YAC/C,CAAC,CAAC,wBAAwB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC9C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACrC,MAAM,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC3C,IAAI,IAAI,IAAI,OAAO,EAAE,CAAC;oBACpB,WAAW,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC;gBAC9B,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxC,UAAU,CAAC,WAAW,GAAG,WAAW,CAAC;YACvC,CAAC;YAED,oCAAoC;YACpC,MAAM,SAAS,GAAU,EAAE,CAAC;YAC5B,CAAC,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACnC,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;gBACzB,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACxC,IAAI,QAAQ,EAAE,CAAC;oBACb,MAAM,IAAI,GAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;oBACxC,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE;wBAC/C,MAAM,KAAK,GAAG,CAAC,CAAC,
WAAW,CAAC,CAAC;wBAC7B,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;wBACxC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;wBAC/D,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;4BAC1B,IAAI,CAAC,QAAQ,CAAC,GAAG,SAAS,CAAC;wBAC7B,CAAC;oBACH,CAAC,CAAC,CAAC;oBACH,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,UAAU,CAAC,SAAS,GAAG,SAAS,CAAC;YACnC,CAAC;QAEH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,yDAAyD,EAAE,KAAK,CAAC,CAAC;QACjF,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC;IACrE,CAAC;IAEO,oBAAoB,CAAC,IAAY;QACvC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,6CAA6C;YAC7C,MAAM,aAAa,GAAG;gBACpB,yCAAyC;gBACzC,4BAA4B;gBAC5B,0BAA0B;gBAC1B,gBAAgB;gBAChB,iBAAiB;gBACjB,eAAe;gBACf,eAAe;aAChB,CAAC;YAEF,KAAK,MAAM,QAAQ,IAAI,aAAa,EAAE,CAAC;gBACrC,MAAM,OAAO,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;gBACpC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;oBAC7F,IAAI,OAAO,EAAE,CAAC;wBACZ,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC;wBAC/B,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;4BAC3B,OAAO,IAAI,CAAC;wBACd,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,IAAY;QAClC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,OAAO,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;gBACvD,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;gBAClD,SAAS,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,IAAY;QAClC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;gBAC3B,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;gBAC1C,CAAC,CAAC,qCAAqC,CAAC,CAAC,IAAI,CAAC,SAAS
,CAAC;gBACxD,SAAS,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,UAAU,CAAC,IAAY;QAC7B,IAAI,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QACpB,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;IACzE,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,OAAyB;QAKvC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,GAAG,CAAC;QAEhB,+BAA+B;QAC/B,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACvD,MAAM,CAAC,IAAI,CAAC,sBAAsB,OAAO,CAAC,WAAW,CAAC,MAAM,aAAa,CAAC,CAAC;YAC3E,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAChD,MAAM,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAC;YAC1C,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;aAAM,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACtC,MAAM,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YAC9B,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,kEAAkE;QAClE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC;QAC1C,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,MAAM,CAAC;QAC9C,MAAM,KAAK,GAAG,UAAU,GAAG,UAAU,CAAC;QAEtC,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;YAChB,MAAM,CAAC,IAAI,CAAC,kDAAkD,CAAC,CAAC;YAChE,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,yCAAyC;QACzC,MAAM,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC;QACnF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;QAC3C,MAAM,cAAc,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC;QAEpF,IAAI,cAAc,GAAG,GAAG,EAAE,CAAC;YACzB,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;YAC/C,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,OAAO;YACL,OAAO,EAAE,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,IAAI,GAAG;YAC5C,MAAM;YACN,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC;SAC1B,CAAC;IACJ,CAAC;CACF;AA7iBD,4CA6iBC;AAED,0BAA0B;AACb,QAAA,sBAAsB,GAAG,IAAI,gBAAgB,EAAE,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
declare const PERPLEXITY_MODELS: {
|
|
2
|
+
readonly SONAR: "llama-3.1-sonar-small-128k-online";
|
|
3
|
+
readonly SONAR_PRO: "llama-3.1-sonar-large-128k-online";
|
|
4
|
+
};
|
|
5
|
+
export interface ScrapingConfig {
|
|
6
|
+
selectors?: {
|
|
7
|
+
articleLinks?: string[];
|
|
8
|
+
titleSelectors?: string[];
|
|
9
|
+
dateSelectors?: string[];
|
|
10
|
+
excludeSelectors?: string[];
|
|
11
|
+
};
|
|
12
|
+
filters?: {
|
|
13
|
+
minTitleLength?: number;
|
|
14
|
+
maxTitleLength?: number;
|
|
15
|
+
includePatterns?: RegExp[];
|
|
16
|
+
excludePatterns?: RegExp[];
|
|
17
|
+
allowedDomains?: string[];
|
|
18
|
+
};
|
|
19
|
+
limits?: {
|
|
20
|
+
maxLinksPerPage?: number;
|
|
21
|
+
maxDepth?: number;
|
|
22
|
+
};
|
|
23
|
+
perplexityFallback?: {
|
|
24
|
+
enabled?: boolean;
|
|
25
|
+
model?: typeof PERPLEXITY_MODELS[keyof typeof PERPLEXITY_MODELS];
|
|
26
|
+
useForRobotsBlocked?: boolean;
|
|
27
|
+
useForParseFailed?: boolean;
|
|
28
|
+
searchRecency?: 'hour' | 'day' | 'week' | 'month';
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
export interface ExtractedArticle {
|
|
32
|
+
url: string;
|
|
33
|
+
title?: string;
|
|
34
|
+
publishedDate?: Date;
|
|
35
|
+
description?: string;
|
|
36
|
+
confidence: number;
|
|
37
|
+
source: 'link-text' | 'meta-data' | 'structured-data';
|
|
38
|
+
}
|
|
39
|
+
export declare class HTMLScraper {
|
|
40
|
+
private readonly userAgent;
|
|
41
|
+
private readonly timeout;
|
|
42
|
+
private readonly defaultConfig;
|
|
43
|
+
/**
|
|
44
|
+
* Extract article links from a webpage
|
|
45
|
+
*/
|
|
46
|
+
extractArticleLinks(url: string, config?: ScrapingConfig): Promise<ExtractedArticle[]>;
|
|
47
|
+
/**
|
|
48
|
+
* Extract articles from multiple pages with pagination support
|
|
49
|
+
*/
|
|
50
|
+
extractFromMultiplePages(startUrl: string, config?: ScrapingConfig, options?: {
|
|
51
|
+
maxPages?: number;
|
|
52
|
+
paginationSelector?: string;
|
|
53
|
+
nextPagePatterns?: RegExp[];
|
|
54
|
+
}): Promise<ExtractedArticle[]>;
|
|
55
|
+
private fetchPage;
|
|
56
|
+
private parseArticleLinks;
|
|
57
|
+
private extractArticleInfo;
|
|
58
|
+
private extractStructuredData;
|
|
59
|
+
private findNextPageUrls;
|
|
60
|
+
private deduplicateArticles;
|
|
61
|
+
private passesFilters;
|
|
62
|
+
private isLikelyArticleUrl;
|
|
63
|
+
private parseDate;
|
|
64
|
+
private resolveUrl;
|
|
65
|
+
private mergeConfig;
|
|
66
|
+
/**
|
|
67
|
+
* Use Perplexity API to extract articles when traditional scraping fails
|
|
68
|
+
* Requires PERPLEXITY_API_KEY environment variable to be set
|
|
69
|
+
*/
|
|
70
|
+
private extractWithPerplexity;
|
|
71
|
+
}
|
|
72
|
+
export declare const globalHTMLScraper: HTMLScraper;
|
|
73
|
+
export {};
|
|
74
|
+
//# sourceMappingURL=html-scraper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-scraper.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/html-scraper.ts"],"names":[],"mappings":"AAMA,QAAA,MAAM,iBAAiB;;;CAGb,CAAC;AAEX,MAAM,WAAW,cAAc;IAC7B,SAAS,CAAC,EAAE;QACV,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;QACxB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;QACzB,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC7B,CAAC;IACF,OAAO,CAAC,EAAE;QACR,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;QAC3B,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;QAC3B,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;KAC3B,CAAC;IACF,MAAM,CAAC,EAAE;QACP,eAAe,CAAC,EAAE,MAAM,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,kBAAkB,CAAC,EAAE;QACnB,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,KAAK,CAAC,EAAE,OAAO,iBAAiB,CAAC,MAAM,OAAO,iBAAiB,CAAC,CAAC;QACjE,mBAAmB,CAAC,EAAE,OAAO,CAAC;QAC9B,iBAAiB,CAAC,EAAE,OAAO,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,OAAO,CAAC;KACnD,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,IAAI,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,WAAW,GAAG,WAAW,GAAG,iBAAiB,CAAC;CACvD;AAED,qBAAa,WAAW;IACtB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAiF;IAC3G,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAqE5B;IAEF;;OAEG;IACG,mBAAmB,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,GAAE,cAAmB,GAC1B,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAqD9B;;OAEG;IACG,wBAAwB,CAC5B,QAAQ,EAAE,MAAM,EAChB,MAAM,GAAE,cAAmB,EAC3B,OAAO,GAAE;QACP,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;KACxB,GACL,OAAO,CAAC,gBAAgB,EAAE,CAAC;YAgDhB,SAAS;IAuCvB,OAAO,CAAC,iBAAiB;IA4DzB,OAAO,CAAC,kBAAkB;IAqE1B,OAAO,CAAC,qBAAqB;YAsCf,gBAAgB;IAuC9B,OAAO,CAAC,mBAAmB;IAa3B,OAAO,CAAC,aAAa;IAkCrB,OAAO,CAAC,kBAAkB;IAiB1B,OAAO,CAAC,SAAS;IA8BjB,OAAO,CAAC,UAAU;IAQlB,OAAO,CAAC,WAAW;IAiCnB;;;OAGG;YACW,qBAAqB;CAyEpC;AAGD,eAAO,MAAM,iBAAiB,aAAoB,CAAC"}
|