@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Playwright-based scraper for JavaScript-rendered pages
|
|
3
|
+
*
|
|
4
|
+
* Used as a fallback when static HTML scraping fails (e.g., Next.js, React, Vue sites)
|
|
5
|
+
* Returns the same ExtractedArticle format as HTMLScraper for consistency
|
|
6
|
+
*/
|
|
7
|
+
import { ExtractedArticle, ScrapingConfig } from './html-scraper';
|
|
8
|
+
/** Per-call options for PlaywrightScraper: everything in ScrapingConfig plus the browser-specific settings below. */
export interface PlaywrightScraperConfig extends ScrapingConfig {
|
|
9
|
+
/** Selector to wait for after navigation and before extracting (optional; no extra wait when omitted) */
|
|
10
|
+
waitForSelector?: string;
|
|
11
|
+
/** Maximum time in ms for page navigation; the same value bounds the waitForSelector wait (default: 30000) */
|
|
12
|
+
timeout?: number;
|
|
13
|
+
/** Whether to abort image/font/media requests for faster loading (default: true). Applied by extractArticleLinks only; fetchRenderedContent never blocks resources. */
|
|
14
|
+
blockMedia?: boolean;
|
|
15
|
+
/** Custom viewport size for the browser context (default: 1280x720) */
|
|
16
|
+
viewport?: {
|
|
17
|
+
width: number;
|
|
18
|
+
|
|
19
|
+
height: number;
|
|
19
|
+
};
|
|
20
|
+
}
|
|
21
|
+
/** Scraper that drives headless Chromium through Playwright; the browser is launched on first use and reused until close(). */
export declare class PlaywrightScraper {
|
|
22
|
+
/** Launched browser, or null while none is running. */
private browser;
|
|
23
|
+
/** User-Agent string applied to every browser context this scraper opens. */
private readonly userAgent;
|
|
24
|
+
/** Built-in timeout, viewport, selectors, URL filters and link limit that per-call config is merged over. */
private readonly defaultConfig;
|
|
25
|
+
/**
|
|
26
|
+
* Return the shared browser instance, launching it if none is running
|
|
27
|
+
*/
|
|
28
|
+
private getBrowser;
|
|
29
|
+
/**
|
|
30
|
+
* Close the browser instance if one is running; a later scrape launches a new one
|
|
31
|
+
*/
|
|
32
|
+
close(): Promise<void>;
|
|
33
|
+
/**
|
|
34
|
+
* Extract article links from a JavaScript-rendered page. Failures while loading or reading the page are logged and the articles collected so far (typically none) are returned.
|
|
35
|
+
*/
|
|
36
|
+
extractArticleLinks(url: string, config?: PlaywrightScraperConfig): Promise<ExtractedArticle[]>;
|
|
37
|
+
/**
|
|
38
|
+
* Fetch fully rendered HTML content from a page
|
|
39
|
+
* Useful for content extraction on JS-rendered article pages; resolves to null (after logging) when loading fails
|
|
40
|
+
*/
|
|
41
|
+
fetchRenderedContent(url: string, config?: PlaywrightScraperConfig): Promise<string | null>;
|
|
42
|
+
/**
|
|
43
|
+
* Check a URL against the exclude/include regex filters
|
|
44
|
+
*/
|
|
45
|
+
private passesFilters;
|
|
46
|
+
/**
|
|
47
|
+
* Check if URL looks like an article (path segments such as /blog/, /news/ or a /YYYY/MM/ date)
|
|
48
|
+
*/
|
|
49
|
+
private isLikelyArticleUrl;
|
|
50
|
+
/**
|
|
51
|
+
* Merge per-call configuration over the defaults; selector lists are concatenated rather than replaced
|
|
52
|
+
*/
|
|
53
|
+
private mergeConfig;
|
|
54
|
+
}
|
|
55
|
+
/** Returns the module-wide PlaywrightScraper, creating it on the first call and reusing it afterwards. */
export declare function getPlaywrightScraper(): PlaywrightScraper;
|
|
56
|
+
/** Closes the module-wide scraper if one exists and discards it, so the next getPlaywrightScraper() call builds a new instance. */
export declare function closePlaywrightScraper(): Promise<void>;
|
|
57
|
+
//# sourceMappingURL=playwright-scraper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"playwright-scraper.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/playwright-scraper.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAElE,MAAM,WAAW,uBAAwB,SAAQ,cAAc;IAC7D,8DAA8D;IAC9D,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,gEAAgE;IAChE,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uEAAuE;IACvE,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,2BAA2B;IAC3B,QAAQ,CAAC,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;CAC9C;AAED,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAA2H;IAErJ,OAAO,CAAC,QAAQ,CAAC,aAAa,CAsD5B;IAEF;;OAEG;YACW,UAAU;IAiBxB;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAQ5B;;OAEG;IACG,mBAAmB,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,GAAE,uBAA4B,GACnC,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAwJ9B;;;OAGG;IACG,oBAAoB,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,GAAE,uBAA4B,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;IA2CrG;;OAEG;IACH,OAAO,CAAC,aAAa;IAiBrB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAe1B;;OAEG;IACH,OAAO,CAAC,WAAW;CA6BpB;AAKD,wBAAgB,oBAAoB,IAAI,iBAAiB,CAKxD;AAGD,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,IAAI,CAAC,CAK5D"}
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Playwright-based scraper for JavaScript-rendered pages
|
|
4
|
+
*
|
|
5
|
+
* Used as a fallback when static HTML scraping fails (e.g., Next.js, React, Vue sites)
|
|
6
|
+
* Returns the same ExtractedArticle format as HTMLScraper for consistency
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.PlaywrightScraper = void 0;
|
|
10
|
+
exports.getPlaywrightScraper = getPlaywrightScraper;
|
|
11
|
+
exports.closePlaywrightScraper = closePlaywrightScraper;
|
|
12
|
+
const playwright_1 = require("playwright");
|
|
13
|
+
class PlaywrightScraper {
|
|
14
|
+
constructor() {
|
|
15
|
+
this.browser = null;
|
|
16
|
+
this.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
17
|
+
this.defaultConfig = {
|
|
18
|
+
timeout: 30000,
|
|
19
|
+
blockMedia: true,
|
|
20
|
+
viewport: { width: 1280, height: 720 },
|
|
21
|
+
selectors: {
|
|
22
|
+
articleLinks: [
|
|
23
|
+
'article a[href]',
|
|
24
|
+
'.article a[href]',
|
|
25
|
+
'.post a[href]',
|
|
26
|
+
'.story a[href]',
|
|
27
|
+
'.news-item a[href]',
|
|
28
|
+
'.card a[href]',
|
|
29
|
+
'[class*="article"] a[href]',
|
|
30
|
+
'[class*="post"] a[href]',
|
|
31
|
+
'[class*="news"] a[href]',
|
|
32
|
+
'[class*="story"] a[href]',
|
|
33
|
+
'h1 a[href]',
|
|
34
|
+
'h2 a[href]',
|
|
35
|
+
'h3 a[href]',
|
|
36
|
+
'.headline a[href]',
|
|
37
|
+
'.title a[href]',
|
|
38
|
+
// Common list patterns
|
|
39
|
+
'ul li a[href]',
|
|
40
|
+
'.list-item a[href]',
|
|
41
|
+
'[role="listitem"] a[href]',
|
|
42
|
+
],
|
|
43
|
+
excludeSelectors: [
|
|
44
|
+
'nav',
|
|
45
|
+
'header',
|
|
46
|
+
'footer',
|
|
47
|
+
'.navigation',
|
|
48
|
+
'.menu',
|
|
49
|
+
'.sidebar',
|
|
50
|
+
'.advertisement',
|
|
51
|
+
'.ads',
|
|
52
|
+
'.comments',
|
|
53
|
+
'.social-share',
|
|
54
|
+
'[aria-hidden="true"]',
|
|
55
|
+
]
|
|
56
|
+
},
|
|
57
|
+
filters: {
|
|
58
|
+
minTitleLength: 10,
|
|
59
|
+
maxTitleLength: 300,
|
|
60
|
+
excludePatterns: [
|
|
61
|
+
/\/(tag|category|author|search|archive|login|register|contact|about|privacy|terms)\//i,
|
|
62
|
+
/\.(pdf|jpg|jpeg|png|gif|mp4|zip|doc)$/i,
|
|
63
|
+
/#/,
|
|
64
|
+
/javascript:/i,
|
|
65
|
+
/mailto:/i
|
|
66
|
+
]
|
|
67
|
+
},
|
|
68
|
+
limits: {
|
|
69
|
+
maxLinksPerPage: 100
|
|
70
|
+
}
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Initialize browser instance
|
|
75
|
+
*/
|
|
76
|
+
async getBrowser() {
|
|
77
|
+
if (!this.browser) {
|
|
78
|
+
console.log('🎭 [Playwright] Launching browser...');
|
|
79
|
+
this.browser = await playwright_1.chromium.launch({
|
|
80
|
+
headless: true,
|
|
81
|
+
args: [
|
|
82
|
+
'--no-sandbox',
|
|
83
|
+
'--disable-setuid-sandbox',
|
|
84
|
+
'--disable-dev-shm-usage',
|
|
85
|
+
'--disable-accelerated-2d-canvas',
|
|
86
|
+
'--disable-gpu'
|
|
87
|
+
]
|
|
88
|
+
});
|
|
89
|
+
}
|
|
90
|
+
return this.browser;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Close browser instance
|
|
94
|
+
*/
|
|
95
|
+
async close() {
|
|
96
|
+
if (this.browser) {
|
|
97
|
+
await this.browser.close();
|
|
98
|
+
this.browser = null;
|
|
99
|
+
console.log('🎭 [Playwright] Browser closed');
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Extract article links from a JavaScript-rendered page
|
|
104
|
+
*/
|
|
105
|
+
async extractArticleLinks(url, config = {}) {
|
|
106
|
+
console.log(`🎭 [Playwright] Extracting articles from ${url}`);
|
|
107
|
+
const mergedConfig = this.mergeConfig(this.defaultConfig, config);
|
|
108
|
+
const browser = await this.getBrowser();
|
|
109
|
+
const context = await browser.newContext({
|
|
110
|
+
userAgent: this.userAgent,
|
|
111
|
+
viewport: mergedConfig.viewport,
|
|
112
|
+
});
|
|
113
|
+
const page = await context.newPage();
|
|
114
|
+
const articles = [];
|
|
115
|
+
try {
|
|
116
|
+
// Block unnecessary resources for faster loading
|
|
117
|
+
if (mergedConfig.blockMedia) {
|
|
118
|
+
await page.route('**/*', (route) => {
|
|
119
|
+
const resourceType = route.request().resourceType();
|
|
120
|
+
if (['image', 'font', 'media'].includes(resourceType)) {
|
|
121
|
+
route.abort();
|
|
122
|
+
}
|
|
123
|
+
else {
|
|
124
|
+
route.continue();
|
|
125
|
+
}
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
// Navigate to page
|
|
129
|
+
console.log(`🎭 [Playwright] Loading ${url}...`);
|
|
130
|
+
await page.goto(url, {
|
|
131
|
+
waitUntil: 'networkidle',
|
|
132
|
+
timeout: mergedConfig.timeout
|
|
133
|
+
});
|
|
134
|
+
// Wait for custom selector if specified
|
|
135
|
+
if (mergedConfig.waitForSelector) {
|
|
136
|
+
await page.waitForSelector(mergedConfig.waitForSelector, {
|
|
137
|
+
timeout: mergedConfig.timeout
|
|
138
|
+
});
|
|
139
|
+
}
|
|
140
|
+
// Give JS a moment to finish rendering
|
|
141
|
+
await page.waitForTimeout(1000);
|
|
142
|
+
// Remove excluded elements
|
|
143
|
+
for (const selector of mergedConfig.selectors?.excludeSelectors || []) {
|
|
144
|
+
await page.evaluate((sel) => {
|
|
145
|
+
document.querySelectorAll(sel).forEach(el => el.remove());
|
|
146
|
+
}, selector);
|
|
147
|
+
}
|
|
148
|
+
// Extract articles using configured selectors
|
|
149
|
+
const extractedData = await page.evaluate((selectors) => {
|
|
150
|
+
const results = [];
|
|
151
|
+
const seenUrls = new Set();
|
|
152
|
+
for (const selector of selectors) {
|
|
153
|
+
const links = document.querySelectorAll(selector);
|
|
154
|
+
links.forEach((link) => {
|
|
155
|
+
const anchor = link;
|
|
156
|
+
const href = anchor.href;
|
|
157
|
+
if (!href || seenUrls.has(href))
|
|
158
|
+
return;
|
|
159
|
+
seenUrls.add(href);
|
|
160
|
+
// Get title from link text or nearby heading
|
|
161
|
+
let title = anchor.textContent?.trim() || '';
|
|
162
|
+
// Try to find better title from parent article/card
|
|
163
|
+
const parent = anchor.closest('article, [class*="card"], [class*="post"], [class*="item"], li');
|
|
164
|
+
if (parent) {
|
|
165
|
+
const heading = parent.querySelector('h1, h2, h3, h4, .title, .headline');
|
|
166
|
+
if (heading) {
|
|
167
|
+
const headingText = heading.textContent?.trim();
|
|
168
|
+
if (headingText && headingText.length > title.length) {
|
|
169
|
+
title = headingText;
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
// Get description
|
|
173
|
+
const desc = parent.querySelector('p, .excerpt, .summary, .description');
|
|
174
|
+
const description = desc?.textContent?.trim();
|
|
175
|
+
// Get date
|
|
176
|
+
const dateEl = parent.querySelector('time, [datetime], .date, .published');
|
|
177
|
+
const date = dateEl?.getAttribute('datetime') || dateEl?.textContent?.trim();
|
|
178
|
+
if (title && title.length >= 10) {
|
|
179
|
+
results.push({
|
|
180
|
+
url: href,
|
|
181
|
+
title,
|
|
182
|
+
date,
|
|
183
|
+
description: description?.substring(0, 300)
|
|
184
|
+
});
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
else if (title && title.length >= 10) {
|
|
188
|
+
results.push({ url: href, title });
|
|
189
|
+
}
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
return results;
|
|
193
|
+
}, mergedConfig.selectors?.articleLinks || []);
|
|
194
|
+
// Process and filter results
|
|
195
|
+
for (const item of extractedData) {
|
|
196
|
+
if (articles.length >= (mergedConfig.limits?.maxLinksPerPage || 100))
|
|
197
|
+
break;
|
|
198
|
+
// Apply URL filters
|
|
199
|
+
if (!this.passesFilters(item.url, mergedConfig.filters))
|
|
200
|
+
continue;
|
|
201
|
+
// Parse date if available
|
|
202
|
+
let publishedDate;
|
|
203
|
+
if (item.date) {
|
|
204
|
+
const parsed = new Date(item.date);
|
|
205
|
+
if (!isNaN(parsed.getTime())) {
|
|
206
|
+
publishedDate = parsed;
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
// Calculate confidence based on data quality
|
|
210
|
+
let confidence = 0.6; // Base confidence for Playwright extraction
|
|
211
|
+
if (publishedDate)
|
|
212
|
+
confidence += 0.1;
|
|
213
|
+
if (item.description)
|
|
214
|
+
confidence += 0.1;
|
|
215
|
+
if (this.isLikelyArticleUrl(item.url))
|
|
216
|
+
confidence += 0.1;
|
|
217
|
+
articles.push({
|
|
218
|
+
url: item.url,
|
|
219
|
+
title: item.title,
|
|
220
|
+
publishedDate,
|
|
221
|
+
description: item.description,
|
|
222
|
+
confidence: Math.min(confidence, 1.0),
|
|
223
|
+
source: 'link-text'
|
|
224
|
+
});
|
|
225
|
+
}
|
|
226
|
+
console.log(`🎭 [Playwright] Extracted ${articles.length} articles from ${url}`);
|
|
227
|
+
}
|
|
228
|
+
catch (error) {
|
|
229
|
+
console.error(`❌ [Playwright] Error extracting from ${url}:`, error);
|
|
230
|
+
}
|
|
231
|
+
finally {
|
|
232
|
+
await context.close();
|
|
233
|
+
}
|
|
234
|
+
return articles;
|
|
235
|
+
}
|
|
236
|
+
/**
|
|
237
|
+
* Fetch fully rendered HTML content from a page
|
|
238
|
+
* Useful for content extraction on JS-rendered article pages
|
|
239
|
+
*/
|
|
240
|
+
async fetchRenderedContent(url, config = {}) {
|
|
241
|
+
console.log(`🎭 [Playwright] Fetching rendered content from ${url}`);
|
|
242
|
+
const mergedConfig = this.mergeConfig(this.defaultConfig, config);
|
|
243
|
+
const browser = await this.getBrowser();
|
|
244
|
+
const context = await browser.newContext({
|
|
245
|
+
userAgent: this.userAgent,
|
|
246
|
+
viewport: mergedConfig.viewport,
|
|
247
|
+
});
|
|
248
|
+
const page = await context.newPage();
|
|
249
|
+
try {
|
|
250
|
+
// Don't block images for content extraction - we might need them
|
|
251
|
+
await page.goto(url, {
|
|
252
|
+
waitUntil: 'networkidle',
|
|
253
|
+
timeout: mergedConfig.timeout
|
|
254
|
+
});
|
|
255
|
+
// Wait for custom selector if specified
|
|
256
|
+
if (mergedConfig.waitForSelector) {
|
|
257
|
+
await page.waitForSelector(mergedConfig.waitForSelector, {
|
|
258
|
+
timeout: mergedConfig.timeout
|
|
259
|
+
});
|
|
260
|
+
}
|
|
261
|
+
// Give JS a moment to finish rendering
|
|
262
|
+
await page.waitForTimeout(1000);
|
|
263
|
+
// Get the full HTML
|
|
264
|
+
const html = await page.content();
|
|
265
|
+
console.log(`🎭 [Playwright] Fetched ${html.length} bytes of rendered HTML`);
|
|
266
|
+
return html;
|
|
267
|
+
}
|
|
268
|
+
catch (error) {
|
|
269
|
+
console.error(`❌ [Playwright] Error fetching content from ${url}:`, error);
|
|
270
|
+
return null;
|
|
271
|
+
}
|
|
272
|
+
finally {
|
|
273
|
+
await context.close();
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
/**
|
|
277
|
+
* Check if URL passes filters
|
|
278
|
+
*/
|
|
279
|
+
passesFilters(url, filters) {
|
|
280
|
+
if (!filters)
|
|
281
|
+
return true;
|
|
282
|
+
// Check exclude patterns
|
|
283
|
+
if (filters.excludePatterns?.some(pattern => pattern.test(url))) {
|
|
284
|
+
return false;
|
|
285
|
+
}
|
|
286
|
+
// Check include patterns if specified
|
|
287
|
+
if (filters.includePatterns?.length &&
|
|
288
|
+
!filters.includePatterns.some(pattern => pattern.test(url))) {
|
|
289
|
+
return false;
|
|
290
|
+
}
|
|
291
|
+
return true;
|
|
292
|
+
}
|
|
293
|
+
/**
|
|
294
|
+
* Check if URL looks like an article
|
|
295
|
+
*/
|
|
296
|
+
isLikelyArticleUrl(url) {
|
|
297
|
+
const articlePatterns = [
|
|
298
|
+
/\/article[s]?\//i,
|
|
299
|
+
/\/post[s]?\//i,
|
|
300
|
+
/\/story\//i,
|
|
301
|
+
/\/stories\//i,
|
|
302
|
+
/\/news\//i,
|
|
303
|
+
/\/blog\//i,
|
|
304
|
+
/\/\d{4}\/\d{2}\/\d{2}\//,
|
|
305
|
+
/\/\d{4}\/\d{2}\//
|
|
306
|
+
];
|
|
307
|
+
return articlePatterns.some(pattern => pattern.test(url));
|
|
308
|
+
}
|
|
309
|
+
/**
|
|
310
|
+
* Merge configurations
|
|
311
|
+
*/
|
|
312
|
+
mergeConfig(defaultConfig, userConfig) {
|
|
313
|
+
return {
|
|
314
|
+
...defaultConfig,
|
|
315
|
+
...userConfig,
|
|
316
|
+
selectors: {
|
|
317
|
+
...defaultConfig.selectors,
|
|
318
|
+
...userConfig.selectors,
|
|
319
|
+
articleLinks: [
|
|
320
|
+
...(defaultConfig.selectors?.articleLinks || []),
|
|
321
|
+
...(userConfig.selectors?.articleLinks || [])
|
|
322
|
+
],
|
|
323
|
+
excludeSelectors: [
|
|
324
|
+
...(defaultConfig.selectors?.excludeSelectors || []),
|
|
325
|
+
...(userConfig.selectors?.excludeSelectors || [])
|
|
326
|
+
]
|
|
327
|
+
},
|
|
328
|
+
filters: {
|
|
329
|
+
...defaultConfig.filters,
|
|
330
|
+
...userConfig.filters
|
|
331
|
+
},
|
|
332
|
+
limits: {
|
|
333
|
+
...defaultConfig.limits,
|
|
334
|
+
...userConfig.limits
|
|
335
|
+
}
|
|
336
|
+
};
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
exports.PlaywrightScraper = PlaywrightScraper;
|
|
340
|
+
// Global instance with lazy initialization
|
|
341
|
+
let globalPlaywrightScraper = null;
|
|
342
|
+
function getPlaywrightScraper() {
|
|
343
|
+
if (!globalPlaywrightScraper) {
|
|
344
|
+
globalPlaywrightScraper = new PlaywrightScraper();
|
|
345
|
+
}
|
|
346
|
+
return globalPlaywrightScraper;
|
|
347
|
+
}
|
|
348
|
+
// Cleanup function for graceful shutdown
|
|
349
|
+
async function closePlaywrightScraper() {
|
|
350
|
+
if (globalPlaywrightScraper) {
|
|
351
|
+
await globalPlaywrightScraper.close();
|
|
352
|
+
globalPlaywrightScraper = null;
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
//# sourceMappingURL=playwright-scraper.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"playwright-scraper.js","sourceRoot":"","sources":["../../../lib/web-scrapers/playwright-scraper.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAmYH,oDAKC;AAGD,wDAKC;AA9YD,2CAAqD;AAcrD,MAAa,iBAAiB;IAA9B;QACU,YAAO,GAAmB,IAAI,CAAC;QACtB,cAAS,GAAG,uHAAuH,CAAC;QAEpI,kBAAa,GAA4B;YACxD,OAAO,EAAE,KAAK;YACd,UAAU,EAAE,IAAI;YAChB,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;YACtC,SAAS,EAAE;gBACT,YAAY,EAAE;oBACZ,iBAAiB;oBACjB,kBAAkB;oBAClB,eAAe;oBACf,gBAAgB;oBAChB,oBAAoB;oBACpB,eAAe;oBACf,4BAA4B;oBAC5B,yBAAyB;oBACzB,yBAAyB;oBACzB,0BAA0B;oBAC1B,YAAY;oBACZ,YAAY;oBACZ,YAAY;oBACZ,mBAAmB;oBACnB,gBAAgB;oBAChB,uBAAuB;oBACvB,eAAe;oBACf,oBAAoB;oBACpB,2BAA2B;iBAC5B;gBACD,gBAAgB,EAAE;oBAChB,KAAK;oBACL,QAAQ;oBACR,QAAQ;oBACR,aAAa;oBACb,OAAO;oBACP,UAAU;oBACV,gBAAgB;oBAChB,MAAM;oBACN,WAAW;oBACX,eAAe;oBACf,sBAAsB;iBACvB;aACF;YACD,OAAO,EAAE;gBACP,cAAc,EAAE,EAAE;gBAClB,cAAc,EAAE,GAAG;gBACnB,eAAe,EAAE;oBACf,sFAAsF;oBACtF,wCAAwC;oBACxC,GAAG;oBACH,cAAc;oBACd,UAAU;iBACX;aACF;YACD,MAAM,EAAE;gBACN,eAAe,EAAE,GAAG;aACrB;SACF,CAAC;IAoTJ,CAAC;IAlTC;;OAEG;IACK,KAAK,CAAC,UAAU;QACtB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,OAAO,CAAC,GAAG,CAAC,sCAAsC,CAAC,CAAC;YACpD,IAAI,CAAC,OAAO,GAAG,MAAM,qBAAQ,CAAC,MAAM,CAAC;gBACnC,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,iCAAiC;oBACjC,eAAe;iBAChB;aACF,CAAC,CAAC;QACL,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;YACpB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,mBAAmB,CACvB,GAAW,EACX,SAAkC,EAAE;QAEpC,OAAO,CAAC,GAAG,CAAC,4CAA4C,GAAG,EAAE,CAAC,CAAC;QAE/D,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;YACvC,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,QAAQ,EAAE,YAAY,CAAC,QAAQ;SAChC,CAAC,CAAC;QAEH,MAAM,IAAI,GAAG,MAAM,OAAO,CA
AC,OAAO,EAAE,CAAC;QACrC,MAAM,QAAQ,GAAuB,EAAE,CAAC;QAExC,IAAI,CAAC;YACH,iDAAiD;YACjD,IAAI,YAAY,CAAC,UAAU,EAAE,CAAC;gBAC5B,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;oBACjC,MAAM,YAAY,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;oBACpD,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,KAAK,EAAE,CAAC;oBAChB,CAAC;yBAAM,CAAC;wBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;oBACnB,CAAC;gBACH,CAAC,CAAC,CAAC;YACL,CAAC;YAED,mBAAmB;YACnB,OAAO,CAAC,GAAG,CAAC,2BAA2B,GAAG,KAAK,CAAC,CAAC;YACjD,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,YAAY,CAAC,OAAO;aAC9B,CAAC,CAAC;YAEH,wCAAwC;YACxC,IAAI,YAAY,CAAC,eAAe,EAAE,CAAC;gBACjC,MAAM,IAAI,CAAC,eAAe,CAAC,YAAY,CAAC,eAAe,EAAE;oBACvD,OAAO,EAAE,YAAY,CAAC,OAAO;iBAC9B,CAAC,CAAC;YACL,CAAC;YAED,uCAAuC;YACvC,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEhC,2BAA2B;YAC3B,KAAK,MAAM,QAAQ,IAAI,YAAY,CAAC,SAAS,EAAE,gBAAgB,IAAI,EAAE,EAAE,CAAC;gBACtE,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,EAAE;oBAC1B,QAAQ,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC,EAAE,QAAQ,CAAC,CAAC;YACf,CAAC;YAED,8CAA8C;YAC9C,MAAM,aAAa,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC,SAAS,EAAE,EAAE;gBACtD,MAAM,OAAO,GAKR,EAAE,CAAC;gBACR,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;gBAEnC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;oBACjC,MAAM,KAAK,GAAG,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;oBAElD,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;wBACrB,MAAM,MAAM,GAAG,IAAyB,CAAC;wBACzC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC;wBAEzB,IAAI,CAAC,IAAI,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC;4BAAE,OAAO;wBACxC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;wBAEnB,6CAA6C;wBAC7C,IAAI,KAAK,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;wBAE7C,oDAAoD;wBACpD,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,gEAAgE,CAAC,CAAC;wBAChG,IAAI,MAAM,EAAE,CAAC;4BACX,MAAM,OAAO,GAAG,MAAM,CAAC,aAAa,CAAC,mCAAmC,CAAC,CAAC;4BAC1E,IAAI,OAAO,EAAE,CAAC;gCACZ,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;gCAChD,IAAI,WAAW,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;o
CACrD,KAAK,GAAG,WAAW,CAAC;gCACtB,CAAC;4BACH,CAAC;4BAED,kBAAkB;4BAClB,MAAM,IAAI,GAAG,MAAM,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;4BACzE,MAAM,WAAW,GAAG,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;4BAE9C,WAAW;4BACX,MAAM,MAAM,GAAG,MAAM,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;4BAC3E,MAAM,IAAI,GAAG,MAAM,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;4BAE7E,IAAI,KAAK,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gCAChC,OAAO,CAAC,IAAI,CAAC;oCACX,GAAG,EAAE,IAAI;oCACT,KAAK;oCACL,IAAI;oCACJ,WAAW,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC;iCAC5C,CAAC,CAAC;4BACL,CAAC;wBACH,CAAC;6BAAM,IAAI,KAAK,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;4BACvC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;wBACrC,CAAC;oBACH,CAAC,CAAC,CAAC;gBACL,CAAC;gBAED,OAAO,OAAO,CAAC;YACjB,CAAC,EAAE,YAAY,CAAC,SAAS,EAAE,YAAY,IAAI,EAAE,CAAC,CAAC;YAE/C,6BAA6B;YAC7B,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;gBACjC,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,eAAe,IAAI,GAAG,CAAC;oBAAE,MAAM;gBAE5E,oBAAoB;gBACpB,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,EAAE,YAAY,CAAC,OAAO,CAAC;oBAAE,SAAS;gBAElE,0BAA0B;gBAC1B,IAAI,aAA+B,CAAC;gBACpC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;oBACd,MAAM,MAAM,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACnC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;wBAC7B,aAAa,GAAG,MAAM,CAAC;oBACzB,CAAC;gBACH,CAAC;gBAED,6CAA6C;gBAC7C,IAAI,UAAU,GAAG,GAAG,CAAC,CAAC,4CAA4C;gBAClE,IAAI,aAAa;oBAAE,UAAU,IAAI,GAAG,CAAC;gBACrC,IAAI,IAAI,CAAC,WAAW;oBAAE,UAAU,IAAI,GAAG,CAAC;gBACxC,IAAI,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,GAAG,CAAC;oBAAE,UAAU,IAAI,GAAG,CAAC;gBAEzD,QAAQ,CAAC,IAAI,CAAC;oBACZ,GAAG,EAAE,IAAI,CAAC,GAAG;oBACb,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,aAAa;oBACb,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC;oBACrC,MAAM,EAAE,WAAW;iBACpB,CAAC,CAAC;YACL,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,6BAA6B,QAAQ,CAAC,MAAM,kBAAkB,GAAG,EAAE,CAAC,CAAC;QAEnF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;QACvE,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,KAAK,EAA
E,CAAC;QACxB,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,oBAAoB,CAAC,GAAW,EAAE,SAAkC,EAAE;QAC1E,OAAO,CAAC,GAAG,CAAC,kDAAkD,GAAG,EAAE,CAAC,CAAC;QAErE,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;YACvC,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,QAAQ,EAAE,YAAY,CAAC,QAAQ;SAChC,CAAC,CAAC;QAEH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QAErC,IAAI,CAAC;YACH,iEAAiE;YACjE,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,YAAY,CAAC,OAAO;aAC9B,CAAC,CAAC;YAEH,wCAAwC;YACxC,IAAI,YAAY,CAAC,eAAe,EAAE,CAAC;gBACjC,MAAM,IAAI,CAAC,eAAe,CAAC,YAAY,CAAC,eAAe,EAAE;oBACvD,OAAO,EAAE,YAAY,CAAC,OAAO;iBAC9B,CAAC,CAAC;YACL,CAAC;YAED,uCAAuC;YACvC,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEhC,oBAAoB;YACpB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,2BAA2B,IAAI,CAAC,MAAM,yBAAyB,CAAC,CAAC;YAE7E,OAAO,IAAI,CAAC;QAEd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,8CAA8C,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YAC3E,OAAO,IAAI,CAAC;QACd,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,GAAW,EAAE,OAAmC;QACpE,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,yBAAyB;QACzB,IAAI,OAAO,CAAC,eAAe,EAAE,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChE,OAAO,KAAK,CAAC;QACf,CAAC;QAED,sCAAsC;QACtC,IAAI,OAAO,CAAC,eAAe,EAAE,MAAM;YAC/B,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChE,OAAO,KAAK,CAAC;QACf,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,GAAW;QACpC,MAAM,eAAe,GAAG;YACtB,kBAAkB;YAClB,eAAe;YACf,YAAY;YACZ,cAAc;YACd,WAAW;YACX,WAAW;YACX,yBAAyB;YACzB,kBAAkB;SACnB,CAAC;QAEF,OAAO,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,aAAsC,EACtC,UAAmC;QAEnC,OAAO;YACL,GAAG,aAAa;YAChB,GAAG,UAAU;YACb,SAAS,EAAE;gBACT,GAAG,aAAa,CAAC,SAAS;gBAC1
B,GAAG,UAAU,CAAC,SAAS;gBACvB,YAAY,EAAE;oBACZ,GAAG,CAAC,aAAa,CAAC,SAAS,EAAE,YAAY,IAAI,EAAE,CAAC;oBAChD,GAAG,CAAC,UAAU,CAAC,SAAS,EAAE,YAAY,IAAI,EAAE,CAAC;iBAC9C;gBACD,gBAAgB,EAAE;oBAChB,GAAG,CAAC,aAAa,CAAC,SAAS,EAAE,gBAAgB,IAAI,EAAE,CAAC;oBACpD,GAAG,CAAC,UAAU,CAAC,SAAS,EAAE,gBAAgB,IAAI,EAAE,CAAC;iBAClD;aACF;YACD,OAAO,EAAE;gBACP,GAAG,aAAa,CAAC,OAAO;gBACxB,GAAG,UAAU,CAAC,OAAO;aACtB;YACD,MAAM,EAAE;gBACN,GAAG,aAAa,CAAC,MAAM;gBACvB,GAAG,UAAU,CAAC,MAAM;aACrB;SACF,CAAC;IACJ,CAAC;CACF;AA9WD,8CA8WC;AAED,2CAA2C;AAC3C,IAAI,uBAAuB,GAA6B,IAAI,CAAC;AAE7D,SAAgB,oBAAoB;IAClC,IAAI,CAAC,uBAAuB,EAAE,CAAC;QAC7B,uBAAuB,GAAG,IAAI,iBAAiB,EAAE,CAAC;IACpD,CAAC;IACD,OAAO,uBAAuB,CAAC;AACjC,CAAC;AAED,yCAAyC;AAClC,KAAK,UAAU,sBAAsB;IAC1C,IAAI,uBAAuB,EAAE,CAAC;QAC5B,MAAM,uBAAuB,CAAC,KAAK,EAAE,CAAC;QACtC,uBAAuB,GAAG,IAAI,CAAC;IACjC,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/** robots.txt checker. Declaration only: the implementation (robots-checker.js) is not part of this listing, so notes on private members below are assumptions to confirm there. */
export declare class RobotsChecker {
|
|
2
|
+
// NOTE(review): the four private fields look like a robots.txt cache, its expiry, the crawler user-agent and a fetch timeout — confirm in the implementation.
private cache;
|
|
3
|
+
private readonly cacheTimeout;
|
|
4
|
+
private readonly userAgent;
|
|
5
|
+
private readonly requestTimeout;
|
|
6
|
+
/**
|
|
7
|
+
* Check if a URL is allowed to be crawled according to robots.txt
|
|
8
|
+
*/
|
|
9
|
+
isAllowed(url: string): Promise<{
|
|
10
|
+
allowed: boolean;
|
|
11
|
+
crawlDelay?: number;
|
|
12
|
+
sitemaps: string[];
|
|
13
|
+
reason?: string;
|
|
14
|
+
}>;
|
|
15
|
+
/**
|
|
16
|
+
* Get sitemaps listed in robots.txt for a domain
|
|
17
|
+
*/
|
|
18
|
+
getSitemaps(domain: string): Promise<string[]>;
|
|
19
|
+
/**
|
|
20
|
+
* Get the recommended crawl delay for a domain (undefined when none is available)
|
|
21
|
+
*/
|
|
22
|
+
getCrawlDelay(domain: string): Promise<number | undefined>;
|
|
23
|
+
// NOTE(review): private helpers below presumably fetch, parse and match robots.txt rules; their bodies are not visible here.
private getRobotsTxt;
|
|
24
|
+
private parseRobotsTxt;
|
|
25
|
+
private completeRule;
|
|
26
|
+
private checkRules;
|
|
27
|
+
private findBestMatchingRule;
|
|
28
|
+
private matchesPattern;
|
|
29
|
+
/** Presumably discards every cached robots.txt entry — confirm against the implementation. */
clearCache(): void;
|
|
30
|
+
/** Returns a `size` number plus one record per entry with its URL, fetch/expiry timestamps (as strings) and rule/sitemap counts. */
getCacheStats(): {
|
|
31
|
+
size: number;
|
|
32
|
+
entries: {
|
|
33
|
+
url: string;
|
|
34
|
+
fetchedAt: string;
|
|
35
|
+
expiresAt: string;
|
|
36
|
+
rulesCount: number;
|
|
37
|
+
sitemapsCount: number;
|
|
38
|
+
}[];
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
/** RobotsChecker instance exported by this module; the name suggests it is the shared, process-wide checker. */
export declare const globalRobotsChecker: RobotsChecker;
|
|
42
|
+
//# sourceMappingURL=robots-checker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots-checker.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/robots-checker.ts"],"names":[],"mappings":"AAeA,qBAAa,aAAa;IACxB,OAAO,CAAC,KAAK,CAAgC;IAC7C,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAuB;IACpD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAqB;IAC/C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAQ;IAEvC;;OAEG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;QACpC,OAAO,EAAE,OAAO,CAAC;QACjB,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,QAAQ,EAAE,MAAM,EAAE,CAAC;QACnB,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB,CAAC;IAkCF;;OAEG;IACG,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAWpD;;OAEG;IACG,aAAa,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC;YAiBlD,YAAY;IAiD1B,OAAO,CAAC,cAAc;IA+EtB,OAAO,CAAC,YAAY;IAUpB,OAAO,CAAC,UAAU;IAkDlB,OAAO,CAAC,oBAAoB;IAW5B,OAAO,CAAC,cAAc;IA2BtB,UAAU;IAKV,aAAa;;;;;;;;;;CAYd;AAGD,eAAO,MAAM,mBAAmB,eAAsB,CAAC"}
|