@tyroneross/blog-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +371 -0
- package/dist/index.d.mts +949 -0
- package/dist/index.d.ts +949 -0
- package/dist/index.js +3236 -0
- package/dist/index.mjs +3165 -0
- package/package.json +69 -0
package/dist/index.d.mts
ADDED
@@ -0,0 +1,949 @@
import { z } from 'zod';

/**
 * @package @tyroneross/scraper-testing
 * Core types for web scraper testing
 */
interface ScrapedArticle {
    url: string;
    title: string;
    publishedDate?: Date | string;
    description?: string;
    fullContent?: string;
    fullContentMarkdown?: string;
    fullContentText?: string;
    confidence: number;
    source: 'link-text' | 'meta-data' | 'structured-data';
    qualityScore?: number;
    metadata?: Record<string, any>;
}
interface ScraperTestResult {
    url: string;
    detectedType: 'rss' | 'sitemap' | 'html' | 'unknown';
    confidence: 'high' | 'medium' | 'low';
    articles: ScrapedArticle[];
    extractionStats: {
        attempted: number;
        successful: number;
        failed: number;
        filtered: number;
        totalDiscovered?: number;
        afterDenyFilter?: number;
        afterContentValidation?: number;
        afterQualityFilter?: number;
    };
    processingTime: number;
    errors: string[];
    timestamp: string;
}
interface ScraperTestRequest {
    url: string;
    sourceType?: 'auto' | 'rss' | 'sitemap' | 'html';
    maxArticles?: number;
    extractFullContent?: boolean;
    denyPaths?: string[];
    qualityThreshold?: number;
}
interface ScraperTestProps {
    onTestComplete?: (result: ScraperTestResult) => void;
    onTestStart?: (url: string) => void;
    onError?: (error: Error) => void;
    className?: string;
    defaultUrl?: string;
    plugins?: ScraperPlugin[];
}
interface ScraperResultsProps {
    result: ScraperTestResult | null;
    loading?: boolean;
    error?: string | null;
    className?: string;
}
/**
 * Plugin system for extending scraper functionality
 * Allows users to add their own LLM-based enhancements
 */
interface ScraperPlugin {
    name: string;
    version: string;
    /**
     * Called before scraping starts
     * Useful for validation, rate limiting, or pre-processing
     */
    beforeScrape?: (url: string) => Promise<void>;
    /**
     * Called after all articles are scraped
     * Useful for batch processing or re-ranking
     */
    afterScrape?: (articles: ScrapedArticle[]) => Promise<ScrapedArticle[]>;
    /**
     * Called for each article individually
     * Useful for adding AI-based quality scores or classifications
     */
    enhanceArticle?: (article: ScrapedArticle) => Promise<ScrapedArticle>;
    /**
     * Called to determine if an article should be filtered out
     * Return true to keep the article, false to filter it out
     */
    filterArticle?: (article: ScrapedArticle) => Promise<boolean>;
}
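
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// A minimal ScraperPlugin sketch using only the hooks declared above.
// The plugin name, length threshold, and metadata key are hypothetical.
import type { ScrapedArticle, ScraperPlugin } from '@tyroneross/blog-scraper';

const titleFilterPlugin: ScraperPlugin = {
    name: 'title-filter',
    version: '1.0.0',
    // Return true to keep an article: here, only titles with some substance.
    filterArticle: async (article: ScrapedArticle) => article.title.trim().length >= 10,
    // Annotate each surviving article; required fields stay untouched.
    enhanceArticle: async (article: ScrapedArticle) => ({
        ...article,
        metadata: { ...article.metadata, reviewedBy: 'title-filter' },
    }),
};
// ---- End example ----
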
/**
 * Quality scoring configuration
 */
interface QualityScoreConfig {
    contentWeight?: number;
    dateWeight?: number;
    authorWeight?: number;
    schemaWeight?: number;
    readingTimeWeight?: number;
    threshold?: number;
}
/**
 * Content validation result
 */
interface ContentValidation {
    isValid: boolean;
    score: number;
    reasons: string[];
}
/**
 * Extracted content structure
 */
interface ExtractedContent$1 {
    title?: string;
    byline?: string;
    content?: string;
    textContent?: string;
    length?: number;
    excerpt?: string;
    siteName?: string;
    publishedTime?: Date | string;
    lang?: string;
    readingTime?: number;
    structured?: {
        jsonLd?: any;
        openGraph?: Record<string, string>;
        twitter?: Record<string, string>;
    };
}

/**
 * @package @tyroneross/blog-scraper
 * High-level API for easy scraping
 */

/**
 * Options for the scrape function
 */
interface ScrapeOptions {
    /** Source type detection mode (default: 'auto') */
    sourceType?: 'auto' | 'rss' | 'sitemap' | 'html';
    /** Maximum number of articles to return (default: 50) */
    maxArticles?: number;
    /** Extract full article content (default: true) */
    extractFullContent?: boolean;
    /** URL patterns to exclude (default: common non-article paths) */
    denyPaths?: string[];
    /** Minimum quality score 0-1 (default: 0.6) */
    qualityThreshold?: number;
}
/**
 * Main scraping function - simple interface for extracting articles
 *
 * @example
 * ```typescript
 * import { scrape } from '@tyroneross/blog-scraper';
 *
 * const result = await scrape('https://example.com/blog');
 * console.log(`Found ${result.articles.length} articles`);
 * ```
 *
 * @param url - URL to scrape (RSS feed, sitemap, or HTML page)
 * @param options - Optional scraping configuration
 * @returns Promise with scraping results
 */
declare function scrape(url: string, options?: ScrapeOptions): Promise<ScraperTestResult>;
/**
 * Quick scrape - returns just the article URLs (fast)
 *
 * @example
 * ```typescript
 * const urls = await quickScrape('https://example.com/blog');
 * console.log(urls); // ['url1', 'url2', ...]
 * ```
 */
declare function quickScrape(url: string): Promise<string[]>;
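
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// A sketch of scrape() with the ScrapeOptions declared above; the URL and
// option values are placeholders.
import { quickScrape, scrape } from '@tyroneross/blog-scraper';

const result = await scrape('https://example.com/blog', {
    sourceType: 'auto',        // let the library detect RSS/sitemap/HTML
    maxArticles: 20,
    extractFullContent: true,
    qualityThreshold: 0.7,     // stricter than the documented 0.6 default
});
console.log(result.detectedType, result.extractionStats);

// When only URLs are needed, quickScrape avoids full-content extraction.
const urls = await quickScrape('https://example.com/blog');
// ---- End example ----
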

interface DiscoveredFeed {
    url: string;
    title?: string;
    type: 'rss' | 'atom' | 'rdf';
    source: 'link-tag' | 'common-path' | 'content-scan';
    confidence: number;
}
declare class RSSDiscovery {
    private readonly userAgent;
    private readonly timeout;
    /**
     * Discover RSS feeds from a given URL
     */
    discoverFeeds(url: string): Promise<DiscoveredFeed[]>;
    /**
     * Check if the URL itself is a direct feed
     */
    private checkDirectFeed;
    /**
     * Fetch HTML page content
     */
    private fetchPage;
    /**
     * Extract feed URLs from HTML link tags
     */
    private extractFeedsFromHTML;
    /**
     * Check common feed paths
     */
    private checkCommonPaths;
    /**
     * Scan HTML content for feed-like patterns
     */
    private scanForFeedContent;
    /**
     * Validate if a URL is actually a feed
     */
    private validateFeedUrl;
    /**
     * Resolve relative URLs to absolute URLs
     */
    private resolveUrl;
    /**
     * Check if content type indicates a feed
     */
    private isFeedContentType;
    /**
     * Determine feed type from content type
     */
    private determineFeedType;
    /**
     * Guess feed type from URL or text
     */
    private guessFeedType;
    /**
     * Check if a link looks like it could be a feed
     */
    private isFeedLikeLink;
}
declare const globalRSSDiscovery: RSSDiscovery;
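
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Discover candidate feeds with the shared RSSDiscovery instance and keep
// the highest-confidence one; the site URL is a placeholder.
import { globalRSSDiscovery } from '@tyroneross/blog-scraper';

const feeds = await globalRSSDiscovery.discoverFeeds('https://example.com');
const best = [...feeds].sort((a, b) => b.confidence - a.confidence)[0];
if (best) console.log(`${best.type} feed via ${best.source}: ${best.url}`);
// ---- End example ----
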

declare const CandidateArticleSchema: z.ZodObject<{
    url: z.ZodString;
    title: z.ZodString;
    publishedAt: z.ZodDate;
    content: z.ZodOptional<z.ZodString>;
    excerpt: z.ZodOptional<z.ZodString>;
    guid: z.ZodString;
    confidence: z.ZodNumber;
    source: z.ZodEnum<["rss", "sitemap", "html", "discovery"]>;
    extractionMethod: z.ZodEnum<["rss", "sitemap", "html-links", "content-extraction"]>;
    metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
}, "strip", z.ZodTypeAny, {
    url: string;
    source: "sitemap" | "rss" | "html" | "discovery";
    confidence: number;
    title: string;
    extractionMethod: "sitemap" | "rss" | "html-links" | "content-extraction";
    publishedAt: Date;
    guid: string;
    content?: string | undefined;
    excerpt?: string | undefined;
    metadata?: Record<string, any> | undefined;
}, {
    url: string;
    source: "sitemap" | "rss" | "html" | "discovery";
    confidence: number;
    title: string;
    extractionMethod: "sitemap" | "rss" | "html-links" | "content-extraction";
    publishedAt: Date;
    guid: string;
    content?: string | undefined;
    excerpt?: string | undefined;
    metadata?: Record<string, any> | undefined;
}>;
type CandidateArticle = z.infer<typeof CandidateArticleSchema>;
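
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// A well-formed CandidateArticle under the inferred schema type. Note that
// only the inferred type is re-exported below, not the schema object itself.
import type { CandidateArticle } from '@tyroneross/blog-scraper';

const candidate: CandidateArticle = {
    url: 'https://example.com/post',
    title: 'Hello world',
    publishedAt: new Date(),
    guid: 'https://example.com/post',
    confidence: 0.9,
    source: 'rss',
    extractionMethod: 'rss',
};
// ---- End example ----
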
declare const SourceConfigSchema: z.ZodObject<{
    sourceType: z.ZodEnum<["rss", "sitemap", "html", "auto"]>;
    allowPaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    denyPaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    maxDepth: z.ZodOptional<z.ZodNumber>;
    detectOnly: z.ZodOptional<z.ZodBoolean>;
    scrapeConfig: z.ZodOptional<z.ZodObject<{
        selectors: z.ZodOptional<z.ZodObject<{
            articleLinks: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
            titleSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
            dateSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
            excludeSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
        }, "strip", z.ZodTypeAny, {
            excludeSelectors?: string[] | undefined;
            articleLinks?: string[] | undefined;
            titleSelectors?: string[] | undefined;
            dateSelectors?: string[] | undefined;
        }, {
            excludeSelectors?: string[] | undefined;
            articleLinks?: string[] | undefined;
            titleSelectors?: string[] | undefined;
            dateSelectors?: string[] | undefined;
        }>>;
        filters: z.ZodOptional<z.ZodObject<{
            minTitleLength: z.ZodOptional<z.ZodNumber>;
            maxTitleLength: z.ZodOptional<z.ZodNumber>;
            includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
            excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
        }, "strip", z.ZodTypeAny, {
            minTitleLength?: number | undefined;
            maxTitleLength?: number | undefined;
            excludePatterns?: string[] | undefined;
            includePatterns?: string[] | undefined;
        }, {
            minTitleLength?: number | undefined;
            maxTitleLength?: number | undefined;
            excludePatterns?: string[] | undefined;
            includePatterns?: string[] | undefined;
        }>>;
        limits: z.ZodOptional<z.ZodObject<{
            maxLinksPerPage: z.ZodOptional<z.ZodNumber>;
            maxPages: z.ZodOptional<z.ZodNumber>;
        }, "strip", z.ZodTypeAny, {
            maxLinksPerPage?: number | undefined;
            maxPages?: number | undefined;
        }, {
            maxLinksPerPage?: number | undefined;
            maxPages?: number | undefined;
        }>>;
    }, "strip", z.ZodTypeAny, {
        filters?: {
            minTitleLength?: number | undefined;
            maxTitleLength?: number | undefined;
            excludePatterns?: string[] | undefined;
            includePatterns?: string[] | undefined;
        } | undefined;
        selectors?: {
            excludeSelectors?: string[] | undefined;
            articleLinks?: string[] | undefined;
            titleSelectors?: string[] | undefined;
            dateSelectors?: string[] | undefined;
        } | undefined;
        limits?: {
            maxLinksPerPage?: number | undefined;
            maxPages?: number | undefined;
        } | undefined;
    }, {
        filters?: {
            minTitleLength?: number | undefined;
            maxTitleLength?: number | undefined;
            excludePatterns?: string[] | undefined;
            includePatterns?: string[] | undefined;
        } | undefined;
        selectors?: {
            excludeSelectors?: string[] | undefined;
            articleLinks?: string[] | undefined;
            titleSelectors?: string[] | undefined;
            dateSelectors?: string[] | undefined;
        } | undefined;
        limits?: {
            maxLinksPerPage?: number | undefined;
            maxPages?: number | undefined;
        } | undefined;
    }>>;
}, "strip", z.ZodTypeAny, {
    sourceType: "sitemap" | "rss" | "html" | "auto";
    maxDepth?: number | undefined;
    allowPaths?: string[] | undefined;
    denyPaths?: string[] | undefined;
    detectOnly?: boolean | undefined;
    scrapeConfig?: {
        filters?: {
            minTitleLength?: number | undefined;
            maxTitleLength?: number | undefined;
            excludePatterns?: string[] | undefined;
            includePatterns?: string[] | undefined;
        } | undefined;
        selectors?: {
            excludeSelectors?: string[] | undefined;
            articleLinks?: string[] | undefined;
            titleSelectors?: string[] | undefined;
            dateSelectors?: string[] | undefined;
        } | undefined;
        limits?: {
            maxLinksPerPage?: number | undefined;
            maxPages?: number | undefined;
        } | undefined;
    } | undefined;
}, {
    sourceType: "sitemap" | "rss" | "html" | "auto";
    maxDepth?: number | undefined;
    allowPaths?: string[] | undefined;
    denyPaths?: string[] | undefined;
    detectOnly?: boolean | undefined;
    scrapeConfig?: {
        filters?: {
            minTitleLength?: number | undefined;
            maxTitleLength?: number | undefined;
            excludePatterns?: string[] | undefined;
            includePatterns?: string[] | undefined;
        } | undefined;
        selectors?: {
            excludeSelectors?: string[] | undefined;
            articleLinks?: string[] | undefined;
            titleSelectors?: string[] | undefined;
            dateSelectors?: string[] | undefined;
        } | undefined;
        limits?: {
            maxLinksPerPage?: number | undefined;
            maxPages?: number | undefined;
        } | undefined;
    } | undefined;
}>;
type SourceConfig = z.infer<typeof SourceConfigSchema> & {
    circuitBreaker?: {
        execute<T>(operation: () => Promise<T>): Promise<T>;
    };
};
interface OrchestrationResult {
    articles: CandidateArticle[];
    sourceInfo: {
        detectedType: 'rss' | 'sitemap' | 'html';
        discoveredFeeds?: DiscoveredFeed[];
        discoveredSitemaps?: string[];
        extractionStats: {
            attempted: number;
            successful: number;
            failed: number;
            filtered: number;
        };
    };
    processingTime: number;
    errors: string[];
}
declare class SourceOrchestrator {
    private readonly maxArticlesPerSource;
    /**
     * Main orchestration method - determines source type and extracts content
     */
    processSource(url: string, config?: SourceConfig): Promise<OrchestrationResult>;
    /**
     * Auto-detect source type and process accordingly
     */
    private autoDetectAndProcess;
    /**
     * Process source with known type
     */
    private processKnownType;
    /**
     * Process URL as RSS feed
     */
    private processAsRSS;
    /**
     * Process URL as sitemap
     */
    private processAsSitemap;
    /**
     * Process URL as HTML page
     */
    private processAsHTML;
    /**
     * Apply path filtering based on allowPaths and denyPaths
     */
    private applyPathFilters;
    /**
     * Check if a path matches a pattern (supports wildcards)
     */
    private matchesPattern;
    /**
     * Build scraping configuration from source config
     */
    private buildScrapingConfig;
    /**
     * Extract title from URL as fallback
     */
    private extractTitleFromUrl;
    /**
     * Create a consistent GUID for an article
     */
    private createGuid;
    /**
     * Finalize processing result
     */
    private finalizeResult;
    /**
     * Extract full content for articles (optional enhancement step)
     */
    enhanceWithFullContent(articles: CandidateArticle[], maxArticles?: number): Promise<CandidateArticle[]>;
    /**
     * Validate orchestrator configuration
     */
    static validateConfig(config: any): SourceConfig;
    /**
     * Get source statistics
     */
    getSourceStats(url: string): Promise<{
        robotsCompliant: boolean;
        hasRSSFeed: boolean;
        hasSitemap: boolean;
        detectedType: string;
        estimatedArticleCount: number;
    }>;
}
declare const globalSourceOrchestrator: SourceOrchestrator;
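
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Run a source end-to-end through the shared orchestrator; the config values
// are placeholders within the SourceConfig shape above.
import { globalSourceOrchestrator } from '@tyroneross/blog-scraper';

const run = await globalSourceOrchestrator.processSource('https://example.com', {
    sourceType: 'auto',
    denyPaths: ['/tag/*', '/about'],
});
console.log(run.sourceInfo.detectedType, run.articles.length, run.errors);
// ---- End example ----
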

interface ExtractedContent {
    url: string;
    title: string;
    content: string;
    textContent: string;
    excerpt?: string;
    byline?: string;
    publishedTime?: Date;
    siteName?: string;
    lang?: string;
    structured?: {
        jsonLd?: any;
        openGraph?: Record<string, string>;
        twitterCard?: Record<string, string>;
        microdata?: any[];
    };
    wordCount: number;
    readingTime: number;
    confidence: number;
    extractionMethod: 'readability' | 'fallback' | 'structured';
    extractedAt: Date;
    errors?: string[];
}
declare class ContentExtractor {
    private readonly userAgent;
    private readonly timeout;
    private readonly maxContentSize;
    private readonly minContentLength;
    private readonly wordsPerMinute;
    private readonly ssrfProtection;
    constructor();
    /**
     * Extract content from a URL
     */
    extractContent(url: string): Promise<ExtractedContent | null>;
    /**
     * Extract content from multiple URLs
     */
    extractBatch(urls: string[]): Promise<(ExtractedContent | null)[]>;
    private fetchContent;
    private extractFromHTML;
    private extractWithReadability;
    private extractWithFallback;
    private extractStructuredData;
    private extractPublishedTime;
    private extractSiteName;
    private extractLanguage;
    private countWords;
    /**
     * Validate extracted content quality
     */
    validateContent(content: ExtractedContent): {
        isValid: boolean;
        issues: string[];
        score: number;
    };
}
declare const globalContentExtractor: ContentExtractor;
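
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Extract one article and sanity-check it; extractContent resolves to null
// when nothing usable could be extracted.
import { globalContentExtractor } from '@tyroneross/blog-scraper';

const content = await globalContentExtractor.extractContent('https://example.com/post');
if (content) {
    const check = globalContentExtractor.validateContent(content);
    if (!check.isValid) console.warn(check.issues);
}
// ---- End example ----
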

declare const PERPLEXITY_MODELS: {
    readonly SONAR: "llama-3.1-sonar-small-128k-online";
    readonly SONAR_PRO: "llama-3.1-sonar-large-128k-online";
};
interface ScrapingConfig {
    selectors?: {
        articleLinks?: string[];
        titleSelectors?: string[];
        dateSelectors?: string[];
        excludeSelectors?: string[];
    };
    filters?: {
        minTitleLength?: number;
        maxTitleLength?: number;
        includePatterns?: RegExp[];
        excludePatterns?: RegExp[];
        allowedDomains?: string[];
    };
    limits?: {
        maxLinksPerPage?: number;
        maxDepth?: number;
    };
    perplexityFallback?: {
        enabled?: boolean;
        model?: typeof PERPLEXITY_MODELS[keyof typeof PERPLEXITY_MODELS];
        useForRobotsBlocked?: boolean;
        useForParseFailed?: boolean;
        searchRecency?: 'hour' | 'day' | 'week' | 'month';
    };
}
interface ExtractedArticle {
    url: string;
    title?: string;
    publishedDate?: Date;
    description?: string;
    confidence: number;
    source: 'link-text' | 'meta-data' | 'structured-data';
}
declare class HTMLScraper {
    private readonly userAgent;
    private readonly timeout;
    private readonly defaultConfig;
    /**
     * Extract article links from a webpage
     */
    extractArticleLinks(url: string, config?: ScrapingConfig): Promise<ExtractedArticle[]>;
    /**
     * Extract articles from multiple pages with pagination support
     */
    extractFromMultiplePages(startUrl: string, config?: ScrapingConfig, options?: {
        maxPages?: number;
        paginationSelector?: string;
        nextPagePatterns?: RegExp[];
    }): Promise<ExtractedArticle[]>;
    private fetchPage;
    private parseArticleLinks;
    private extractArticleInfo;
    private extractStructuredData;
    private findNextPageUrls;
    private deduplicateArticles;
    private passesFilters;
    private isLikelyArticleUrl;
    private parseDate;
    private resolveUrl;
    private mergeConfig;
    /**
     * Use Perplexity API to extract articles when traditional scraping fails
     * Requires PERPLEXITY_API_KEY environment variable to be set
     */
    private extractWithPerplexity;
}
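
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Scrape article links from a listing page; the CSS selectors are
// hypothetical and would be tailored to the target site.
import { HTMLScraper } from '@tyroneross/blog-scraper';

const scraper = new HTMLScraper();
const links = await scraper.extractArticleLinks('https://example.com/blog', {
    selectors: { articleLinks: ['article h2 a'] },
    filters: { minTitleLength: 10 },
    limits: { maxLinksPerPage: 50 },
});
// ---- End example ----
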

interface SitemapEntry {
    url: string;
    lastmod?: Date;
    changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
    priority?: number;
    images?: SitemapImage[];
    news?: SitemapNews;
}
interface SitemapImage {
    loc: string;
    caption?: string;
    title?: string;
}
interface SitemapNews {
    title: string;
    publishedDate?: Date;
    keywords?: string[];
}
declare class SitemapParser {
    private readonly userAgent;
    private readonly timeout;
    private readonly maxSitemapSize;
    private readonly maxEntries;
    private readonly recentTimeframe;
    /**
     * Parse sitemap from URL and return entries
     */
    parseSitemap(url: string, options?: {
        filterRecent?: boolean;
        maxEntries?: number;
        includeImages?: boolean;
        includeNews?: boolean;
    }): Promise<SitemapEntry[]>;
    /**
     * Discover sitemaps from domain
     */
    discoverSitemaps(domain: string): Promise<string[]>;
    /**
     * Get recent entries from all sitemaps for a domain
     */
    getRecentEntries(domain: string, options?: {
        hoursBack?: number;
        maxEntries?: number;
    }): Promise<SitemapEntry[]>;
    private fetchSitemap;
    private checkSitemapExists;
    private isSitemapIndex;
    private parseSitemapIndex;
    private parseRegularSitemap;
    /**
     * Validate sitemap format
     */
    validateSitemapFormat(xml: string): {
        valid: boolean;
        errors: string[];
    };
}
declare const globalSitemapParser: SitemapParser;
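
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Parse a known sitemap URL, keeping only recent entries; the URL and
// option values are placeholders.
import { globalSitemapParser } from '@tyroneross/blog-scraper';

const entries = await globalSitemapParser.parseSitemap('https://example.com/sitemap.xml', {
    filterRecent: true,
    maxEntries: 100,
});
// ---- End example ----
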

declare class RobotsChecker {
    private cache;
    private readonly cacheTimeout;
    private readonly userAgent;
    private readonly requestTimeout;
    /**
     * Check if a URL is allowed to be crawled according to robots.txt
     */
    isAllowed(url: string): Promise<{
        allowed: boolean;
        crawlDelay?: number;
        sitemaps: string[];
        reason?: string;
    }>;
    /**
     * Get sitemaps listed in robots.txt for a domain
     */
    getSitemaps(domain: string): Promise<string[]>;
    /**
     * Get the recommended crawl delay for a domain
     */
    getCrawlDelay(domain: string): Promise<number | undefined>;
    private getRobotsTxt;
    private parseRobotsTxt;
    private completeRule;
    private checkRules;
    private findBestMatchingRule;
    private matchesPattern;
    clearCache(): void;
    getCacheStats(): {
        size: number;
        entries: {
            url: string;
            fetchedAt: string;
            expiresAt: string;
            rulesCount: number;
            sitemapsCount: number;
        }[];
    };
}
declare const globalRobotsChecker: RobotsChecker;
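
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// A polite pre-flight check before fetching a page. Crawl-delay is assumed
// to follow the robots.txt convention of seconds.
import { globalRobotsChecker } from '@tyroneross/blog-scraper';

const verdict = await globalRobotsChecker.isAllowed('https://example.com/post');
const delaySeconds = verdict.crawlDelay ?? 0;
if (!verdict.allowed) {
    console.warn(`Skipping: ${verdict.reason ?? 'disallowed by robots.txt'}`);
} else if (delaySeconds > 0) {
    await new Promise((resolve) => setTimeout(resolve, delaySeconds * 1000));
}
// ---- End example ----
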

/**
 * @package @tyroneross/scraper-testing
 * Article quality scoring system
 *
 * No LLM required - uses metadata and content signals to determine article quality
 */

/**
 * Default quality score configuration
 * These weights were optimized through testing with 1,788 real articles
 */
declare const DEFAULT_QUALITY_CONFIG: Required<QualityScoreConfig>;
/**
 * Default patterns to block non-article pages
 * These cover common non-article paths across websites
 */
declare const DEFAULT_DENY_PATHS: string[];
/**
 * Validate content quality (Tier 2 filtering)
 * Checks length, title quality, and text-to-HTML ratio
 *
 * @param extracted - Extracted content from article
 * @returns Validation result with score and reasons
 */
declare function validateContent(extracted: ExtractedContent$1): ContentValidation;
/**
 * Calculate article quality score (Tier 3 filtering)
 *
 * Score breakdown:
 * - Content validation (60%): Length, title quality, text-to-HTML ratio
 * - Publication date (12%): Articles should have timestamps
 * - Author/byline (8%): Professional articles cite authors
 * - Schema.org metadata (8%): Structured data indicates article pages
 * - Reading time (12%): Substantial content (2+ min read)
 *
 * @param extracted - Extracted content from article
 * @param config - Optional quality score configuration
 * @returns Quality score between 0-1
 */
declare function calculateArticleQualityScore(extracted: ExtractedContent$1, config?: QualityScoreConfig): number;
/**
 * Check if a URL should be denied based on path patterns
 *
 * @param url - URL to check
 * @param denyPaths - Patterns to deny (supports wildcards with *)
 * @returns True if URL should be denied
 */
declare function shouldDenyUrl(url: string, denyPaths?: string[]): boolean;
/**
 * Get quality score breakdown for debugging
 * Useful for understanding why an article scored a certain way
 *
 * @param extracted - Extracted content from article
 * @param config - Optional quality score configuration
 * @returns Breakdown of quality score components
 */
declare function getQualityBreakdown(extracted: ExtractedContent$1, config?: QualityScoreConfig): {
    contentValidation: number;
    publishedDate: number;
    author: number;
    schema: number;
    readingTime: number;
    total: number;
    passesThreshold: boolean;
};
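
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// The three filtering tiers applied by hand. `extracted` stands in for output
// from an extraction step; every field of ExtractedContent is optional here.
import {
    calculateArticleQualityScore,
    getQualityBreakdown,
    shouldDenyUrl,
    validateContent,
    type ExtractedContent,
} from '@tyroneross/blog-scraper';

const extracted: ExtractedContent = { title: 'A Real Post', textContent: '...', readingTime: 3 };

if (!shouldDenyUrl('https://example.com/blog/a-real-post')) {   // Tier 1: deny paths
    const check = validateContent(extracted);                   // Tier 2: content validation
    const score = calculateArticleQualityScore(extracted);      // Tier 3: 0-1 quality score
    if (check.isValid && score < 0.6) {
        console.table(getQualityBreakdown(extracted));          // see which signal dragged it down
    }
}
// ---- End example ----
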

interface CircuitBreakerOptions {
    failureThreshold: number;
    timeout: number;
    resetTimeout: number;
    name: string;
}
declare class CircuitBreaker {
    private failures;
    private lastFailureTime;
    private state;
    private options;
    constructor(options: CircuitBreakerOptions);
    execute<T>(operation: () => Promise<T>): Promise<T>;
    private executeWithTimeout;
    private onSuccess;
    private onFailure;
    getState(): {
        state: "CLOSED" | "OPEN" | "HALF_OPEN";
        failures: number;
        lastFailureTime: number;
    };
}
declare const circuitBreakers: {
    rss: CircuitBreaker;
    scraping: CircuitBreaker;
    scrapingTest: CircuitBreaker;
};
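
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Wrap a flaky fetch in a breaker. The option values are placeholders, and
// millisecond units for timeout/resetTimeout are an assumption not pinned
// down by the declarations.
import { CircuitBreaker } from '@tyroneross/blog-scraper';

const breaker = new CircuitBreaker({
    name: 'example-host',
    failureThreshold: 5,    // open after this many failures (assumed semantics)
    timeout: 10000,         // per-operation timeout, assumed ms
    resetTimeout: 60000,    // how long to stay open before half-open, assumed ms
});
const html = await breaker.execute(() => fetch('https://example.com').then((r) => r.text()));
console.log(breaker.getState().state); // "CLOSED" | "OPEN" | "HALF_OPEN"
// ---- End example ----
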

declare class ScrapingRateLimiter {
    private hosts;
    private readonly baseDelay;
    private readonly maxBackoff;
    private readonly maxConcurrent;
    private activeRequests;
    constructor(options?: {
        requestsPerSecond?: number;
        maxBackoff?: number;
        maxConcurrent?: number;
    });
    execute<T>(url: string, operation: () => Promise<T>, options?: {
        priority?: number;
        maxRetries?: number;
    }): Promise<T>;
    private extractHost;
    private enqueueRequest;
    private processQueue;
    private handleRequestError;
    private shouldRetry;
    private shouldBackoff;
    private wait;
    getStats(): Record<string, any>;
}
declare const globalRateLimiter: ScrapingRateLimiter;
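
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Per-host throttling around plain fetches via the shared limiter; the retry
// count and URLs are placeholders.
import { globalRateLimiter } from '@tyroneross/blog-scraper';

const pages = await Promise.all(
    ['https://example.com/a', 'https://example.com/b'].map((url) =>
        globalRateLimiter.execute(url, () => fetch(url).then((r) => r.text()), {
            maxRetries: 2,
        }),
    ),
);
// ---- End example ----
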

interface RSSItem {
    title: string;
    link: string;
    pubDate: string;
    guid: string;
    content?: string;
    contentSnippet?: string;
}
declare function fetchRSSFeed(url: string, _sourceId?: string): Promise<RSSItem[]>;
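
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// Fetch a feed directly when its URL is already known.
import { fetchRSSFeed } from '@tyroneross/blog-scraper';

const items = await fetchRSSFeed('https://example.com/feed.xml');
for (const item of items.slice(0, 5)) console.log(item.pubDate, item.title);
// ---- End example ----
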

/**
 * Convert HTML to clean Markdown
 * - Preserves headings, bold, lists, links, code blocks
 * - Strips navigation, forms, UI elements
 * - Smart paragraph detection
 */
declare function htmlToMarkdown(html: string): string;
/**
 * Strip non-article content from HTML before conversion
 * Removes navigation, forms, UI elements
 */
declare function stripNonArticleContent(html: string): string;
/**
 * Convert HTML to Markdown with full cleaning
 * This is the main function developers should use
 */
declare function convertToMarkdown(html: string, options?: {
    cleanNonArticle?: boolean;
    smartParagraphs?: boolean;
}): string;
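
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// convertToMarkdown bundles stripping and conversion; composing the two
// lower-level helpers should be roughly equivalent, though the declarations
// alone do not guarantee identical output.
import { convertToMarkdown, htmlToMarkdown, stripNonArticleContent } from '@tyroneross/blog-scraper';

const html = '<nav>menu</nav><article><h1>Title</h1><p>Body</p></article>';
const markdown = convertToMarkdown(html, { cleanNonArticle: true, smartParagraphs: true });
const manual = htmlToMarkdown(stripNonArticleContent(html));
// ---- End example ----
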

/**
 * Text cleanup utilities
 * Normalize whitespace, remove excessive line breaks, clean HTML entities
 */
/**
 * Clean text content
 * - Normalize whitespace between paragraphs
 * - Remove excessive line breaks
 * - Decode HTML entities
 * - Trim redundant spaces
 */
declare function cleanText(text: string): string;
/**
 * Decode HTML entities (&nbsp;, &amp;, etc.)
 */
declare function decodeHTMLEntities(text: string): string;
/**
 * Normalize whitespace
 * - Replace multiple spaces with single space
 * - Replace tabs with spaces
 * - Remove trailing/leading whitespace from lines
 */
declare function normalizeWhitespace(text: string): string;
/**
 * Detect paragraph boundaries and add proper spacing
 * Looks for sentence endings followed by capital letters
 */
declare function detectParagraphs(text: string): string;
/**
 * Remove URLs from text
 * Useful for cleaning up citations or references
 */
declare function removeUrls(text: string): string;
/**
 * Truncate text to a maximum length
 * Breaks at word boundaries and adds ellipsis
 */
declare function truncateText(text: string, maxLength: number): string;
/**
 * Extract plain text from HTML
 * Quick and dirty HTML stripping
 */
declare function stripHTML(html: string): string;
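
// ---- Example (editor's illustration; not part of the shipped declarations) ----
// A typical cleanup chain over scraped text: strip tags, normalize and
// decode entities, then cap the length at a word boundary.
import { cleanText, stripHTML, truncateText } from '@tyroneross/blog-scraper';

const raw = '<p>Hello&nbsp;&amp; welcome   to the\n\n\nblog</p>';
const summary = truncateText(cleanText(stripHTML(raw)), 80);
// ---- End example ----
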

/**
 * @tyroneross/blog-scraper
 *
 * A powerful web scraping SDK for extracting blog articles and content.
 * No LLM required - uses Mozilla Readability and intelligent quality scoring.
 *
 * @example Simple usage
 * ```typescript
 * import { scrape } from '@tyroneross/blog-scraper';
 *
 * const result = await scrape('https://example.com/blog');
 * console.log(`Found ${result.articles.length} articles`);
 * ```
 *
 * @example Advanced usage with custom components
 * ```typescript
 * import { ContentExtractor } from '@tyroneross/blog-scraper';
 *
 * const extractor = new ContentExtractor();
 * const content = await extractor.extractContent('https://example.com/post');
 * ```
 */

declare const VERSION = "0.1.0";

export { type CandidateArticle, CircuitBreaker, ContentExtractor, type ContentValidation, DEFAULT_DENY_PATHS, DEFAULT_QUALITY_CONFIG, type DiscoveredFeed, type ExtractedArticle, type ExtractedContent$1 as ExtractedContent, type ExtractedContent as ExtractorExtractedContent, HTMLScraper, type OrchestrationResult, type QualityScoreConfig, RSSDiscovery, type RSSItem, RobotsChecker, type ScrapeOptions, type ScrapedArticle, type ScraperPlugin, type ScraperResultsProps, type ScraperTestProps, type ScraperTestRequest, type ScraperTestResult, type ScrapingConfig, ScrapingRateLimiter, type SitemapEntry, SitemapParser, type SourceConfig, SourceOrchestrator, VERSION, calculateArticleQualityScore, circuitBreakers, cleanText, convertToMarkdown, decodeHTMLEntities, detectParagraphs, fetchRSSFeed, getQualityBreakdown, globalContentExtractor, globalRSSDiscovery, globalRateLimiter, globalRobotsChecker, globalSitemapParser, globalSourceOrchestrator, htmlToMarkdown, normalizeWhitespace, quickScrape, removeUrls, scrape, shouldDenyUrl, stripHTML, stripNonArticleContent, truncateText, validateContent };