@tyroneross/blog-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,949 @@
1
+ import { z } from 'zod';
2
+
3
+ /**
4
+ * @package @tyroneross/scraper-testing
5
+ * Core types for web scraper testing
6
+ */
7
+ interface ScrapedArticle {
8
+ url: string;
9
+ title: string;
10
+ publishedDate?: Date | string;
11
+ description?: string;
12
+ fullContent?: string;
13
+ fullContentMarkdown?: string;
14
+ fullContentText?: string;
15
+ confidence: number;
16
+ source: 'link-text' | 'meta-data' | 'structured-data';
17
+ qualityScore?: number;
18
+ metadata?: Record<string, any>;
19
+ }
20
+ interface ScraperTestResult {
21
+ url: string;
22
+ detectedType: 'rss' | 'sitemap' | 'html' | 'unknown';
23
+ confidence: 'high' | 'medium' | 'low';
24
+ articles: ScrapedArticle[];
25
+ extractionStats: {
26
+ attempted: number;
27
+ successful: number;
28
+ failed: number;
29
+ filtered: number;
30
+ totalDiscovered?: number;
31
+ afterDenyFilter?: number;
32
+ afterContentValidation?: number;
33
+ afterQualityFilter?: number;
34
+ };
35
+ processingTime: number;
36
+ errors: string[];
37
+ timestamp: string;
38
+ }
39
+ interface ScraperTestRequest {
40
+ url: string;
41
+ sourceType?: 'auto' | 'rss' | 'sitemap' | 'html';
42
+ maxArticles?: number;
43
+ extractFullContent?: boolean;
44
+ denyPaths?: string[];
45
+ qualityThreshold?: number;
46
+ }
47
+ interface ScraperTestProps {
48
+ onTestComplete?: (result: ScraperTestResult) => void;
49
+ onTestStart?: (url: string) => void;
50
+ onError?: (error: Error) => void;
51
+ className?: string;
52
+ defaultUrl?: string;
53
+ plugins?: ScraperPlugin[];
54
+ }
55
+ interface ScraperResultsProps {
56
+ result: ScraperTestResult | null;
57
+ loading?: boolean;
58
+ error?: string | null;
59
+ className?: string;
60
+ }
61
+ /**
62
+ * Plugin system for extending scraper functionality
63
+ * Allows users to add their own LLM-based enhancements
64
+ */
65
+ interface ScraperPlugin {
66
+ name: string;
67
+ version: string;
68
+ /**
69
+ * Called before scraping starts
70
+ * Useful for validation, rate limiting, or pre-processing
71
+ */
72
+ beforeScrape?: (url: string) => Promise<void>;
73
+ /**
74
+ * Called after all articles are scraped
75
+ * Useful for batch processing or re-ranking
76
+ */
77
+ afterScrape?: (articles: ScrapedArticle[]) => Promise<ScrapedArticle[]>;
78
+ /**
79
+ * Called for each article individually
80
+ * Useful for adding AI-based quality scores or classifications
81
+ */
82
+ enhanceArticle?: (article: ScrapedArticle) => Promise<ScrapedArticle>;
83
+ /**
84
+ * Called to determine if an article should be filtered out
85
+ * Return true to keep the article, false to filter it out
86
+ */
87
+ filterArticle?: (article: ScrapedArticle) => Promise<boolean>;
88
+ }
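All four hooks are optional and async, so a plugin can be a single filter or a full enrichment pass. A minimal sketch of a plugin built against the interface above, assuming it is wired in through `ScraperTestProps.plugins`; the 500-character cut-off and the `reviewedBy` metadata key are invented for illustration:

```typescript
import type { ScraperPlugin, ScrapedArticle } from '@tyroneross/blog-scraper';

// Illustrative plugin: drop very short articles and tag the rest.
const minLengthPlugin: ScraperPlugin = {
  name: 'min-length-filter', // plugin names are free-form strings
  version: '1.0.0',
  async filterArticle(article: ScrapedArticle): Promise<boolean> {
    // Return true to keep the article, false to filter it out.
    return (article.fullContentText?.length ?? 0) >= 500;
  },
  async enhanceArticle(article: ScrapedArticle): Promise<ScrapedArticle> {
    // Attach extra metadata; `reviewedBy` is a made-up key for this sketch.
    return { ...article, metadata: { ...article.metadata, reviewedBy: 'min-length-filter' } };
  },
};

export default minLengthPlugin;
```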
89
+ /**
90
+ * Quality scoring configuration
91
+ */
92
+ interface QualityScoreConfig {
93
+ contentWeight?: number;
94
+ dateWeight?: number;
95
+ authorWeight?: number;
96
+ schemaWeight?: number;
97
+ readingTimeWeight?: number;
98
+ threshold?: number;
99
+ }
100
+ /**
101
+ * Content validation result
102
+ */
103
+ interface ContentValidation {
104
+ isValid: boolean;
105
+ score: number;
106
+ reasons: string[];
107
+ }
108
+ /**
109
+ * Extracted content structure
110
+ */
111
+ interface ExtractedContent$1 {
112
+ title?: string;
113
+ byline?: string;
114
+ content?: string;
115
+ textContent?: string;
116
+ length?: number;
117
+ excerpt?: string;
118
+ siteName?: string;
119
+ publishedTime?: Date | string;
120
+ lang?: string;
121
+ readingTime?: number;
122
+ structured?: {
123
+ jsonLd?: any;
124
+ openGraph?: Record<string, string>;
125
+ twitter?: Record<string, string>;
126
+ };
127
+ }
128
+
129
+ /**
130
+ * @package @tyroneross/blog-scraper
131
+ * High-level API for easy scraping
132
+ */
133
+
134
+ /**
135
+ * Options for the scrape function
136
+ */
137
+ interface ScrapeOptions {
138
+ /** Source type detection mode (default: 'auto') */
139
+ sourceType?: 'auto' | 'rss' | 'sitemap' | 'html';
140
+ /** Maximum number of articles to return (default: 50) */
141
+ maxArticles?: number;
142
+ /** Extract full article content (default: true) */
143
+ extractFullContent?: boolean;
144
+ /** URL patterns to exclude (default: common non-article paths) */
145
+ denyPaths?: string[];
146
+ /** Minimum quality score 0-1 (default: 0.6) */
147
+ qualityThreshold?: number;
148
+ }
149
+ /**
150
+ * Main scraping function - simple interface for extracting articles
151
+ *
152
+ * @example
153
+ * ```typescript
154
+ * import { scrape } from '@tyroneross/blog-scraper';
155
+ *
156
+ * const result = await scrape('https://example.com/blog');
157
+ * console.log(`Found ${result.articles.length} articles`);
158
+ * ```
159
+ *
160
+ * @param url - URL to scrape (RSS feed, sitemap, or HTML page)
161
+ * @param options - Optional scraping configuration
162
+ * @returns Promise with scraping results
163
+ */
164
+ declare function scrape(url: string, options?: ScrapeOptions): Promise<ScraperTestResult>;
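The JSDoc above shows the zero-config call; every `ScrapeOptions` field is optional. A usage sketch based on the declared options and their documented defaults (the URL and paths are placeholders):

```typescript
import { scrape } from '@tyroneross/blog-scraper';

const result = await scrape('https://example.com/blog', {
  sourceType: 'auto',        // let the library detect RSS, sitemap, or HTML
  maxArticles: 20,           // cap the result set (default is documented as 50)
  extractFullContent: true,  // pull article bodies, not just links
  denyPaths: ['/tag/*', '/category/*'],
  qualityThreshold: 0.7,     // stricter than the documented 0.6 default
});

console.log(result.detectedType, result.extractionStats);
for (const article of result.articles) {
  console.log(article.title, article.qualityScore);
}
```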
165
+ /**
166
+ * Quick scrape - returns just the article URLs (fast)
167
+ *
168
+ * @example
169
+ * ```typescript
170
+ * const urls = await quickScrape('https://example.com/blog');
171
+ * console.log(urls); // ['url1', 'url2', ...]
172
+ * ```
173
+ */
174
+ declare function quickScrape(url: string): Promise<string[]>;
175
+
176
+ interface DiscoveredFeed {
177
+ url: string;
178
+ title?: string;
179
+ type: 'rss' | 'atom' | 'rdf';
180
+ source: 'link-tag' | 'common-path' | 'content-scan';
181
+ confidence: number;
182
+ }
183
+ declare class RSSDiscovery {
184
+ private readonly userAgent;
185
+ private readonly timeout;
186
+ /**
187
+ * Discover RSS feeds from a given URL
188
+ */
189
+ discoverFeeds(url: string): Promise<DiscoveredFeed[]>;
190
+ /**
191
+ * Check if the URL itself is a direct feed
192
+ */
193
+ private checkDirectFeed;
194
+ /**
195
+ * Fetch HTML page content
196
+ */
197
+ private fetchPage;
198
+ /**
199
+ * Extract feed URLs from HTML link tags
200
+ */
201
+ private extractFeedsFromHTML;
202
+ /**
203
+ * Check common feed paths
204
+ */
205
+ private checkCommonPaths;
206
+ /**
207
+ * Scan HTML content for feed-like patterns
208
+ */
209
+ private scanForFeedContent;
210
+ /**
211
+ * Validate if a URL is actually a feed
212
+ */
213
+ private validateFeedUrl;
214
+ /**
215
+ * Resolve relative URLs to absolute URLs
216
+ */
217
+ private resolveUrl;
218
+ /**
219
+ * Check if content type indicates a feed
220
+ */
221
+ private isFeedContentType;
222
+ /**
223
+ * Determine feed type from content type
224
+ */
225
+ private determineFeedType;
226
+ /**
227
+ * Guess feed type from URL or text
228
+ */
229
+ private guessFeedType;
230
+ /**
231
+ * Check if a link looks like it could be a feed
232
+ */
233
+ private isFeedLikeLink;
234
+ }
235
+ declare const globalRSSDiscovery: RSSDiscovery;
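`discoverFeeds` is the only public method on `RSSDiscovery`; the rest are private helpers. A sketch using the shared `globalRSSDiscovery` instance and picking the highest-confidence candidate (the 0.5 cut-off is arbitrary):

```typescript
import { globalRSSDiscovery, type DiscoveredFeed } from '@tyroneross/blog-scraper';

const feeds: DiscoveredFeed[] = await globalRSSDiscovery.discoverFeeds('https://example.com');

// Take the highest-confidence candidate; 0.5 is an arbitrary threshold for this sketch.
const best = [...feeds].sort((a, b) => b.confidence - a.confidence)[0];
if (best && best.confidence >= 0.5) {
  console.log(`Using ${best.type} feed: ${best.url} (found via ${best.source})`);
}
```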
236
+
237
+ declare const CandidateArticleSchema: z.ZodObject<{
238
+ url: z.ZodString;
239
+ title: z.ZodString;
240
+ publishedAt: z.ZodDate;
241
+ content: z.ZodOptional<z.ZodString>;
242
+ excerpt: z.ZodOptional<z.ZodString>;
243
+ guid: z.ZodString;
244
+ confidence: z.ZodNumber;
245
+ source: z.ZodEnum<["rss", "sitemap", "html", "discovery"]>;
246
+ extractionMethod: z.ZodEnum<["rss", "sitemap", "html-links", "content-extraction"]>;
247
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
248
+ }, "strip", z.ZodTypeAny, {
249
+ url: string;
250
+ source: "sitemap" | "rss" | "html" | "discovery";
251
+ confidence: number;
252
+ title: string;
253
+ extractionMethod: "sitemap" | "rss" | "html-links" | "content-extraction";
254
+ publishedAt: Date;
255
+ guid: string;
256
+ content?: string | undefined;
257
+ excerpt?: string | undefined;
258
+ metadata?: Record<string, any> | undefined;
259
+ }, {
260
+ url: string;
261
+ source: "sitemap" | "rss" | "html" | "discovery";
262
+ confidence: number;
263
+ title: string;
264
+ extractionMethod: "sitemap" | "rss" | "html-links" | "content-extraction";
265
+ publishedAt: Date;
266
+ guid: string;
267
+ content?: string | undefined;
268
+ excerpt?: string | undefined;
269
+ metadata?: Record<string, any> | undefined;
270
+ }>;
271
+ type CandidateArticle = z.infer<typeof CandidateArticleSchema>;
272
+ declare const SourceConfigSchema: z.ZodObject<{
273
+ sourceType: z.ZodEnum<["rss", "sitemap", "html", "auto"]>;
274
+ allowPaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
275
+ denyPaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
276
+ maxDepth: z.ZodOptional<z.ZodNumber>;
277
+ detectOnly: z.ZodOptional<z.ZodBoolean>;
278
+ scrapeConfig: z.ZodOptional<z.ZodObject<{
279
+ selectors: z.ZodOptional<z.ZodObject<{
280
+ articleLinks: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
281
+ titleSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
282
+ dateSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
283
+ excludeSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
284
+ }, "strip", z.ZodTypeAny, {
285
+ excludeSelectors?: string[] | undefined;
286
+ articleLinks?: string[] | undefined;
287
+ titleSelectors?: string[] | undefined;
288
+ dateSelectors?: string[] | undefined;
289
+ }, {
290
+ excludeSelectors?: string[] | undefined;
291
+ articleLinks?: string[] | undefined;
292
+ titleSelectors?: string[] | undefined;
293
+ dateSelectors?: string[] | undefined;
294
+ }>>;
295
+ filters: z.ZodOptional<z.ZodObject<{
296
+ minTitleLength: z.ZodOptional<z.ZodNumber>;
297
+ maxTitleLength: z.ZodOptional<z.ZodNumber>;
298
+ includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
299
+ excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
300
+ }, "strip", z.ZodTypeAny, {
301
+ minTitleLength?: number | undefined;
302
+ maxTitleLength?: number | undefined;
303
+ excludePatterns?: string[] | undefined;
304
+ includePatterns?: string[] | undefined;
305
+ }, {
306
+ minTitleLength?: number | undefined;
307
+ maxTitleLength?: number | undefined;
308
+ excludePatterns?: string[] | undefined;
309
+ includePatterns?: string[] | undefined;
310
+ }>>;
311
+ limits: z.ZodOptional<z.ZodObject<{
312
+ maxLinksPerPage: z.ZodOptional<z.ZodNumber>;
313
+ maxPages: z.ZodOptional<z.ZodNumber>;
314
+ }, "strip", z.ZodTypeAny, {
315
+ maxLinksPerPage?: number | undefined;
316
+ maxPages?: number | undefined;
317
+ }, {
318
+ maxLinksPerPage?: number | undefined;
319
+ maxPages?: number | undefined;
320
+ }>>;
321
+ }, "strip", z.ZodTypeAny, {
322
+ filters?: {
323
+ minTitleLength?: number | undefined;
324
+ maxTitleLength?: number | undefined;
325
+ excludePatterns?: string[] | undefined;
326
+ includePatterns?: string[] | undefined;
327
+ } | undefined;
328
+ selectors?: {
329
+ excludeSelectors?: string[] | undefined;
330
+ articleLinks?: string[] | undefined;
331
+ titleSelectors?: string[] | undefined;
332
+ dateSelectors?: string[] | undefined;
333
+ } | undefined;
334
+ limits?: {
335
+ maxLinksPerPage?: number | undefined;
336
+ maxPages?: number | undefined;
337
+ } | undefined;
338
+ }, {
339
+ filters?: {
340
+ minTitleLength?: number | undefined;
341
+ maxTitleLength?: number | undefined;
342
+ excludePatterns?: string[] | undefined;
343
+ includePatterns?: string[] | undefined;
344
+ } | undefined;
345
+ selectors?: {
346
+ excludeSelectors?: string[] | undefined;
347
+ articleLinks?: string[] | undefined;
348
+ titleSelectors?: string[] | undefined;
349
+ dateSelectors?: string[] | undefined;
350
+ } | undefined;
351
+ limits?: {
352
+ maxLinksPerPage?: number | undefined;
353
+ maxPages?: number | undefined;
354
+ } | undefined;
355
+ }>>;
356
+ }, "strip", z.ZodTypeAny, {
357
+ sourceType: "sitemap" | "rss" | "html" | "auto";
358
+ maxDepth?: number | undefined;
359
+ allowPaths?: string[] | undefined;
360
+ denyPaths?: string[] | undefined;
361
+ detectOnly?: boolean | undefined;
362
+ scrapeConfig?: {
363
+ filters?: {
364
+ minTitleLength?: number | undefined;
365
+ maxTitleLength?: number | undefined;
366
+ excludePatterns?: string[] | undefined;
367
+ includePatterns?: string[] | undefined;
368
+ } | undefined;
369
+ selectors?: {
370
+ excludeSelectors?: string[] | undefined;
371
+ articleLinks?: string[] | undefined;
372
+ titleSelectors?: string[] | undefined;
373
+ dateSelectors?: string[] | undefined;
374
+ } | undefined;
375
+ limits?: {
376
+ maxLinksPerPage?: number | undefined;
377
+ maxPages?: number | undefined;
378
+ } | undefined;
379
+ } | undefined;
380
+ }, {
381
+ sourceType: "sitemap" | "rss" | "html" | "auto";
382
+ maxDepth?: number | undefined;
383
+ allowPaths?: string[] | undefined;
384
+ denyPaths?: string[] | undefined;
385
+ detectOnly?: boolean | undefined;
386
+ scrapeConfig?: {
387
+ filters?: {
388
+ minTitleLength?: number | undefined;
389
+ maxTitleLength?: number | undefined;
390
+ excludePatterns?: string[] | undefined;
391
+ includePatterns?: string[] | undefined;
392
+ } | undefined;
393
+ selectors?: {
394
+ excludeSelectors?: string[] | undefined;
395
+ articleLinks?: string[] | undefined;
396
+ titleSelectors?: string[] | undefined;
397
+ dateSelectors?: string[] | undefined;
398
+ } | undefined;
399
+ limits?: {
400
+ maxLinksPerPage?: number | undefined;
401
+ maxPages?: number | undefined;
402
+ } | undefined;
403
+ } | undefined;
404
+ }>;
405
+ type SourceConfig = z.infer<typeof SourceConfigSchema> & {
406
+ circuitBreaker?: {
407
+ execute<T>(operation: () => Promise<T>): Promise<T>;
408
+ };
409
+ };
410
+ interface OrchestrationResult {
411
+ articles: CandidateArticle[];
412
+ sourceInfo: {
413
+ detectedType: 'rss' | 'sitemap' | 'html';
414
+ discoveredFeeds?: DiscoveredFeed[];
415
+ discoveredSitemaps?: string[];
416
+ extractionStats: {
417
+ attempted: number;
418
+ successful: number;
419
+ failed: number;
420
+ filtered: number;
421
+ };
422
+ };
423
+ processingTime: number;
424
+ errors: string[];
425
+ }
426
+ declare class SourceOrchestrator {
427
+ private readonly maxArticlesPerSource;
428
+ /**
429
+ * Main orchestration method - determines source type and extracts content
430
+ */
431
+ processSource(url: string, config?: SourceConfig): Promise<OrchestrationResult>;
432
+ /**
433
+ * Auto-detect source type and process accordingly
434
+ */
435
+ private autoDetectAndProcess;
436
+ /**
437
+ * Process source with known type
438
+ */
439
+ private processKnownType;
440
+ /**
441
+ * Process URL as RSS feed
442
+ */
443
+ private processAsRSS;
444
+ /**
445
+ * Process URL as sitemap
446
+ */
447
+ private processAsSitemap;
448
+ /**
449
+ * Process URL as HTML page
450
+ */
451
+ private processAsHTML;
452
+ /**
453
+ * Apply path filtering based on allowPaths and denyPaths
454
+ */
455
+ private applyPathFilters;
456
+ /**
457
+ * Check if a path matches a pattern (supports wildcards)
458
+ */
459
+ private matchesPattern;
460
+ /**
461
+ * Build scraping configuration from source config
462
+ */
463
+ private buildScrapingConfig;
464
+ /**
465
+ * Extract title from URL as fallback
466
+ */
467
+ private extractTitleFromUrl;
468
+ /**
469
+ * Create a consistent GUID for an article
470
+ */
471
+ private createGuid;
472
+ /**
473
+ * Finalize processing result
474
+ */
475
+ private finalizeResult;
476
+ /**
477
+ * Extract full content for articles (optional enhancement step)
478
+ */
479
+ enhanceWithFullContent(articles: CandidateArticle[], maxArticles?: number): Promise<CandidateArticle[]>;
480
+ /**
481
+ * Validate orchestrator configuration
482
+ */
483
+ static validateConfig(config: any): SourceConfig;
484
+ /**
485
+ * Get source statistics
486
+ */
487
+ getSourceStats(url: string): Promise<{
488
+ robotsCompliant: boolean;
489
+ hasRSSFeed: boolean;
490
+ hasSitemap: boolean;
491
+ detectedType: string;
492
+ estimatedArticleCount: number;
493
+ }>;
494
+ }
495
+ declare const globalSourceOrchestrator: SourceOrchestrator;
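`processSource` accepts an optional `SourceConfig`, and the static `validateConfig` turns untrusted input into that shape. A sketch of the end-to-end flow with placeholder URL, paths, and limits:

```typescript
import { SourceOrchestrator, globalSourceOrchestrator, type SourceConfig } from '@tyroneross/blog-scraper';

// Validate a hand-written config first; validateConfig is declared as static.
const config: SourceConfig = SourceOrchestrator.validateConfig({
  sourceType: 'html',
  allowPaths: ['/blog/*'],
  denyPaths: ['/blog/tag/*'],
  scrapeConfig: {
    filters: { minTitleLength: 10 },
    limits: { maxLinksPerPage: 100 },
  },
});

const result = await globalSourceOrchestrator.processSource('https://example.com', config);
console.log(result.sourceInfo.detectedType, result.sourceInfo.extractionStats);

// Optionally fetch full bodies for the first few candidates.
const enriched = await globalSourceOrchestrator.enhanceWithFullContent(result.articles, 5);
console.log(enriched.length, 'articles enriched');
```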
496
+
497
+ interface ExtractedContent {
498
+ url: string;
499
+ title: string;
500
+ content: string;
501
+ textContent: string;
502
+ excerpt?: string;
503
+ byline?: string;
504
+ publishedTime?: Date;
505
+ siteName?: string;
506
+ lang?: string;
507
+ structured?: {
508
+ jsonLd?: any;
509
+ openGraph?: Record<string, string>;
510
+ twitterCard?: Record<string, string>;
511
+ microdata?: any[];
512
+ };
513
+ wordCount: number;
514
+ readingTime: number;
515
+ confidence: number;
516
+ extractionMethod: 'readability' | 'fallback' | 'structured';
517
+ extractedAt: Date;
518
+ errors?: string[];
519
+ }
520
+ declare class ContentExtractor {
521
+ private readonly userAgent;
522
+ private readonly timeout;
523
+ private readonly maxContentSize;
524
+ private readonly minContentLength;
525
+ private readonly wordsPerMinute;
526
+ private readonly ssrfProtection;
527
+ constructor();
528
+ /**
529
+ * Extract content from a URL
530
+ */
531
+ extractContent(url: string): Promise<ExtractedContent | null>;
532
+ /**
533
+ * Extract content from multiple URLs
534
+ */
535
+ extractBatch(urls: string[]): Promise<(ExtractedContent | null)[]>;
536
+ private fetchContent;
537
+ private extractFromHTML;
538
+ private extractWithReadability;
539
+ private extractWithFallback;
540
+ private extractStructuredData;
541
+ private extractPublishedTime;
542
+ private extractSiteName;
543
+ private extractLanguage;
544
+ private countWords;
545
+ /**
546
+ * Validate extracted content quality
547
+ */
548
+ validateContent(content: ExtractedContent): {
549
+ isValid: boolean;
550
+ issues: string[];
551
+ score: number;
552
+ };
553
+ }
554
+ declare const globalContentExtractor: ContentExtractor;
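`ContentExtractor` handles single URLs or batches, and `validateContent` grades whatever came back. A sketch using the shared instance on placeholder URLs (this `ExtractedContent` is the one re-exported as `ExtractorExtractedContent`):

```typescript
import { globalContentExtractor } from '@tyroneross/blog-scraper';

const content = await globalContentExtractor.extractContent('https://example.com/blog/some-post');

if (content) {
  const check = globalContentExtractor.validateContent(content);
  console.log(content.title, `${content.wordCount} words`, `~${content.readingTime} min read`);
  console.log('valid:', check.isValid, 'score:', check.score, check.issues);
}

// Batch mode returns null entries for URLs that failed to extract.
const batch = await globalContentExtractor.extractBatch([
  'https://example.com/blog/post-1',
  'https://example.com/blog/post-2',
]);
console.log(batch.filter((c) => c !== null).length, 'of', batch.length, 'extracted');
```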
555
+
556
+ declare const PERPLEXITY_MODELS: {
557
+ readonly SONAR: "llama-3.1-sonar-small-128k-online";
558
+ readonly SONAR_PRO: "llama-3.1-sonar-large-128k-online";
559
+ };
560
+ interface ScrapingConfig {
561
+ selectors?: {
562
+ articleLinks?: string[];
563
+ titleSelectors?: string[];
564
+ dateSelectors?: string[];
565
+ excludeSelectors?: string[];
566
+ };
567
+ filters?: {
568
+ minTitleLength?: number;
569
+ maxTitleLength?: number;
570
+ includePatterns?: RegExp[];
571
+ excludePatterns?: RegExp[];
572
+ allowedDomains?: string[];
573
+ };
574
+ limits?: {
575
+ maxLinksPerPage?: number;
576
+ maxDepth?: number;
577
+ };
578
+ perplexityFallback?: {
579
+ enabled?: boolean;
580
+ model?: typeof PERPLEXITY_MODELS[keyof typeof PERPLEXITY_MODELS];
581
+ useForRobotsBlocked?: boolean;
582
+ useForParseFailed?: boolean;
583
+ searchRecency?: 'hour' | 'day' | 'week' | 'month';
584
+ };
585
+ }
586
+ interface ExtractedArticle {
587
+ url: string;
588
+ title?: string;
589
+ publishedDate?: Date;
590
+ description?: string;
591
+ confidence: number;
592
+ source: 'link-text' | 'meta-data' | 'structured-data';
593
+ }
594
+ declare class HTMLScraper {
595
+ private readonly userAgent;
596
+ private readonly timeout;
597
+ private readonly defaultConfig;
598
+ /**
599
+ * Extract article links from a webpage
600
+ */
601
+ extractArticleLinks(url: string, config?: ScrapingConfig): Promise<ExtractedArticle[]>;
602
+ /**
603
+ * Extract articles from multiple pages with pagination support
604
+ */
605
+ extractFromMultiplePages(startUrl: string, config?: ScrapingConfig, options?: {
606
+ maxPages?: number;
607
+ paginationSelector?: string;
608
+ nextPagePatterns?: RegExp[];
609
+ }): Promise<ExtractedArticle[]>;
610
+ private fetchPage;
611
+ private parseArticleLinks;
612
+ private extractArticleInfo;
613
+ private extractStructuredData;
614
+ private findNextPageUrls;
615
+ private deduplicateArticles;
616
+ private passesFilters;
617
+ private isLikelyArticleUrl;
618
+ private parseDate;
619
+ private resolveUrl;
620
+ private mergeConfig;
621
+ /**
622
+ * Use Perplexity API to extract articles when traditional scraping fails
623
+ * Requires PERPLEXITY_API_KEY environment variable to be set
624
+ */
625
+ private extractWithPerplexity;
626
+ }
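Where no feed or sitemap exists, `HTMLScraper` pulls article links from listing pages using the `ScrapingConfig` above. A sketch with hypothetical selectors and patterns; every selector, regex, and limit here is a per-site placeholder, not a shipped default:

```typescript
import { HTMLScraper, type ScrapingConfig } from '@tyroneross/blog-scraper';

const scraper = new HTMLScraper();

// All selectors and patterns below are illustrative and would be tuned per site.
const config: ScrapingConfig = {
  selectors: {
    articleLinks: ['article h2 a', '.post-title a'],
    excludeSelectors: ['nav', 'footer'],
  },
  filters: {
    minTitleLength: 12,
    excludePatterns: [/\/tag\//, /\/page\/\d+/],
  },
  limits: { maxLinksPerPage: 50 },
};

const articles = await scraper.extractArticleLinks('https://example.com/blog', config);

// Or walk paginated listings, up to three pages deep.
const paged = await scraper.extractFromMultiplePages('https://example.com/blog', config, {
  maxPages: 3,
});
console.log(articles.length, paged.length);
```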
627
+
628
+ interface SitemapEntry {
629
+ url: string;
630
+ lastmod?: Date;
631
+ changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
632
+ priority?: number;
633
+ images?: SitemapImage[];
634
+ news?: SitemapNews;
635
+ }
636
+ interface SitemapImage {
637
+ loc: string;
638
+ caption?: string;
639
+ title?: string;
640
+ }
641
+ interface SitemapNews {
642
+ title: string;
643
+ publishedDate?: Date;
644
+ keywords?: string[];
645
+ }
646
+ declare class SitemapParser {
647
+ private readonly userAgent;
648
+ private readonly timeout;
649
+ private readonly maxSitemapSize;
650
+ private readonly maxEntries;
651
+ private readonly recentTimeframe;
652
+ /**
653
+ * Parse sitemap from URL and return entries
654
+ */
655
+ parseSitemap(url: string, options?: {
656
+ filterRecent?: boolean;
657
+ maxEntries?: number;
658
+ includeImages?: boolean;
659
+ includeNews?: boolean;
660
+ }): Promise<SitemapEntry[]>;
661
+ /**
662
+ * Discover sitemaps from domain
663
+ */
664
+ discoverSitemaps(domain: string): Promise<string[]>;
665
+ /**
666
+ * Get recent entries from all sitemaps for a domain
667
+ */
668
+ getRecentEntries(domain: string, options?: {
669
+ hoursBack?: number;
670
+ maxEntries?: number;
671
+ }): Promise<SitemapEntry[]>;
672
+ private fetchSitemap;
673
+ private checkSitemapExists;
674
+ private isSitemapIndex;
675
+ private parseSitemapIndex;
676
+ private parseRegularSitemap;
677
+ /**
678
+ * Validate sitemap format
679
+ */
680
+ validateSitemapFormat(xml: string): {
681
+ valid: boolean;
682
+ errors: string[];
683
+ };
684
+ }
685
+ declare const globalSitemapParser: SitemapParser;
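The parser covers sitemap indexes and regular sitemaps, and `getRecentEntries` combines discovery with a recency filter. A sketch with a placeholder domain and arbitrary limits:

```typescript
import { globalSitemapParser } from '@tyroneross/blog-scraper';

// Find sitemap URLs advertised by the domain.
const sitemaps = await globalSitemapParser.discoverSitemaps('https://example.com');
console.log('sitemaps:', sitemaps);

// Pull entries from roughly the last two days, capped at 100.
const recent = await globalSitemapParser.getRecentEntries('https://example.com', {
  hoursBack: 48,
  maxEntries: 100,
});

for (const entry of recent) {
  console.log(entry.url, entry.lastmod?.toISOString(), entry.news?.title);
}
```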
686
+
687
+ declare class RobotsChecker {
688
+ private cache;
689
+ private readonly cacheTimeout;
690
+ private readonly userAgent;
691
+ private readonly requestTimeout;
692
+ /**
693
+ * Check if a URL is allowed to be crawled according to robots.txt
694
+ */
695
+ isAllowed(url: string): Promise<{
696
+ allowed: boolean;
697
+ crawlDelay?: number;
698
+ sitemaps: string[];
699
+ reason?: string;
700
+ }>;
701
+ /**
702
+ * Get sitemaps listed in robots.txt for a domain
703
+ */
704
+ getSitemaps(domain: string): Promise<string[]>;
705
+ /**
706
+ * Get the recommended crawl delay for a domain
707
+ */
708
+ getCrawlDelay(domain: string): Promise<number | undefined>;
709
+ private getRobotsTxt;
710
+ private parseRobotsTxt;
711
+ private completeRule;
712
+ private checkRules;
713
+ private findBestMatchingRule;
714
+ private matchesPattern;
715
+ clearCache(): void;
716
+ getCacheStats(): {
717
+ size: number;
718
+ entries: {
719
+ url: string;
720
+ fetchedAt: string;
721
+ expiresAt: string;
722
+ rulesCount: number;
723
+ sitemapsCount: number;
724
+ }[];
725
+ };
726
+ }
727
+ declare const globalRobotsChecker: RobotsChecker;
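`isAllowed` bundles the robots.txt verdict with any crawl delay and sitemap hints, so a polite client can check once before fetching. A sketch using the shared instance; the delay handling assumes the conventional robots.txt interpretation of crawl delay in seconds:

```typescript
import { globalRobotsChecker } from '@tyroneross/blog-scraper';

const verdict = await globalRobotsChecker.isAllowed('https://example.com/blog/some-post');

if (!verdict.allowed) {
  console.warn('Skipping URL, robots.txt disallows it:', verdict.reason);
} else {
  if (verdict.crawlDelay) {
    // Respect the requested delay (assumed to be seconds) before fetching.
    await new Promise((resolve) => setTimeout(resolve, verdict.crawlDelay * 1000));
  }
  console.log('Sitemaps advertised in robots.txt:', verdict.sitemaps);
}
```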
728
+
729
+ /**
730
+ * @package @tyroneross/scraper-testing
731
+ * Article quality scoring system
732
+ *
733
+ * No LLM required - uses metadata and content signals to determine article quality
734
+ */
735
+
736
+ /**
737
+ * Default quality score configuration
738
+ * These weights were optimized through testing with 1,788 real articles
739
+ */
740
+ declare const DEFAULT_QUALITY_CONFIG: Required<QualityScoreConfig>;
741
+ /**
742
+ * Default patterns to block non-article pages
743
+ * These cover common non-article paths across websites
744
+ */
745
+ declare const DEFAULT_DENY_PATHS: string[];
746
+ /**
747
+ * Validate content quality (Tier 2 filtering)
748
+ * Checks length, title quality, and text-to-HTML ratio
749
+ *
750
+ * @param extracted - Extracted content from article
751
+ * @returns Validation result with score and reasons
752
+ */
753
+ declare function validateContent(extracted: ExtractedContent$1): ContentValidation;
754
+ /**
755
+ * Calculate article quality score (Tier 3 filtering)
756
+ *
757
+ * Score breakdown:
758
+ * - Content validation (60%): Length, title quality, text-to-HTML ratio
759
+ * - Publication date (12%): Articles should have timestamps
760
+ * - Author/byline (8%): Professional articles cite authors
761
+ * - Schema.org metadata (8%): Structured data indicates article pages
762
+ * - Reading time (12%): Substantial content (2+ min read)
763
+ *
764
+ * @param extracted - Extracted content from article
765
+ * @param config - Optional quality score configuration
766
+ * @returns Quality score between 0-1
767
+ */
768
+ declare function calculateArticleQualityScore(extracted: ExtractedContent$1, config?: QualityScoreConfig): number;
769
+ /**
770
+ * Check if a URL should be denied based on path patterns
771
+ *
772
+ * @param url - URL to check
773
+ * @param denyPaths - Patterns to deny (supports wildcards with *)
774
+ * @returns True if URL should be denied
775
+ */
776
+ declare function shouldDenyUrl(url: string, denyPaths?: string[]): boolean;
777
+ /**
778
+ * Get quality score breakdown for debugging
779
+ * Useful for understanding why an article scored a certain way
780
+ *
781
+ * @param extracted - Extracted content from article
782
+ * @param config - Optional quality score configuration
783
+ * @returns Breakdown of quality score components
784
+ */
785
+ declare function getQualityBreakdown(extracted: ExtractedContent$1, config?: QualityScoreConfig): {
786
+ contentValidation: number;
787
+ publishedDate: number;
788
+ author: number;
789
+ schema: number;
790
+ readingTime: number;
791
+ total: number;
792
+ passesThreshold: boolean;
793
+ };
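These functions take the lighter `ExtractedContent` shape defined earlier (exported simply as `ExtractedContent`), so they can be run without the full extractor. A sketch that checks a URL against the shipped deny patterns and scores a hand-built object; the field values are fabricated for illustration:

```typescript
import {
  calculateArticleQualityScore,
  getQualityBreakdown,
  shouldDenyUrl,
  validateContent,
  DEFAULT_DENY_PATHS,
  type ExtractedContent,
} from '@tyroneross/blog-scraper';

// Tier 1: cheap URL filtering against the shipped deny patterns.
console.log(shouldDenyUrl('https://example.com/tag/news', DEFAULT_DENY_PATHS));

// Tiers 2-3: validate and score a fabricated extraction result.
const extracted: ExtractedContent = {
  title: 'An example article title',
  byline: 'A. Writer',
  textContent: 'Lorem ipsum '.repeat(200),
  content: '<p>' + 'Lorem ipsum '.repeat(200) + '</p>',
  publishedTime: new Date('2024-01-01'),
  readingTime: 4,
};

console.log(validateContent(extracted));              // { isValid, score, reasons }
console.log(calculateArticleQualityScore(extracted)); // number between 0 and 1
console.log(getQualityBreakdown(extracted));          // per-signal contributions
```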
794
+
795
+ interface CircuitBreakerOptions {
796
+ failureThreshold: number;
797
+ timeout: number;
798
+ resetTimeout: number;
799
+ name: string;
800
+ }
801
+ declare class CircuitBreaker {
802
+ private failures;
803
+ private lastFailureTime;
804
+ private state;
805
+ private options;
806
+ constructor(options: CircuitBreakerOptions);
807
+ execute<T>(operation: () => Promise<T>): Promise<T>;
808
+ private executeWithTimeout;
809
+ private onSuccess;
810
+ private onFailure;
811
+ getState(): {
812
+ state: "CLOSED" | "OPEN" | "HALF_OPEN";
813
+ failures: number;
814
+ lastFailureTime: number;
815
+ };
816
+ }
817
+ declare const circuitBreakers: {
818
+ rss: CircuitBreaker;
819
+ scraping: CircuitBreaker;
820
+ scrapingTest: CircuitBreaker;
821
+ };
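A breaker can also be constructed directly when the three preconfigured instances in `circuitBreakers` don't fit. The sketch below assumes the timeout fields are in milliseconds (the declarations don't state units), and every value is arbitrary:

```typescript
import { CircuitBreaker, circuitBreakers } from '@tyroneross/blog-scraper';

// Custom breaker: trip after 5 failures, 10s per-call timeout, retry the circuit after 30s.
const feedBreaker = new CircuitBreaker({
  name: 'feed-fetch',
  failureThreshold: 5,
  timeout: 10_000,
  resetTimeout: 30_000,
});

const body = await feedBreaker.execute(async () => {
  const res = await fetch('https://example.com/feed.xml');
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return res.text();
});

console.log(body.length);
console.log(feedBreaker.getState());         // { state, failures, lastFailureTime }
console.log(circuitBreakers.rss.getState()); // one of the preconfigured breakers
```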
822
+
823
+ declare class ScrapingRateLimiter {
824
+ private hosts;
825
+ private readonly baseDelay;
826
+ private readonly maxBackoff;
827
+ private readonly maxConcurrent;
828
+ private activeRequests;
829
+ constructor(options?: {
830
+ requestsPerSecond?: number;
831
+ maxBackoff?: number;
832
+ maxConcurrent?: number;
833
+ });
834
+ execute<T>(url: string, operation: () => Promise<T>, options?: {
835
+ priority?: number;
836
+ maxRetries?: number;
837
+ }): Promise<T>;
838
+ private extractHost;
839
+ private enqueueRequest;
840
+ private processQueue;
841
+ private handleRequestError;
842
+ private shouldRetry;
843
+ private shouldBackoff;
844
+ private wait;
845
+ getStats(): Record<string, any>;
846
+ }
847
+ declare const globalRateLimiter: ScrapingRateLimiter;
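The private `hosts` map and `extractHost` helper suggest the limiter spaces requests per host while letting different hosts proceed independently. A sketch using the shared instance on placeholder URLs:

```typescript
import { globalRateLimiter } from '@tyroneross/blog-scraper';

const urls = [
  'https://example.com/blog/post-1',
  'https://example.com/blog/post-2',
  'https://example.com/blog/post-3',
];

// Each operation is queued per host and retried according to the limiter's options.
const pages = await Promise.all(
  urls.map((url) =>
    globalRateLimiter.execute(url, async () => {
      const res = await fetch(url);
      return res.text();
    }, { maxRetries: 2 }),
  ),
);

console.log(pages.length, 'pages fetched');
console.log(globalRateLimiter.getStats());
```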
848
+
849
+ interface RSSItem {
850
+ title: string;
851
+ link: string;
852
+ pubDate: string;
853
+ guid: string;
854
+ content?: string;
855
+ contentSnippet?: string;
856
+ }
857
+ declare function fetchRSSFeed(url: string, _sourceId?: string): Promise<RSSItem[]>;
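`fetchRSSFeed` is the low-level counterpart to `RSSDiscovery`: given a known feed URL it returns normalized items. A short sketch with a placeholder feed URL (the optional `_sourceId` argument is omitted):

```typescript
import { fetchRSSFeed } from '@tyroneross/blog-scraper';

const items = await fetchRSSFeed('https://example.com/feed.xml');

for (const item of items.slice(0, 5)) {
  console.log(new Date(item.pubDate).toDateString(), item.title, item.link);
}
```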
858
+
859
+ /**
860
+ * Convert HTML to clean Markdown
861
+ * - Preserves headings, bold, lists, links, code blocks
862
+ * - Strips navigation, forms, UI elements
863
+ * - Smart paragraph detection
864
+ */
865
+ declare function htmlToMarkdown(html: string): string;
866
+ /**
867
+ * Strip non-article content from HTML before conversion
868
+ * Removes navigation, forms, UI elements
869
+ */
870
+ declare function stripNonArticleContent(html: string): string;
871
+ /**
872
+ * Convert HTML to Markdown with full cleaning
873
+ * This is the main function developers should use
874
+ */
875
+ declare function convertToMarkdown(html: string, options?: {
876
+ cleanNonArticle?: boolean;
877
+ smartParagraphs?: boolean;
878
+ }): string;
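`convertToMarkdown` is documented above as the main entry point, with `stripNonArticleContent` and `htmlToMarkdown` usable separately when you want the steps individually. A sketch on a small inline snippet:

```typescript
import { convertToMarkdown, htmlToMarkdown, stripNonArticleContent } from '@tyroneross/blog-scraper';

const html = `
  <nav><a href="/">Home</a></nav>
  <article>
    <h1>Release notes</h1>
    <p>This release adds <strong>markdown export</strong>.</p>
    <ul><li>Faster parsing</li><li>Bug fixes</li></ul>
  </article>
  <footer>(c) Example</footer>
`;

// One-step conversion with cleaning and smart paragraphs enabled.
const markdown = convertToMarkdown(html, { cleanNonArticle: true, smartParagraphs: true });

// Roughly the same flow using the lower-level helpers.
const manual = htmlToMarkdown(stripNonArticleContent(html));

console.log(markdown);
console.log(manual);
```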
879
+
880
+ /**
881
+ * Text cleanup utilities
882
+ * Normalize whitespace, remove excessive line breaks, clean HTML entities
883
+ */
884
+ /**
885
+ * Clean text content
886
+ * - Normalize whitespace between paragraphs
887
+ * - Remove excessive line breaks
888
+ * - Decode HTML entities
889
+ * - Trim redundant spaces
890
+ */
891
+ declare function cleanText(text: string): string;
892
+ /**
893
+ * Decode HTML entities (&nbsp;, &amp;, etc.)
894
+ */
895
+ declare function decodeHTMLEntities(text: string): string;
896
+ /**
897
+ * Normalize whitespace
898
+ * - Replace multiple spaces with single space
899
+ * - Replace tabs with spaces
900
+ * - Remove trailing/leading whitespace from lines
901
+ */
902
+ declare function normalizeWhitespace(text: string): string;
903
+ /**
904
+ * Detect paragraph boundaries and add proper spacing
905
+ * Looks for sentence endings followed by capital letters
906
+ */
907
+ declare function detectParagraphs(text: string): string;
908
+ /**
909
+ * Remove URLs from text
910
+ * Useful for cleaning up citations or references
911
+ */
912
+ declare function removeUrls(text: string): string;
913
+ /**
914
+ * Truncate text to a maximum length
915
+ * Breaks at word boundaries and adds ellipsis
916
+ */
917
+ declare function truncateText(text: string, maxLength: number): string;
918
+ /**
919
+ * Extract plain text from HTML
920
+ * Quick and dirty HTML stripping
921
+ */
922
+ declare function stripHTML(html: string): string;
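The text helpers compose naturally: strip markup, normalize, then trim for display. A sketch chaining a few of them on an inline string:

```typescript
import { cleanText, decodeHTMLEntities, stripHTML, truncateText } from '@tyroneross/blog-scraper';

const raw = '<p>Ship&nbsp;early &amp; often.</p>\n\n\n<p>Iterate   quickly.</p>';

const text = cleanText(stripHTML(raw)); // strip tags, then normalize whitespace and entities
const teaser = truncateText(text, 24);  // word-boundary truncation with ellipsis

console.log(decodeHTMLEntities('&amp;')); // "&"
console.log(text);
console.log(teaser);
```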
923
+
924
+ /**
925
+ * @tyroneross/blog-scraper
926
+ *
927
+ * A powerful web scraping SDK for extracting blog articles and content.
928
+ * No LLM required - uses Mozilla Readability and intelligent quality scoring.
929
+ *
930
+ * @example Simple usage
931
+ * ```typescript
932
+ * import { scrape } from '@tyroneross/blog-scraper';
933
+ *
934
+ * const result = await scrape('https://example.com/blog');
935
+ * console.log(`Found ${result.articles.length} articles`);
936
+ * ```
937
+ *
938
+ * @example Advanced usage with custom components
939
+ * ```typescript
940
+ * import { ContentExtractor } from '@tyroneross/blog-scraper';
941
+ *
942
+ * const extractor = new ContentExtractor();
943
+ * const content = await extractor.extractContent(url);
944
+ * ```
945
+ */
946
+
947
+ declare const VERSION = "0.1.0";
948
+
949
+ export { type CandidateArticle, CircuitBreaker, ContentExtractor, type ContentValidation, DEFAULT_DENY_PATHS, DEFAULT_QUALITY_CONFIG, type DiscoveredFeed, type ExtractedArticle, type ExtractedContent$1 as ExtractedContent, type ExtractedContent as ExtractorExtractedContent, HTMLScraper, type OrchestrationResult, type QualityScoreConfig, RSSDiscovery, type RSSItem, RobotsChecker, type ScrapeOptions, type ScrapedArticle, type ScraperPlugin, type ScraperResultsProps, type ScraperTestProps, type ScraperTestRequest, type ScraperTestResult, type ScrapingConfig, ScrapingRateLimiter, type SitemapEntry, SitemapParser, type SourceConfig, SourceOrchestrator, VERSION, calculateArticleQualityScore, circuitBreakers, cleanText, convertToMarkdown, decodeHTMLEntities, detectParagraphs, fetchRSSFeed, getQualityBreakdown, globalContentExtractor, globalRSSDiscovery, globalRateLimiter, globalRobotsChecker, globalSitemapParser, globalSourceOrchestrator, htmlToMarkdown, normalizeWhitespace, quickScrape, removeUrls, scrape, shouldDenyUrl, stripHTML, stripNonArticleContent, truncateText, validateContent };