@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Tyrone Ross
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md CHANGED
@@ -1,370 +1,345 @@
- # @tyroneross/blog-scraper
+ # Blog Content Scraper

- > Powerful web scraping SDK for extracting blog articles and content. No LLM required.
+ Intelligent web scraper for extracting blog/news content from any website. Includes both a **web UI** for testing and a **programmatic SDK** for integration.

- [![npm version](https://img.shields.io/npm/v/@tyroneross/blog-scraper.svg)](https://www.npmjs.com/package/@tyroneross/blog-scraper)
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ ## Quick Start (SDK)

- ## Features
-
- ✨ **No LLM needed** - Uses Mozilla Readability (92.2% F1 score) for content extraction
- 🎯 **3-tier filtering** - URL patterns → content validation → quality scoring
- ⚡ **Fast** - Extracts articles in 2-5 seconds
- 🔧 **Modular** - Use high-level API or individual components
- 📦 **Zero config** - Works out of the box
- 🌐 **Multi-source** - RSS feeds, sitemaps, and HTML pages
+ ```typescript
+ import { scrapeWebsite } from './lib';

- ## Installation
+ const result = await scrapeWebsite('https://techcrunch.com', {
+   maxArticles: 5,
+   extractFullContent: true
+ });

- ```bash
- npm install @tyroneross/blog-scraper
+ for (const article of result.articles) {
+   console.log(article.title, article.qualityScore);
+ }
 ```

- ## Quick Start
-
- ```typescript
- import { scrape } from '@tyroneross/blog-scraper';
+ See [SDK Documentation](#sdk-documentation) below for full API reference.

- // Simple usage - scrape a blog
- const result = await scrape('https://example.com/blog');
+ ---

- console.log(`Found ${result.articles.length} articles`);
- console.log(`Processing time: ${result.processingTime}ms`);
+ ## Web UI

- // Access articles
- result.articles.forEach(article => {
-   console.log(article.title);
-   console.log(article.url);
-   console.log(article.fullContentMarkdown); // Markdown format
-   console.log(article.qualityScore); // 0-1 quality score
- });
- ```
+ Standalone web application for testing web scraping with intelligent content filtering. Built with Next.js, Mozilla Readability, and zero LLM dependencies.

- ## API Reference
+ ## Features

- ### High-Level API (Recommended)
+ - ✅ **No configuration needed** - Works immediately
+ - 🎯 **3-tier filtering** - URL patterns → content validation → quality scoring
+ - ⚡ **Fast** - Mozilla Readability (92.2% F1 score)
+ - 📊 **Detailed stats** - See filtering pipeline in action
+ - 🎨 **Clean UI** - Built with Tailwind CSS
+ - 🚀 **Deploy anywhere** - Vercel, Netlify, Docker, etc.

- #### `scrape(url, options?)`
+ ## Quick Start

- Extract articles from a URL with automatic source detection.
+ ### Local Development

- ```typescript
- import { scrape } from '@tyroneross/blog-scraper';
-
- const result = await scrape('https://example.com', {
-   // Optional configuration
-   sourceType: 'auto', // 'auto' | 'rss' | 'sitemap' | 'html'
-   maxArticles: 50, // Maximum articles to return
-   extractFullContent: true, // Extract full article content
-   denyPaths: ['/about', '/contact'], // URL patterns to exclude
-   qualityThreshold: 0.6 // Minimum quality score (0-1)
- });
+ 1. **Install dependencies:**
+ ```bash
+ npm install
 ```

- **Returns:**
- ```typescript
- {
-   url: string;
-   detectedType: 'rss' | 'sitemap' | 'html';
-   confidence: 'high' | 'medium' | 'low';
-   articles: ScrapedArticle[];
-   extractionStats: {
-     attempted: number;
-     successful: number;
-     failed: number;
-     filtered: number;
-     totalDiscovered: number;
-     afterDenyFilter: number;
-     afterContentValidation: number;
-     afterQualityFilter: number;
-   };
-   processingTime: number;
-   errors: string[];
-   timestamp: string;
- }
+ 2. **Run dev server:**
+ ```bash
+ npm run dev
 ```

- #### `quickScrape(url)`
+ 3. **Open browser:**
+ ```
+ http://localhost:3000
+ ```

- Fast URL-only extraction (no full content).
+ ## Deployment

- ```typescript
- import { quickScrape } from '@tyroneross/blog-scraper';
+ ### Vercel (Recommended)

- const urls = await quickScrape('https://example.com/blog');
- // Returns: ['url1', 'url2', 'url3', ...]
+ 1. **Install Vercel CLI:**
+ ```bash
+ npm install -g vercel
 ```

- ### Modular API (Advanced)
-
- Use individual components for granular control.
-
- #### Content Extraction
+ 2. **Deploy:**
+ ```bash
+ vercel
+ ```

- ```typescript
- import { ContentExtractor } from '@tyroneross/blog-scraper';
+ 3. **Production deploy:**
+ ```bash
+ vercel --prod
+ ```

- const extractor = new ContentExtractor();
- const content = await extractor.extractContent('https://example.com/article');
+ ### Netlify

- console.log(content.title);
- console.log(content.textContent);
- console.log(content.wordCount);
- console.log(content.readingTime);
+ 1. **Build command:**
+ ```
+ npm run build
 ```

- #### Quality Scoring
+ 2. **Publish directory:**
+ ```
+ .next
+ ```

- ```typescript
- import { calculateArticleQualityScore, getQualityBreakdown } from '@tyroneross/blog-scraper';
-
- const score = calculateArticleQualityScore(extractedContent);
- console.log(`Quality score: ${score}`); // 0-1
-
- // Get detailed breakdown
- const breakdown = getQualityBreakdown(extractedContent);
- console.log(breakdown);
- // {
- //   contentValidation: 0.6,
- //   publishedDate: 0.12,
- //   author: 0.08,
- //   schema: 0.08,
- //   readingTime: 0.12,
- //   total: 1.0,
- //   passesThreshold: true
- // }
+ 3. **Deploy:**
+ ```bash
+ netlify deploy --prod
 ```

- #### Custom Quality Configuration
+ ### Docker
+
+ ```dockerfile
+ FROM node:18-alpine
+ WORKDIR /app
+ COPY package*.json ./
+ RUN npm install
+ COPY . .
+ RUN npm run build
+ EXPOSE 3000
+ CMD ["npm", "start"]
+ ```

- ```typescript
- import { calculateArticleQualityScore } from '@tyroneross/blog-scraper';
-
- const score = calculateArticleQualityScore(content, {
-   contentWeight: 0.8, // Increase content importance
-   dateWeight: 0.05, // Decrease date importance
-   authorWeight: 0.05,
-   schemaWeight: 0.05,
-   readingTimeWeight: 0.05,
-   threshold: 0.7 // Stricter threshold
- });
+ ```bash
+ docker build -t scraper-app .
+ docker run -p 3000:3000 scraper-app
 ```

- #### RSS Discovery
+ ## How It Works

- ```typescript
- import { RSSDiscovery } from '@tyroneross/blog-scraper';
+ ### 3-Tier Filtering System
+
+ **Tier 1: URL Deny Patterns**
+ - Blocks /, /about, /careers, /contact, /tag/*, etc.
+ - Fast, pattern-based filtering

- const discovery = new RSSDiscovery();
- const feeds = await discovery.discoverFeeds('https://example.com');
+ **Tier 2: Content Validation**
+ - Minimum 200 characters
+ - Title length 10-200 characters
+ - Text-to-HTML ratio ≥ 10%

- feeds.forEach(feed => {
-   console.log(feed.url);
-   console.log(feed.title);
-   console.log(feed.confidence); // 0-1
- });
- ```
+ **Tier 3: Metadata Scoring**
+ - Content quality: 60% weight
+ - Publication date: 12% weight
+ - Author/byline: 8% weight
+ - Schema.org metadata: 8% weight
+ - Reading time (2+ min): 12% weight
+ - **Default threshold**: 50%

- #### Sitemap Parsing
+ ### Technology Stack

- ```typescript
- import { SitemapParser } from '@tyroneross/blog-scraper';
+ - **Next.js 15** - React framework
+ - **TypeScript** - Type safety
+ - **Tailwind CSS** - Styling
+ - **Mozilla Readability** - Content extraction
+ - **JSDOM** - HTML parsing
+ - **Zod** - Schema validation
+ - **Lucide React** - Icons

- const parser = new SitemapParser();
- const entries = await parser.parseSitemap('https://example.com/sitemap.xml');
+ ## Project Structure

- entries.forEach(entry => {
-   console.log(entry.url);
-   console.log(entry.lastmod);
-   console.log(entry.priority);
- });
+ ```
+ scraper-app/
+ ├── app/
+ │   ├── api/scraper-test/    # API route
+ │   │   └── route.ts
+ │   ├── layout.tsx           # Root layout
+ │   ├── page.tsx             # Homepage
+ │   └── globals.css          # Global styles
+ ├── components/
+ │   ├── ScraperTester.tsx    # Main UI component
+ │   └── ScraperResults.tsx   # Results display
+ ├── lib/
+ │   ├── types.ts             # TypeScript types
+ │   ├── quality-scorer.ts    # Quality scoring logic
+ │   └── content-extractor.ts # Content extraction
+ ├── public/                  # Static assets
+ ├── package.json
+ ├── tsconfig.json
+ ├── tailwind.config.ts
+ └── next.config.js
 ```

- #### HTML Scraping
+ ## Environment Variables

- ```typescript
- import { HTMLScraper } from '@tyroneross/blog-scraper';
-
- const scraper = new HTMLScraper();
- const articles = await scraper.extractFromPage('https://example.com/blog', {
-   selectors: {
-     articleLinks: ['article a', '.post-link'],
-     titleSelectors: ['h1', '.post-title'],
-     dateSelectors: ['time', '.published-date']
-   },
-   filters: {
-     minTitleLength: 10,
-     maxTitleLength: 200
-   }
- });
- ```
+ No environment variables required! The app works out of the box.

- #### Rate Limiting
+ ## Performance

- ```typescript
- import { ScrapingRateLimiter } from '@tyroneross/blog-scraper';
+ - **Single article:** ~2-5 seconds
+ - **Bundle size:** ~150 KB (gzipped)
+ - **Zero API costs:** No external APIs used
+ - **Memory:** ~100 MB average

- // Create custom rate limiter
- const limiter = new ScrapingRateLimiter({
-   maxConcurrent: 5,
-   minTime: 1000 // 1 second between requests
- });
+ ## Testing

- // Use in your scraping logic
- await limiter.execute('example.com', async () => {
-   // Your scraping code here
- });
- ```
+ ### F1 Score Validation

- #### Circuit Breaker
+ The **92.2% F1 score** claim for Mozilla Readability is validated through automated testing using two approaches:

- ```typescript
- import { CircuitBreaker } from '@tyroneross/blog-scraper';
+ #### 1. Dragnet Benchmark Dataset (Recommended)

- const breaker = new CircuitBreaker('my-operation', {
-   failureThreshold: 5,
-   resetTimeout: 60000 // 1 minute
- });
+ Uses the established [Dragnet benchmark dataset](https://github.com/seomoz/dragnet_data) - a well-documented, peer-reviewed dataset used in academic research:

- const result = await breaker.execute(async () => {
-   // Your operation here
- });
+ ```bash
+ npm run test:f1:dragnet
 ```

- ## Examples
+ **Results: 91.4% F1 score** (0.8% from claimed 92.2%)
+ - 📊 Dataset: 414 test articles (20 tested for efficiency)
+ - 📚 Source: Published research (2013)
+ - ✅ 100% extraction success rate
+ - 📈 92.6% Precision, 92.3% Recall

- ### Example 1: Scrape with Custom Deny Patterns
+ #### 2. Custom Test Dataset

- ```typescript
- import { scrape } from '@tyroneross/blog-scraper';
-
- const result = await scrape('https://techcrunch.com', {
-   denyPaths: [
-     '/',
-     '/about',
-     '/contact',
-     '/tag/*', // Exclude all tag pages
-     '/author/*' // Exclude all author pages
-   ],
-   maxArticles: 20
- });
+ Quick validation with curated test articles:
+
+ ```bash
+ npm run test:f1
 ```

- ### Example 2: Build Custom Pipeline
+ **Results: 96.3% F1 score**
+ - 3 manually-labeled test articles
+ - Useful for quick validation and development

- ```typescript
- import {
-   SourceOrchestrator,
-   ContentExtractor,
-   calculateArticleQualityScore
- } from '@tyroneross/blog-scraper';
-
- // Step 1: Discover articles
- const orchestrator = new SourceOrchestrator();
- const discovered = await orchestrator.processSource('https://example.com', {
-   sourceType: 'auto'
- });
+ ---

- // Step 2: Extract content
- const extractor = new ContentExtractor();
- const extracted = await Promise.all(
-   discovered.articles
-     .slice(0, 10)
-     .map(a => extractor.extractContent(a.url))
- );
+ **What is F1 Score?**
+ - **Precision**: % of extracted content that is actually article content (not ads/navigation)
+ - **Recall**: % of actual article content that was successfully extracted
+ - **F1 Score**: Harmonic mean of precision and recall

- // Step 3: Score and filter
- const scored = extracted
-   .filter(Boolean)
-   .map(content => ({
-     content,
-     score: calculateArticleQualityScore(content!)
-   }))
-   .filter(item => item.score >= 0.7);
+ **Conclusion:** The 92.2% F1 claim is **validated** using the established Dragnet benchmark dataset (91.4% achieved).

- console.log(`Found ${scored.length} high-quality articles`);
- ```
+ See [tests/README.md](./tests/README.md) for detailed testing documentation and how to add new test cases.

- ### Example 3: RSS-Only Scraping
+ ## License

- ```typescript
- import { scrape } from '@tyroneross/blog-scraper';
+ MIT

- const result = await scrape('https://example.com', {
-   sourceType: 'rss', // Only use RSS feeds
-   extractFullContent: false, // Don't extract full content
-   maxArticles: 100
- });
- ```
+ ## Contributing

- ## How It Works
+ Contributions welcome! Areas for improvement:
+ - RSS/Sitemap discovery
+ - Batch URL processing
+ - Export functionality (CSV, JSON)
+ - Custom quality scoring
+ - Dark mode

- ### 3-Tier Filtering System
+ ## Support

- **Tier 1: URL Deny Patterns**
- - Fast pattern-based filtering
- - Excludes non-article pages (/, /about, /tag/*, etc.)
- - Customizable patterns
+ - Issues: https://github.com/tyroneross/scraper-app/issues
+ - Questions: Open a discussion

- **Tier 2: Content Validation**
- - Minimum 200 characters
- - Title length 10-200 characters
- - Text-to-HTML ratio ≥ 10%
+ ---

- **Tier 3: Quality Scoring**
- - Content quality: 60% weight
- - Publication date: 12% weight
- - Author/byline: 8% weight
- - Schema.org metadata: 8% weight
- - Reading time: 12% weight
- - Default threshold: 50%
+ ## SDK Documentation

- ### Auto-Detection Flow
+ The SDK provides programmatic access to the scraping engine without the web UI.

- 1. Try RSS feed (highest confidence)
- 2. Discover RSS feeds from HTML
- 3. Try sitemap parsing
- 4. Discover sitemaps from domain
- 5. Fall back to HTML link extraction
+ ### Installation

- ## TypeScript Support
+ ```bash
+ npm install
+ ```

- Full TypeScript support with exported types:
+ ### Basic Usage

 ```typescript
- import type {
-   ScrapedArticle,
-   ScraperTestResult,
-   ScrapeOptions,
-   ExtractedContent,
-   QualityScoreConfig
- } from '@tyroneross/blog-scraper';
+ import { scrapeWebsite } from './lib';
+
+ const result = await scrapeWebsite('https://example.com/blog', {
+   maxArticles: 10,          // Max articles to return (default: 10)
+   extractFullContent: true, // Get full article text (default: true)
+   qualityThreshold: 0.5,    // Min quality score 0-1 (default: 0.5)
+   sourceType: 'auto',       // 'auto' | 'rss' | 'sitemap' | 'html'
+   allowPaths: ['/blog/*'],  // Only scrape these paths
+   denyPaths: ['/about'],    // Skip these paths
+   onProgress: (done, total) => console.log(`${done}/${total}`)
+ });
 ```

- ## Performance
+ ### Response Format

- - **Single article extraction:** ~2-5 seconds
- - **Bundle size:** ~150 KB (gzipped)
- - **Memory usage:** ~100 MB average
- - **No external APIs:** Zero API costs
+ ```typescript
+ {
+   url: string;
+   detectedType: 'rss' | 'sitemap' | 'html';
+   articles: Array<{
+     url: string;
+     title: string;
+     publishedDate: string;
+     description?: string;
+     fullContent?: string;         // Raw HTML
+     fullContentMarkdown?: string; // Formatted markdown
+     fullContentText?: string;     // Plain text
+     qualityScore: number;         // 0-1
+     confidence: number;
+     source: 'rss' | 'sitemap' | 'html';
+   }>;
+   stats: {
+     totalDiscovered: number;
+     afterQualityFilter: number;
+     processingTime: number;
+   };
+   errors: string[];
+ }
+ ```

- ## Requirements
+ ### Advanced: Direct Orchestrator

- - Node.js ≥ 18.0.0
- - No environment variables needed
+ ```typescript
+ import { globalSourceOrchestrator } from './lib';

- ## License
+ const result = await globalSourceOrchestrator.processSource(url, {
+   sourceType: 'auto',
+   allowPaths: ['/news/*'],
+   denyPaths: ['/about', '/careers/*']
+ });

- MIT © Tyrone Ross
+ // Enhance with full content (parallel processing)
+ const enhanced = await globalSourceOrchestrator.enhanceWithFullContent(
+   result.articles,
+   10,
+   { concurrency: 5, onProgress: (done, total) => {} }
+ );
+ ```

- ## Contributing
+ ### Rate Limiter Presets

- Contributions welcome! Please open an issue or PR.
+ ```typescript
+ import { createRateLimiter } from './lib';

- ## Support
+ const limiter = createRateLimiter('moderate'); // or 'conservative', 'aggressive'
+ ```
+
+ | Preset | Req/s | Max Concurrent | Per Host |
+ |--------|-------|----------------|----------|
+ | conservative | 1 | 10 | 2 |
+ | moderate | 2 | 20 | 3 |
+ | aggressive | 4 | 30 | 5 |
+
+ ### Path Patterns
+
+ ```typescript
+ '/blog/*'      // Matches /blog/anything
+ '/news/2024/*' // Matches /news/2024/anything
+ '/about'       // Exact match
+ ```
+
+ **Default deny patterns:** `/`, `/about/*`, `/careers/*`, `/contact/*`, `/tag/*`, `/category/*`, `/login`, `/signup`, `/pricing/*`
+
+ ### Quality Scoring

- - [GitHub Issues](https://github.com/tyroneross/blog-content-scraper/issues)
- - [Documentation](https://github.com/tyroneross/blog-content-scraper#readme)
+ Score weights:
+ - Content quality: 60%
+ - Publication date: 12%
+ - Author/byline: 8%
+ - Schema.org data: 8%
+ - Reading time: 12%

 ---

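Note on the README change above: the 0.1.0 quick start imported `scrape` from the package name, while the 0.3.0 README imports `scrapeWebsite` from a relative `./lib` path. Below is a hedged before/after sketch of the surface change, built only from the two README versions shown in this diff; whether 0.3.0 still re-exports these names from the package's main entry point is not confirmed here.

```typescript
// 0.1.0, per the old README: package-level import with auto-detection.
import { scrape } from '@tyroneross/blog-scraper';

const v1 = await scrape('https://example.com/blog', {
  sourceType: 'auto',
  maxArticles: 50,
  extractFullContent: true,
  denyPaths: ['/about', '/contact'],
  qualityThreshold: 0.6
});

// 0.3.0, per the new README: scrapeWebsite() adds allowPaths and onProgress.
// The relative './lib' import path is copied from the README as-is and may
// not match the published entry point.
import { scrapeWebsite } from './lib';

const v3 = await scrapeWebsite('https://example.com/blog', {
  sourceType: 'auto',
  maxArticles: 10,
  extractFullContent: true,
  allowPaths: ['/blog/*'],
  denyPaths: ['/about'],
  qualityThreshold: 0.5,
  onProgress: (done: number, total: number) => console.log(`${done}/${total}`)
});
```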
package/dist/lib/circuit-breaker.d.ts ADDED
@@ -0,0 +1,29 @@
+ interface CircuitBreakerOptions {
+   failureThreshold: number;
+   timeout: number;
+   resetTimeout: number;
+   name: string;
+ }
+ export declare class CircuitBreaker {
+   private failures;
+   private lastFailureTime;
+   private state;
+   private options;
+   constructor(options: CircuitBreakerOptions);
+   execute<T>(operation: () => Promise<T>): Promise<T>;
+   private executeWithTimeout;
+   private onSuccess;
+   private onFailure;
+   getState(): {
+     state: "CLOSED" | "OPEN" | "HALF_OPEN";
+     failures: number;
+     lastFailureTime: number;
+   };
+ }
+ export declare const circuitBreakers: {
+   rss: CircuitBreaker;
+   scraping: CircuitBreaker;
+   scrapingTest: CircuitBreaker;
+ };
+ export {};
+ //# sourceMappingURL=circuit-breaker.d.ts.map
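The new `circuit-breaker.d.ts` above defines the breaker's public surface: a single required options object (note that 0.1.0's README showed `new CircuitBreaker('my-operation', { ... })`, whereas this declaration folds the name into the options), an `execute()` wrapper, and a `getState()` probe. A minimal usage sketch inferred from those declarations; the deep import path and option values are assumptions, and the millisecond units are inferred from the old README's `resetTimeout: 60000 // 1 minute` comment:

```typescript
import { CircuitBreaker } from '@tyroneross/blog-scraper/dist/lib/circuit-breaker';

// All four fields are required by CircuitBreakerOptions.
const breaker = new CircuitBreaker({
  name: 'rss-fetch',    // label for this breaker
  failureThreshold: 5,  // failures before the breaker opens
  timeout: 10_000,      // per-operation timeout, presumably ms
  resetTimeout: 60_000  // wait before a HALF_OPEN probe, presumably ms
});

// execute() runs the operation while the breaker is CLOSED (or probing in
// HALF_OPEN) and should reject fast once it is OPEN.
async function fetchFeed(url: string): Promise<string> {
  return breaker.execute(async () => {
    const res = await fetch(url); // global fetch, Node >= 18
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    return res.text();
  });
}

// getState() exposes the breaker for monitoring:
// { state: "CLOSED" | "OPEN" | "HALF_OPEN", failures, lastFailureTime }
console.log(breaker.getState());
```

The module also ships preconfigured instances (`circuitBreakers.rss`, `.scraping`, `.scrapingTest`) per the declaration above.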
package/dist/lib/circuit-breaker.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"circuit-breaker.d.ts","sourceRoot":"","sources":["../../lib/circuit-breaker.ts"],"names":[],"mappings":"AAAA,UAAU,qBAAqB;IAC7B,gBAAgB,EAAE,MAAM,CAAC;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAK;IACrB,OAAO,CAAC,eAAe,CAAK;IAC5B,OAAO,CAAC,KAAK,CAA6C;IAC1D,OAAO,CAAC,OAAO,CAAwB;gBAE3B,OAAO,EAAE,qBAAqB;IAIpC,OAAO,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;YAoB3C,kBAAkB;IAkBhC,OAAO,CAAC,SAAS;IAKjB,OAAO,CAAC,SAAS;IAUjB,QAAQ;;;;;CAOT;AAGD,eAAO,MAAM,eAAe;;;;CAqB3B,CAAC"}