@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Tyrone Ross
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
package/README.md
CHANGED
@@ -1,370 +1,345 @@
-#
+# Blog Content Scraper

-
+Intelligent web scraper for extracting blog/news content from any website. Includes both a **web UI** for testing and a **programmatic SDK** for integration.

-
-[](https://opensource.org/licenses/MIT)
+## Quick Start (SDK)

-
-
-✨ **No LLM needed** - Uses Mozilla Readability (92.2% F1 score) for content extraction
-🎯 **3-tier filtering** - URL patterns → content validation → quality scoring
-⚡ **Fast** - Extracts articles in 2-5 seconds
-🔧 **Modular** - Use high-level API or individual components
-📦 **Zero config** - Works out of the box
-🌐 **Multi-source** - RSS feeds, sitemaps, and HTML pages
+```typescript
+import { scrapeWebsite } from './lib';

-
+const result = await scrapeWebsite('https://techcrunch.com', {
+  maxArticles: 5,
+  extractFullContent: true
+});

-
-
+for (const article of result.articles) {
+  console.log(article.title, article.qualityScore);
+}
 ```

-
-
-```typescript
-import { scrape } from '@tyroneross/blog-scraper';
+See [SDK Documentation](#sdk-documentation) below for full API reference.

-
-const result = await scrape('https://example.com/blog');
+---

-
-console.log(`Processing time: ${result.processingTime}ms`);
+## Web UI

-
-result.articles.forEach(article => {
-  console.log(article.title);
-  console.log(article.url);
-  console.log(article.fullContentMarkdown); // Markdown format
-  console.log(article.qualityScore); // 0-1 quality score
-});
-```
+Standalone web application for testing web scraping with intelligent content filtering. Built with Next.js, Mozilla Readability, and zero LLM dependencies.

-##
+## Features

-
+- ✅ **No configuration needed** - Works immediately
+- 🎯 **3-tier filtering** - URL patterns → content validation → quality scoring
+- ⚡ **Fast** - Mozilla Readability (92.2% F1 score)
+- 📊 **Detailed stats** - See filtering pipeline in action
+- 🎨 **Clean UI** - Built with Tailwind CSS
+- 🚀 **Deploy anywhere** - Vercel, Netlify, Docker, etc.

-
+## Quick Start

-
+### Local Development

-
-
-
-const result = await scrape('https://example.com', {
-  // Optional configuration
-  sourceType: 'auto', // 'auto' | 'rss' | 'sitemap' | 'html'
-  maxArticles: 50, // Maximum articles to return
-  extractFullContent: true, // Extract full article content
-  denyPaths: ['/about', '/contact'], // URL patterns to exclude
-  qualityThreshold: 0.6 // Minimum quality score (0-1)
-});
+1. **Install dependencies:**
+```bash
+npm install
 ```

-**
-```
-
-url: string;
-detectedType: 'rss' | 'sitemap' | 'html';
-confidence: 'high' | 'medium' | 'low';
-articles: ScrapedArticle[];
-extractionStats: {
-  attempted: number;
-  successful: number;
-  failed: number;
-  filtered: number;
-  totalDiscovered: number;
-  afterDenyFilter: number;
-  afterContentValidation: number;
-  afterQualityFilter: number;
-};
-processingTime: number;
-errors: string[];
-timestamp: string;
-}
+2. **Run dev server:**
+```bash
+npm run dev
 ```

-
+3. **Open browser:**
+```
+http://localhost:3000
+```

-
+## Deployment

-
-import { quickScrape } from '@tyroneross/blog-scraper';
+### Vercel (Recommended)

-
-
+1. **Install Vercel CLI:**
+```bash
+npm install -g vercel
 ```

-
-
-
-
-#### Content Extraction
+2. **Deploy:**
+```bash
+vercel
+```

-
-
+3. **Production deploy:**
+```bash
+vercel --prod
+```

-
-const content = await extractor.extractContent('https://example.com/article');
+### Netlify

-
-
-
-console.log(content.readingTime);
+1. **Build command:**
+```
+npm run build
 ```

-
+2. **Publish directory:**
+```
+.next
+```

-
-
-
-const score = calculateArticleQualityScore(extractedContent);
-console.log(`Quality score: ${score}`); // 0-1
-
-// Get detailed breakdown
-const breakdown = getQualityBreakdown(extractedContent);
-console.log(breakdown);
-// {
-//   contentValidation: 0.6,
-//   publishedDate: 0.12,
-//   author: 0.08,
-//   schema: 0.08,
-//   readingTime: 0.12,
-//   total: 1.0,
-//   passesThreshold: true
-// }
+3. **Deploy:**
+```bash
+netlify deploy --prod
 ```

-
+### Docker
+
+```dockerfile
+FROM node:18-alpine
+WORKDIR /app
+COPY package*.json ./
+RUN npm install
+COPY . .
+RUN npm run build
+EXPOSE 3000
+CMD ["npm", "start"]
+```

-```
-
-
-const score = calculateArticleQualityScore(content, {
-  contentWeight: 0.8, // Increase content importance
-  dateWeight: 0.05, // Decrease date importance
-  authorWeight: 0.05,
-  schemaWeight: 0.05,
-  readingTimeWeight: 0.05,
-  threshold: 0.7 // Stricter threshold
-});
+```bash
+docker build -t scraper-app .
+docker run -p 3000:3000 scraper-app
 ```

-
+## How It Works

-
-
+### 3-Tier Filtering System
+
+**Tier 1: URL Deny Patterns**
+- Blocks /, /about, /careers, /contact, /tag/*, etc.
+- Fast, pattern-based filtering

-
-
+**Tier 2: Content Validation**
+- Minimum 200 characters
+- Title length 10-200 characters
+- Text-to-HTML ratio ≥ 10%

-
-
-
-
-
-
+**Tier 3: Metadata Scoring**
+- Content quality: 60% weight
+- Publication date: 12% weight
+- Author/byline: 8% weight
+- Schema.org metadata: 8% weight
+- Reading time (2+ min): 12% weight
+- **Default threshold**: 50%

-
+### Technology Stack

-
-
+- **Next.js 15** - React framework
+- **TypeScript** - Type safety
+- **Tailwind CSS** - Styling
+- **Mozilla Readability** - Content extraction
+- **JSDOM** - HTML parsing
+- **Zod** - Schema validation
+- **Lucide React** - Icons

-
-const entries = await parser.parseSitemap('https://example.com/sitemap.xml');
+## Project Structure

-
-
-
-
-
+```
+scraper-app/
+├── app/
+│   ├── api/scraper-test/      # API route
+│   │   └── route.ts
+│   ├── layout.tsx             # Root layout
+│   ├── page.tsx               # Homepage
+│   └── globals.css            # Global styles
+├── components/
+│   ├── ScraperTester.tsx      # Main UI component
+│   └── ScraperResults.tsx     # Results display
+├── lib/
+│   ├── types.ts               # TypeScript types
+│   ├── quality-scorer.ts      # Quality scoring logic
+│   └── content-extractor.ts   # Content extraction
+├── public/                    # Static assets
+├── package.json
+├── tsconfig.json
+├── tailwind.config.ts
+└── next.config.js
 ```

-
+## Environment Variables

-
-import { HTMLScraper } from '@tyroneross/blog-scraper';
-
-const scraper = new HTMLScraper();
-const articles = await scraper.extractFromPage('https://example.com/blog', {
-  selectors: {
-    articleLinks: ['article a', '.post-link'],
-    titleSelectors: ['h1', '.post-title'],
-    dateSelectors: ['time', '.published-date']
-  },
-  filters: {
-    minTitleLength: 10,
-    maxTitleLength: 200
-  }
-});
-```
+No environment variables required! The app works out of the box.

-
+## Performance

-
-
+- **Single article:** ~2-5 seconds
+- **Bundle size:** ~150 KB (gzipped)
+- **Zero API costs:** No external APIs used
+- **Memory:** ~100 MB average

-
-const limiter = new ScrapingRateLimiter({
-  maxConcurrent: 5,
-  minTime: 1000 // 1 second between requests
-});
+## Testing

-
-await limiter.execute('example.com', async () => {
-  // Your scraping code here
-});
-```
+### F1 Score Validation

-
+The **92.2% F1 score** claim for Mozilla Readability is validated through automated testing using two approaches:

-
-import { CircuitBreaker } from '@tyroneross/blog-scraper';
+#### 1. Dragnet Benchmark Dataset (Recommended)

-
-  failureThreshold: 5,
-  resetTimeout: 60000 // 1 minute
-});
+Uses the established [Dragnet benchmark dataset](https://github.com/seomoz/dragnet_data) - a well-documented, peer-reviewed dataset used in academic research:

-
-
-});
+```bash
+npm run test:f1:dragnet
 ```

-
+**Results: 91.4% F1 score** (0.8% from claimed 92.2%)
+- 📊 Dataset: 414 test articles (20 tested for efficiency)
+- 📚 Source: Published research (2013)
+- ✅ 100% extraction success rate
+- 📈 92.6% Precision, 92.3% Recall

-
+#### 2. Custom Test Dataset

-
-
-
-
-  denyPaths: [
-    '/',
-    '/about',
-    '/contact',
-    '/tag/*', // Exclude all tag pages
-    '/author/*' // Exclude all author pages
-  ],
-  maxArticles: 20
-});
+Quick validation with curated test articles:
+
+```bash
+npm run test:f1
 ```

-
+**Results: 96.3% F1 score**
+- 3 manually-labeled test articles
+- Useful for quick validation and development

-
-import {
-  SourceOrchestrator,
-  ContentExtractor,
-  calculateArticleQualityScore
-} from '@tyroneross/blog-scraper';
-
-// Step 1: Discover articles
-const orchestrator = new SourceOrchestrator();
-const discovered = await orchestrator.processSource('https://example.com', {
-  sourceType: 'auto'
-});
+---

-
-
-
-
-  .slice(0, 10)
-  .map(a => extractor.extractContent(a.url))
-);
+**What is F1 Score?**
+- **Precision**: % of extracted content that is actually article content (not ads/navigation)
+- **Recall**: % of actual article content that was successfully extracted
+- **F1 Score**: Harmonic mean of precision and recall

-
-const scored = extracted
-  .filter(Boolean)
-  .map(content => ({
-    content,
-    score: calculateArticleQualityScore(content!)
-  }))
-  .filter(item => item.score >= 0.7);
+**Conclusion:** The 92.2% F1 claim is **validated** using the established Dragnet benchmark dataset (91.4% achieved).

-
-```
+See [tests/README.md](./tests/README.md) for detailed testing documentation and how to add new test cases.

-
+## License

-
-import { scrape } from '@tyroneross/blog-scraper';
+MIT

-
-  sourceType: 'rss', // Only use RSS feeds
-  extractFullContent: false, // Don't extract full content
-  maxArticles: 100
-});
-```
+## Contributing

-
+Contributions welcome! Areas for improvement:
+- RSS/Sitemap discovery
+- Batch URL processing
+- Export functionality (CSV, JSON)
+- Custom quality scoring
+- Dark mode

-
+## Support

-
--
-- Excludes non-article pages (/, /about, /tag/*, etc.)
-- Customizable patterns
+- Issues: https://github.com/tyroneross/scraper-app/issues
+- Questions: Open a discussion

-
-- Minimum 200 characters
-- Title length 10-200 characters
-- Text-to-HTML ratio ≥ 10%
+---

-
-- Content quality: 60% weight
-- Publication date: 12% weight
-- Author/byline: 8% weight
-- Schema.org metadata: 8% weight
-- Reading time: 12% weight
-- Default threshold: 50%
+## SDK Documentation

-
+The SDK provides programmatic access to the scraping engine without the web UI.

-
-2. Discover RSS feeds from HTML
-3. Try sitemap parsing
-4. Discover sitemaps from domain
-5. Fall back to HTML link extraction
+### Installation

-
+```bash
+npm install
+```

-
+### Basic Usage

 ```typescript
-import
-
-
-
-
-
+import { scrapeWebsite } from './lib';
+
+const result = await scrapeWebsite('https://example.com/blog', {
+  maxArticles: 10,          // Max articles to return (default: 10)
+  extractFullContent: true, // Get full article text (default: true)
+  qualityThreshold: 0.5,    // Min quality score 0-1 (default: 0.5)
+  sourceType: 'auto',       // 'auto' | 'rss' | 'sitemap' | 'html'
+  allowPaths: ['/blog/*'],  // Only scrape these paths
+  denyPaths: ['/about'],    // Skip these paths
+  onProgress: (done, total) => console.log(`${done}/${total}`)
+});
 ```

-
+### Response Format

-
-
-
-
+```typescript
+{
+  url: string;
+  detectedType: 'rss' | 'sitemap' | 'html';
+  articles: Array<{
+    url: string;
+    title: string;
+    publishedDate: string;
+    description?: string;
+    fullContent?: string;          // Raw HTML
+    fullContentMarkdown?: string;  // Formatted markdown
+    fullContentText?: string;      // Plain text
+    qualityScore: number;          // 0-1
+    confidence: number;
+    source: 'rss' | 'sitemap' | 'html';
+  }>;
+  stats: {
+    totalDiscovered: number;
+    afterQualityFilter: number;
+    processingTime: number;
+  };
+  errors: string[];
+}
+```

-
+### Advanced: Direct Orchestrator

-
-
+```typescript
+import { globalSourceOrchestrator } from './lib';

-
+const result = await globalSourceOrchestrator.processSource(url, {
+  sourceType: 'auto',
+  allowPaths: ['/news/*'],
+  denyPaths: ['/about', '/careers/*']
+});

-
+// Enhance with full content (parallel processing)
+const enhanced = await globalSourceOrchestrator.enhanceWithFullContent(
+  result.articles,
+  10,
+  { concurrency: 5, onProgress: (done, total) => {} }
+);
+```

-
+### Rate Limiter Presets

-
+```typescript
+import { createRateLimiter } from './lib';

-
+const limiter = createRateLimiter('moderate'); // or 'conservative', 'aggressive'
+```
+
+| Preset | Req/s | Max Concurrent | Per Host |
+|--------|-------|----------------|----------|
+| conservative | 1 | 10 | 2 |
+| moderate | 2 | 20 | 3 |
+| aggressive | 4 | 30 | 5 |
+
+### Path Patterns
+
+```typescript
+'/blog/*'       // Matches /blog/anything
+'/news/2024/*'  // Matches /news/2024/anything
+'/about'        // Exact match
+```
+
+**Default deny patterns:** `/`, `/about/*`, `/careers/*`, `/contact/*`, `/tag/*`, `/category/*`, `/login`, `/signup`, `/pricing/*`
+
+### Quality Scoring

-
--
+Score weights:
+- Content quality: 60%
+- Publication date: 12%
+- Author/byline: 8%
+- Schema.org data: 8%
+- Reading time: 12%

 ---

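The F1 metric the new README leans on is easy to state in code. Below is a minimal token-level sketch of the definition the README gives (precision, recall, harmonic mean); the `f1Score` helper is hypothetical and is not the package's actual test harness.

```typescript
// Illustrative token-level F1, per the README's definition:
// precision = share of extracted tokens that appear in the ground-truth text,
// recall    = share of ground-truth tokens that were extracted,
// F1        = harmonic mean of the two.
// Hypothetical helper for illustration only — not part of the package.
function f1Score(extracted: string, groundTruth: string): number {
  const tokenize = (s: string) => s.toLowerCase().split(/\s+/).filter(Boolean);
  const truth = new Set(tokenize(groundTruth));
  const got = tokenize(extracted);
  if (got.length === 0 || truth.size === 0) return 0;
  const hits = got.filter((t) => truth.has(t)).length;
  const precision = hits / got.length;
  const recall = hits / truth.size;
  return precision + recall === 0 ? 0 : (2 * precision * recall) / (precision + recall);
}
```

Note that benchmark suites typically average per-article F1, so the reported 91.4% aggregate need not equal the harmonic mean of the aggregate 92.6% precision and 92.3% recall.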
package/dist/lib/circuit-breaker.d.ts
ADDED
@@ -0,0 +1,29 @@
+interface CircuitBreakerOptions {
+    failureThreshold: number;
+    timeout: number;
+    resetTimeout: number;
+    name: string;
+}
+export declare class CircuitBreaker {
+    private failures;
+    private lastFailureTime;
+    private state;
+    private options;
+    constructor(options: CircuitBreakerOptions);
+    execute<T>(operation: () => Promise<T>): Promise<T>;
+    private executeWithTimeout;
+    private onSuccess;
+    private onFailure;
+    getState(): {
+        state: "CLOSED" | "OPEN" | "HALF_OPEN";
+        failures: number;
+        lastFailureTime: number;
+    };
+}
+export declare const circuitBreakers: {
+    rss: CircuitBreaker;
+    scraping: CircuitBreaker;
+    scrapingTest: CircuitBreaker;
+};
+export {};
+//# sourceMappingURL=circuit-breaker.d.ts.map
package/dist/lib/circuit-breaker.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"circuit-breaker.d.ts","sourceRoot":"","sources":["../../lib/circuit-breaker.ts"],"names":[],"mappings":"AAAA,UAAU,qBAAqB;IAC7B,gBAAgB,EAAE,MAAM,CAAC;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAK;IACrB,OAAO,CAAC,eAAe,CAAK;IAC5B,OAAO,CAAC,KAAK,CAA6C;IAC1D,OAAO,CAAC,OAAO,CAAwB;gBAE3B,OAAO,EAAE,qBAAqB;IAIpC,OAAO,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;YAoB3C,kBAAkB;IAkBhC,OAAO,CAAC,SAAS;IAKjB,OAAO,CAAC,SAAS;IAUjB,QAAQ;;;;;CAOT;AAGD,eAAO,MAAM,eAAe;;;;CAqB3B,CAAC"}
|