@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.globalRateLimiter = exports.ScrapingRateLimiter = exports.RATE_LIMITER_PRESETS = void 0;
|
|
4
|
+
exports.createRateLimiter = createRateLimiter;
|
|
5
|
+
// Preset configurations for different use cases
|
|
6
|
+
exports.RATE_LIMITER_PRESETS = {
|
|
7
|
+
// Conservative: for web app with strict limits
|
|
8
|
+
conservative: {
|
|
9
|
+
requestsPerSecond: 1,
|
|
10
|
+
maxBackoff: 30000,
|
|
11
|
+
maxConcurrent: 10,
|
|
12
|
+
maxConcurrentPerHost: 2,
|
|
13
|
+
},
|
|
14
|
+
// Moderate: good balance of speed and politeness (default for web app)
|
|
15
|
+
moderate: {
|
|
16
|
+
requestsPerSecond: 2,
|
|
17
|
+
maxBackoff: 30000,
|
|
18
|
+
maxConcurrent: 20,
|
|
19
|
+
maxConcurrentPerHost: 3,
|
|
20
|
+
},
|
|
21
|
+
// Aggressive: for SDK usage with higher limits
|
|
22
|
+
aggressive: {
|
|
23
|
+
requestsPerSecond: 4,
|
|
24
|
+
maxBackoff: 15000,
|
|
25
|
+
maxConcurrent: 30,
|
|
26
|
+
maxConcurrentPerHost: 5,
|
|
27
|
+
},
|
|
28
|
+
};
|
|
29
|
+
/**
 * Per-host request scheduler for scraping.
 *
 * Every call to `execute()` is queued under the hostname of its URL. Each host
 * has its own priority queue, a minimum spacing between request starts
 * (`baseDelay`), and an exponential backoff window that is opened by
 * rate-limit / server / connection errors and closed again by the next
 * successful request. A global cap (`maxConcurrent`) limits how many
 * operations may be in flight across all hosts.
 */
class ScrapingRateLimiter {
    /**
     * @param {object} [options]
     * @param {number} [options.requestsPerSecond=2] Per-host request rate; converted to a minimum delay in ms.
     * @param {number} [options.maxBackoff=30000] Upper bound, in ms, for a single backoff window.
     * @param {number} [options.maxConcurrent=20] Maximum operations in flight across all hosts.
     * @param {number} [options.maxConcurrentPerHost=3] Maximum operations in flight for one host.
     */
    constructor(options = {}) {
        // hostname -> { lastRequest, backoffUntil, backoffMultiplier, queue, processing, activeCount }
        this.hosts = new Map();
        // Ids of operations currently running, across all hosts.
        this.activeRequests = new Set();
        this.baseDelay = Math.floor(1000 / (options.requestsPerSecond || 2));
        this.maxBackoff = options.maxBackoff || 30000;
        this.maxConcurrent = options.maxConcurrent || 20;
        this.maxConcurrentPerHost = options.maxConcurrentPerHost || 3;
    }

    /**
     * Build a limiter from one of the entries of RATE_LIMITER_PRESETS.
     * An unknown preset name yields a limiter with the constructor defaults.
     *
     * @param {string} preset Key of RATE_LIMITER_PRESETS.
     * @returns {ScrapingRateLimiter}
     */
    static fromPreset(preset) {
        return new ScrapingRateLimiter(exports.RATE_LIMITER_PRESETS[preset]);
    }

    /**
     * Queue `operation` for the host of `url` and settle with its outcome.
     *
     * @param {string} url Absolute URL; only its hostname is used, as the queue key.
     * @param {() => any} operation Work to run; may return a promise.
     * @param {object} [options]
     * @param {number} [options.priority=0] Higher values are dequeued first.
     * @param {number} [options.maxRetries=3] Retries after the first attempt; `0` disables retries.
     * @returns {Promise<any>} Resolves with the operation's result, or rejects with
     *   its last error once it is not retryable or retries are exhausted.
     * @throws {Error} (as a rejection) `Invalid URL: ...` when no hostname can be parsed.
     */
    async execute(url, operation, options = {}) {
        const host = this.extractHost(url);
        if (!host) {
            throw new Error(`Invalid URL: ${url}`);
        }
        // Honour an explicit 0 ("never retry"); `|| 3` would silently turn it into 3.
        const hasExplicitRetries = typeof options.maxRetries === 'number' && !Number.isNaN(options.maxRetries);
        const maxRetries = hasExplicitRetries ? options.maxRetries : 3;
        return new Promise((resolve, reject) => {
            const request = {
                resolve,
                reject,
                operation,
                priority: options.priority || 0,
                retryCount: 0,
                maxRetries,
                host
            };
            this.enqueueRequest(host, request);
        });
    }

    /**
     * @param {string} url
     * @returns {string|null} Lower-cased hostname, or null when `url` cannot be parsed.
     */
    extractHost(url) {
        try {
            const parsed = new URL(url);
            return parsed.hostname.toLowerCase();
        }
        catch {
            return null;
        }
    }

    /**
     * Insert `request` into the host's queue (creating the host state on first
     * use) and start the queue processor if it is idle.
     */
    enqueueRequest(host, request) {
        if (!this.hosts.has(host)) {
            this.hosts.set(host, {
                lastRequest: 0,
                backoffUntil: 0,
                backoffMultiplier: 1,
                queue: [],
                processing: false,
                activeCount: 0
            });
        }
        const hostState = this.hosts.get(host);
        // Insert request in priority order (higher priority first); equal
        // priorities keep their arrival order.
        const insertIndex = hostState.queue.findIndex(req => req.priority < request.priority);
        if (insertIndex === -1) {
            hostState.queue.push(request);
        }
        else {
            hostState.queue.splice(insertIndex, 0, request);
        }
        // Start processing if not already running
        if (!hostState.processing) {
            this.processQueue(host).catch(error => {
                console.error(`[RateLimiter] Error processing queue for ${host}:`, error);
            });
        }
    }

    /**
     * Drain the queue of one host, honouring the concurrency caps, the backoff
     * window and the minimum spacing between requests.
     *
     * Because each operation is awaited inside the loop, this loop runs at most
     * one operation per host at a time; the per-host concurrency check is kept
     * as a safety net.
     */
    async processQueue(host) {
        const hostState = this.hosts.get(host);
        if (!hostState || hostState.processing) {
            return;
        }
        hostState.processing = true;
        try {
            while (hostState.queue.length > 0) {
                // Check if we're within global concurrent limits
                if (this.activeRequests.size >= this.maxConcurrent) {
                    await this.wait(100);
                    continue;
                }
                // Check if we're within per-host concurrent limits
                if (hostState.activeCount >= this.maxConcurrentPerHost) {
                    await this.wait(100);
                    continue;
                }
                // Check if we're still in backoff period; sleep in slices of at
                // most one second so the conditions are re-evaluated regularly.
                if (Date.now() < hostState.backoffUntil) {
                    const waitTime = hostState.backoffUntil - Date.now();
                    await this.wait(Math.min(waitTime, 1000));
                    continue;
                }
                // Check rate limiting
                const now = Date.now();
                const timeSinceLastRequest = now - hostState.lastRequest;
                if (timeSinceLastRequest < this.baseDelay) {
                    const waitTime = this.baseDelay - timeSinceLastRequest;
                    await this.wait(waitTime);
                    continue;
                }
                const request = hostState.queue.shift();
                const requestId = `${host}-${Date.now()}-${Math.random()}`;
                this.activeRequests.add(requestId);
                hostState.activeCount++;
                try {
                    hostState.lastRequest = Date.now();
                    const result = await request.operation();
                    // Reset backoff on success
                    hostState.backoffMultiplier = 1;
                    hostState.backoffUntil = 0;
                    request.resolve(result);
                }
                catch (error) {
                    try {
                        await this.handleRequestError(hostState, request, error);
                    }
                    catch (handlerError) {
                        // Error handling itself failed: never leave the caller's
                        // promise pending, and keep draining the queue.
                        request.reject(error);
                    }
                }
                finally {
                    this.activeRequests.delete(requestId);
                    hostState.activeCount--;
                }
            }
        }
        finally {
            hostState.processing = false;
        }
    }

    /**
     * Decide what happens to a failed request: either put it back at the head
     * of the host queue for another attempt (optionally opening a backoff
     * window for the host), or reject the caller's promise with `error`.
     */
    async handleRequestError(hostState, request, error) {
        const reason = this.describeError(error);
        const shouldRetry = this.shouldRetry(error, request);
        if (shouldRetry && request.retryCount < request.maxRetries) {
            request.retryCount++;
            // Apply exponential backoff for certain errors
            if (this.shouldBackoff(error)) {
                const backoffTime = Math.min(this.baseDelay * hostState.backoffMultiplier * Math.pow(2, request.retryCount), this.maxBackoff);
                hostState.backoffUntil = Date.now() + backoffTime;
                hostState.backoffMultiplier = Math.min(hostState.backoffMultiplier * 1.5, 10);
                console.warn(`[RateLimiter] Backing off ${request.host} for ${backoffTime}ms ` +
                    `(attempt ${request.retryCount}/${request.maxRetries}): ${reason}`);
            }
            // Lower the stored priority (floor of -10) and put the request back
            // at the head of the queue, so it is the next one attempted for
            // this host.
            request.priority = Math.max(request.priority - 1, -10);
            hostState.queue.unshift(request);
        }
        else {
            console.error(`[RateLimiter] Request failed for ${request.host} ` +
                `(${request.retryCount}/${request.maxRetries} retries): ${reason}`);
            request.reject(error);
        }
    }

    /**
     * Text used in log lines for a thrown value; tolerates values that are not
     * Error objects (null, strings, plain objects).
     *
     * @param {*} error
     * @returns {string}
     */
    describeError(error) {
        if (error != null && typeof error.message === 'string') {
            return error.message;
        }
        return String(error);
    }

    /**
     * @param {*} error Value thrown by the operation.
     * @param {{retryCount: number, maxRetries: number}} request
     * @returns {boolean} true when the failure looks transient and retries remain.
     */
    shouldRetry(error, request) {
        // Don't retry if we've exceeded max retries
        if (request.retryCount >= request.maxRetries) {
            return false;
        }
        // A thrown null/undefined carries no information to classify.
        if (error == null) {
            return false;
        }
        // Retry on network errors
        if (error.code === 'ENOTFOUND' || error.code === 'ECONNREFUSED') {
            return true;
        }
        // Retry on HTTP status codes that might be transient
        if (error.status) {
            const status = error.status;
            return status === 408 || status === 429 || status >= 500;
        }
        // Retry on timeout errors
        const mentionsTimeout = typeof error.message === 'string' && error.message.includes('timeout');
        if (error.name === 'AbortError' || mentionsTimeout) {
            return true;
        }
        return false;
    }

    /**
     * @param {*} error Value thrown by the operation.
     * @returns {boolean} true when the whole host should pause before the next request.
     */
    shouldBackoff(error) {
        if (error == null) {
            return false;
        }
        // Apply backoff for rate limiting and server errors
        if (error.status) {
            const status = error.status;
            return status === 429 || status >= 500;
        }
        // Apply backoff for connection errors
        if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') {
            return true;
        }
        return false;
    }

    /**
     * @param {number} ms
     * @returns {Promise<void>} Resolves after `ms` milliseconds.
     */
    wait(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /**
     * Utility method to get current queue stats.
     *
     * @returns {{hosts: Object, activeRequests: number, maxConcurrent: number}}
     *   `hosts` maps each known hostname to its queue length, processing flag,
     *   backoff state and last request timestamp.
     */
    getStats() {
        const stats = {};
        this.hosts.forEach((state, host) => {
            stats[host] = {
                queueLength: state.queue.length,
                processing: state.processing,
                backoffUntil: state.backoffUntil,
                backoffMultiplier: state.backoffMultiplier,
                lastRequest: state.lastRequest
            };
        });
        return {
            hosts: stats,
            activeRequests: this.activeRequests.size,
            maxConcurrent: this.maxConcurrent
        };
    }
}
|
|
228
|
+
exports.ScrapingRateLimiter = ScrapingRateLimiter;
|
|
229
|
+
// Default global rate limiter instance (moderate preset for web app)
|
|
230
|
+
exports.globalRateLimiter = ScrapingRateLimiter.fromPreset('moderate');
|
|
231
|
+
/**
 * Factory for SDK users: build a rate limiter from either a preset name or a
 * custom options object.
 *
 * @param {string|object} [config] A key of RATE_LIMITER_PRESETS, or options
 *   passed straight to the ScrapingRateLimiter constructor.
 * @returns {ScrapingRateLimiter}
 */
function createRateLimiter(config) {
    const isPresetName = typeof config === 'string';
    return isPresetName
        ? ScrapingRateLimiter.fromPreset(config)
        : new ScrapingRateLimiter(config);
}
|
|
238
|
+
//# sourceMappingURL=scraping-rate-limiter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scraping-rate-limiter.js","sourceRoot":"","sources":["../../lib/scraping-rate-limiter.ts"],"names":[],"mappings":";;;AA4TA,8CAKC;AAvSD,gDAAgD;AACnC,QAAA,oBAAoB,GAAG;IAClC,+CAA+C;IAC/C,YAAY,EAAE;QACZ,iBAAiB,EAAE,CAAC;QACpB,UAAU,EAAE,KAAK;QACjB,aAAa,EAAE,EAAE;QACjB,oBAAoB,EAAE,CAAC;KACxB;IACD,uEAAuE;IACvE,QAAQ,EAAE;QACR,iBAAiB,EAAE,CAAC;QACpB,UAAU,EAAE,KAAK;QACjB,aAAa,EAAE,EAAE;QACjB,oBAAoB,EAAE,CAAC;KACxB;IACD,+CAA+C;IAC/C,UAAU,EAAE;QACV,iBAAiB,EAAE,CAAC;QACpB,UAAU,EAAE,KAAK;QACjB,aAAa,EAAE,EAAE;QACjB,oBAAoB,EAAE,CAAC;KACxB;CACO,CAAC;AAIX,MAAa,mBAAmB;IAQ9B,YAAY,UAA6B,EAAE;QAPnC,UAAK,GAAG,IAAI,GAAG,EAAqB,CAAC;QAKrC,mBAAc,GAAG,IAAI,GAAG,EAAU,CAAC;QAGzC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,OAAO,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,CAAC;QACrE,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,KAAK,CAAC;QAC9C,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,aAAa,IAAI,EAAE,CAAC;QACjD,IAAI,CAAC,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,IAAI,CAAC,CAAC;IAChE,CAAC;IAED,MAAM,CAAC,UAAU,CAAC,MAAyB;QACzC,OAAO,IAAI,mBAAmB,CAAC,4BAAoB,CAAC,MAAM,CAAC,CAAC,CAAC;IAC/D,CAAC;IAED,KAAK,CAAC,OAAO,CACX,GAAW,EACX,SAA2B,EAC3B,UAGI,EAAE;QAEN,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QACnC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,KAAK,CAAC,gBAAgB,GAAG,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,OAAO,IAAI,OAAO,CAAI,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACxC,MAAM,OAAO,GAAkB;gBAC7B,OAAO;gBACP,MAAM;gBACN,SAAS;gBACT,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,CAAC;gBAC/B,UAAU,EAAE,CAAC;gBACb,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,CAAC;gBACnC,IAAI;aACL,CAAC;YAEF,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACrC,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,WAAW,CAAC,GAAW;QAC7B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,OAAO,MAAM,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,OAAsB;QACzD,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE;gBACnB,WAAW,EAAE,CAAC;gBACd,YAAY,EAAE,CAAC;gBACf,iBAAiB,EAAE,CAAC;gBACpB,K
AAK,EAAE,EAAE;gBACT,UAAU,EAAE,KAAK;gBACjB,WAAW,EAAE,CAAC;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC;QAExC,2DAA2D;QAC3D,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAC3C,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CACvC,CAAC;QAEF,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;YACvB,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,EAAE,OAAO,CAAC,CAAC;QAClD,CAAC;QAED,0CAA0C;QAC1C,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,CAAC;YAC1B,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE;gBACpC,OAAO,CAAC,KAAK,CAAC,4CAA4C,IAAI,GAAG,EAAE,KAAK,CAAC,CAAC;YAC5E,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,IAAY;QACrC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,UAAU,EAAE,CAAC;YACvC,OAAO;QACT,CAAC;QAED,SAAS,CAAC,UAAU,GAAG,IAAI,CAAC;QAE5B,IAAI,CAAC;YACH,OAAO,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,iDAAiD;gBACjD,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;oBACnD,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBACrB,SAAS;gBACX,CAAC;gBAED,mDAAmD;gBACnD,IAAI,SAAS,CAAC,WAAW,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;oBACvD,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBACrB,SAAS;gBACX,CAAC;gBAED,yCAAyC;gBACzC,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,YAAY,EAAE,CAAC;oBACxC,MAAM,QAAQ,GAAG,SAAS,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;oBACrD,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;oBAC1C,SAAS;gBACX,CAAC;gBAED,sBAAsB;gBACtB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBACvB,MAAM,oBAAoB,GAAG,GAAG,GAAG,SAAS,CAAC,WAAW,CAAC;gBACzD,IAAI,oBAAoB,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;oBAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,GAAG,oBAAoB,CAAC;oBACvD,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;oBAC1B,SAAS;gBACX,CAAC;gBAED,MAAM,OAAO,GAAG,SAAS,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;gBACzC,MAAM,SAAS,GAAG,GAAG,IAAI,IAAI,IAAI,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;gBAC3D,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;gBAC
nC,SAAS,CAAC,WAAW,EAAE,CAAC;gBAExB,IAAI,CAAC;oBACH,SAAS,CAAC,WAAW,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;oBACnC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,SAAS,EAAE,CAAC;oBAEzC,2BAA2B;oBAC3B,SAAS,CAAC,iBAAiB,GAAG,CAAC,CAAC;oBAChC,SAAS,CAAC,YAAY,GAAG,CAAC,CAAC;oBAE3B,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;gBAE1B,CAAC;gBAAC,OAAO,KAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,CAAC,kBAAkB,CAAC,SAAS,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;gBAC3D,CAAC;wBAAS,CAAC;oBACT,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;oBACtC,SAAS,CAAC,WAAW,EAAE,CAAC;gBAC1B,CAAC;YACH,CAAC;QACH,CAAC;gBAAS,CAAC;YACT,SAAS,CAAC,UAAU,GAAG,KAAK,CAAC;QAC/B,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,kBAAkB,CAC9B,SAAoB,EACpB,OAAsB,EACtB,KAAU;QAEV,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAErD,IAAI,WAAW,IAAI,OAAO,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;YAC3D,OAAO,CAAC,UAAU,EAAE,CAAC;YAErB,+CAA+C;YAC/C,IAAI,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC9B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,EAC9E,IAAI,CAAC,UAAU,CAChB,CAAC;gBAEF,SAAS,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,WAAW,CAAC;gBAClD,SAAS,CAAC,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,GAAG,GAAG,EAAE,EAAE,CAAC,CAAC;gBAE9E,OAAO,CAAC,IAAI,CACV,6BAA6B,OAAO,CAAC,IAAI,QAAQ,WAAW,KAAK;oBACjE,YAAY,OAAO,CAAC,UAAU,IAAI,OAAO,CAAC,UAAU,MAAM,KAAK,CAAC,OAAO,EAAE,CAC1E,CAAC;YACJ,CAAC;YAED,2CAA2C;YAC3C,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;YACvD,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEnC,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CACX,oCAAoC,OAAO,CAAC,IAAI,GAAG;gBACnD,IAAI,OAAO,CAAC,UAAU,IAAI,OAAO,CAAC,UAAU,cAAc,KAAK,CAAC,OAAO,EAAE,CAC1E,CAAC;YACF,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAEO,WAAW,CAAC,KAAU,EAAE,OAAsB;QACpD,4CAA4C;QAC5C,IAAI,OAAO,CAAC,UAAU,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YAC7C,OAAO,KAAK,CAAC;QACf,CAAC;QAED,0BAA0B;QAC1B,IAAI,KAAK,CAAC,IAAI,KAAK,WAAW,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,EAAE,CAAC;YAChE,OAAO,IAAI,CAAC;QACd,CAAC;QAED,qDAAqD;QACrD,
IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;YACjB,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;YAC5B,OAAO,MAAM,KAAK,GAAG,IAAI,MAAM,KAAK,GAAG,IAAI,MAAM,IAAI,GAAG,CAAC;QAC3D,CAAC;QAED,0BAA0B;QAC1B,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;YACrE,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,aAAa,CAAC,KAAU;QAC9B,oDAAoD;QACpD,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;YACjB,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;YAC5B,OAAO,MAAM,KAAK,GAAG,IAAI,MAAM,IAAI,GAAG,CAAC;QACzC,CAAC;QAED,sCAAsC;QACtC,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,IAAI,KAAK,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YAChE,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,IAAI,CAAC,EAAU;QACrB,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;IACzD,CAAC;IAED,4CAA4C;IAC5C,QAAQ;QACN,MAAM,KAAK,GAAwB,EAAE,CAAC;QAEtC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YACjC,KAAK,CAAC,IAAI,CAAC,GAAG;gBACZ,WAAW,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM;gBAC/B,UAAU,EAAE,KAAK,CAAC,UAAU;gBAC5B,YAAY,EAAE,KAAK,CAAC,YAAY;gBAChC,iBAAiB,EAAE,KAAK,CAAC,iBAAiB;gBAC1C,WAAW,EAAE,KAAK,CAAC,WAAW;aAC/B,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,KAAK,EAAE,KAAK;YACZ,cAAc,EAAE,IAAI,CAAC,cAAc,CAAC,IAAI;YACxC,aAAa,EAAE,IAAI,CAAC,aAAa;SAClC,CAAC;IACJ,CAAC;CACF;AAjQD,kDAiQC;AAED,qEAAqE;AACxD,QAAA,iBAAiB,GAAG,mBAAmB,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC;AAE5E,6EAA6E;AAC7E,SAAgB,iBAAiB,CAAC,MAA6C;IAC7E,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAC/B,OAAO,mBAAmB,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;IAChD,CAAC;IACD,OAAO,IAAI,mBAAmB,CAAC,MAAM,CAAC,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { DiscoveredFeed } from './web-scrapers/rss-discovery';
|
|
3
|
+
/**
 * Zod object schema describing one candidate article.
 *
 * Required fields: url, title, publishedAt, guid, confidence, source and
 * extractionMethod. Optional fields: content, excerpt and metadata.
 * The parsed (output) and accepted (input) object types list the same fields.
 */
export declare const CandidateArticleSchema: z.ZodObject<
    {
        url: z.ZodString;
        title: z.ZodString;
        publishedAt: z.ZodDate;
        content: z.ZodOptional<z.ZodString>;
        excerpt: z.ZodOptional<z.ZodString>;
        guid: z.ZodString;
        confidence: z.ZodNumber;
        source: z.ZodEnum<["rss", "sitemap", "html", "discovery"]>;
        extractionMethod: z.ZodEnum<["rss", "sitemap", "html-links", "content-extraction"]>;
        metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
    },
    "strip",
    z.ZodTypeAny,
    // Parsed output type
    {
        url: string;
        title: string;
        publishedAt: Date;
        content?: string | undefined;
        excerpt?: string | undefined;
        guid: string;
        confidence: number;
        source: "rss" | "sitemap" | "html" | "discovery";
        extractionMethod: "rss" | "sitemap" | "html-links" | "content-extraction";
        metadata?: Record<string, any> | undefined;
    },
    // Accepted input type
    {
        url: string;
        title: string;
        publishedAt: Date;
        content?: string | undefined;
        excerpt?: string | undefined;
        guid: string;
        confidence: number;
        source: "rss" | "sitemap" | "html" | "discovery";
        extractionMethod: "rss" | "sitemap" | "html-links" | "content-extraction";
        metadata?: Record<string, any> | undefined;
    }
>;
|
|
37
|
+
/** Object type inferred (via `z.infer`) from CandidateArticleSchema. */
export type CandidateArticle = z.infer<typeof CandidateArticleSchema>;
|
|
38
|
+
/**
 * Zod object schema for the configuration of a source.
 *
 * Only `sourceType` is required. `scrapeConfig` is an optional nested object
 * with three optional groups: `selectors`, `filters` and `limits`.
 * The parsed (output) and accepted (input) object types list the same fields.
 */
export declare const SourceConfigSchema: z.ZodObject<
    {
        sourceType: z.ZodEnum<["rss", "sitemap", "html", "auto"]>;
        allowPaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
        denyPaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
        maxDepth: z.ZodOptional<z.ZodNumber>;
        detectOnly: z.ZodOptional<z.ZodBoolean>;
        scrapeConfig: z.ZodOptional<z.ZodObject<
            {
                selectors: z.ZodOptional<z.ZodObject<
                    {
                        articleLinks: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
                        titleSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
                        dateSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
                        excludeSelectors: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
                    },
                    "strip",
                    z.ZodTypeAny,
                    {
                        articleLinks?: string[] | undefined;
                        titleSelectors?: string[] | undefined;
                        dateSelectors?: string[] | undefined;
                        excludeSelectors?: string[] | undefined;
                    },
                    {
                        articleLinks?: string[] | undefined;
                        titleSelectors?: string[] | undefined;
                        dateSelectors?: string[] | undefined;
                        excludeSelectors?: string[] | undefined;
                    }
                >>;
                filters: z.ZodOptional<z.ZodObject<
                    {
                        minTitleLength: z.ZodOptional<z.ZodNumber>;
                        maxTitleLength: z.ZodOptional<z.ZodNumber>;
                        includePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
                        excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
                    },
                    "strip",
                    z.ZodTypeAny,
                    {
                        minTitleLength?: number | undefined;
                        maxTitleLength?: number | undefined;
                        includePatterns?: string[] | undefined;
                        excludePatterns?: string[] | undefined;
                    },
                    {
                        minTitleLength?: number | undefined;
                        maxTitleLength?: number | undefined;
                        includePatterns?: string[] | undefined;
                        excludePatterns?: string[] | undefined;
                    }
                >>;
                limits: z.ZodOptional<z.ZodObject<
                    {
                        maxLinksPerPage: z.ZodOptional<z.ZodNumber>;
                        maxPages: z.ZodOptional<z.ZodNumber>;
                    },
                    "strip",
                    z.ZodTypeAny,
                    {
                        maxLinksPerPage?: number | undefined;
                        maxPages?: number | undefined;
                    },
                    {
                        maxLinksPerPage?: number | undefined;
                        maxPages?: number | undefined;
                    }
                >>;
            },
            "strip",
            z.ZodTypeAny,
            // Parsed output type of scrapeConfig
            {
                selectors?: {
                    articleLinks?: string[] | undefined;
                    titleSelectors?: string[] | undefined;
                    dateSelectors?: string[] | undefined;
                    excludeSelectors?: string[] | undefined;
                } | undefined;
                filters?: {
                    minTitleLength?: number | undefined;
                    maxTitleLength?: number | undefined;
                    includePatterns?: string[] | undefined;
                    excludePatterns?: string[] | undefined;
                } | undefined;
                limits?: {
                    maxLinksPerPage?: number | undefined;
                    maxPages?: number | undefined;
                } | undefined;
            },
            // Accepted input type of scrapeConfig
            {
                selectors?: {
                    articleLinks?: string[] | undefined;
                    titleSelectors?: string[] | undefined;
                    dateSelectors?: string[] | undefined;
                    excludeSelectors?: string[] | undefined;
                } | undefined;
                filters?: {
                    minTitleLength?: number | undefined;
                    maxTitleLength?: number | undefined;
                    includePatterns?: string[] | undefined;
                    excludePatterns?: string[] | undefined;
                } | undefined;
                limits?: {
                    maxLinksPerPage?: number | undefined;
                    maxPages?: number | undefined;
                } | undefined;
            }
        >>;
    },
    "strip",
    z.ZodTypeAny,
    // Parsed output type
    {
        sourceType: "rss" | "sitemap" | "html" | "auto";
        allowPaths?: string[] | undefined;
        denyPaths?: string[] | undefined;
        maxDepth?: number | undefined;
        detectOnly?: boolean | undefined;
        scrapeConfig?: {
            selectors?: {
                articleLinks?: string[] | undefined;
                titleSelectors?: string[] | undefined;
                dateSelectors?: string[] | undefined;
                excludeSelectors?: string[] | undefined;
            } | undefined;
            filters?: {
                minTitleLength?: number | undefined;
                maxTitleLength?: number | undefined;
                includePatterns?: string[] | undefined;
                excludePatterns?: string[] | undefined;
            } | undefined;
            limits?: {
                maxLinksPerPage?: number | undefined;
                maxPages?: number | undefined;
            } | undefined;
        } | undefined;
    },
    // Accepted input type
    {
        sourceType: "rss" | "sitemap" | "html" | "auto";
        allowPaths?: string[] | undefined;
        denyPaths?: string[] | undefined;
        maxDepth?: number | undefined;
        detectOnly?: boolean | undefined;
        scrapeConfig?: {
            selectors?: {
                articleLinks?: string[] | undefined;
                titleSelectors?: string[] | undefined;
                dateSelectors?: string[] | undefined;
                excludeSelectors?: string[] | undefined;
            } | undefined;
            filters?: {
                minTitleLength?: number | undefined;
                maxTitleLength?: number | undefined;
                includePatterns?: string[] | undefined;
                excludePatterns?: string[] | undefined;
            } | undefined;
            limits?: {
                maxLinksPerPage?: number | undefined;
                maxPages?: number | undefined;
            } | undefined;
        } | undefined;
    }
>;
|
|
171
|
+
/**
 * Source configuration: the object type inferred from SourceConfigSchema,
 * extended with an optional `circuitBreaker` that is not part of the schema.
 */
export type SourceConfig = z.infer<typeof SourceConfigSchema> & {
    /**
     * Optional wrapper object exposing a single generic `execute` method that
     * takes an async operation and returns a promise of the same result type.
     */
    circuitBreaker?: {
        execute<T>(operation: () => Promise<T>): Promise<T>;
    };
};
|
|
176
|
+
/** Result object returned (as a promise) by `SourceOrchestrator.processSource`. */
export interface OrchestrationResult {
    /** Candidate articles produced for the source. */
    articles: Array<CandidateArticle>;
    /** What was detected about the source, plus extraction counters. */
    sourceInfo: {
        detectedType: 'rss' | 'sitemap' | 'html';
        discoveredFeeds?: Array<DiscoveredFeed>;
        discoveredSitemaps?: Array<string>;
        /** Counters: attempted, successful, failed and filtered. */
        extractionStats: {
            attempted: number;
            successful: number;
            failed: number;
            filtered: number;
        };
    };
    /** NOTE(review): unit is not visible in the declaration — presumably milliseconds; confirm against the implementation. */
    processingTime: number;
    /** Error messages collected during processing. */
    errors: Array<string>;
}
|
|
192
|
+
/**
 * Orchestrates content discovery for a source URL.
 *
 * Public surface: `processSource`, `enhanceWithFullContent`, `getSourceStats`
 * and the static `validateConfig`. All other members are private; their
 * signatures are not part of this declaration.
 */
export declare class SourceOrchestrator {
    private readonly maxArticlesPerSource;
    private readonly recentTimeframe;
    /**
     * Content section paths commonly used by sites, prioritized for
     * news/blog content.
     */
    private readonly contentSectionPaths;
    /**
     * Blog subdomains commonly used by companies that host their blog on a
     * separate subdomain; checked when a root domain is scraped.
     */
    private readonly blogSubdomains;
    /**
     * Derives path filters from the input URL.
     * Example: for anthropic.com/news, results are filtered to /news/* paths.
     */
    private inferPathFiltersFromUrl;
    /**
     * Finds content sections via the sitemap when the user enters a root
     * domain. Returns a prioritized list of the content paths found.
     */
    private discoverContentSectionsFromSitemap;
    /**
     * Restricts sitemap entries to content sections when a root domain is
     * processed. Also filters out non-English locales.
     */
    private filterToContentSections;
    /**
     * Looks for blog subdomains of a given domain.
     * Example: for nvidia.com, checks whether blogs.nvidia.com exists.
     */
    private discoverBlogSubdomains;
    /**
     * Main entry point: determines the source type and extracts content.
     */
    processSource(url: string, config?: SourceConfig): Promise<OrchestrationResult>;
    /**
     * Detects the source type automatically and processes accordingly.
     */
    private autoDetectAndProcess;
    /**
     * Processes a source whose type is already known.
     */
    private processKnownType;
    /**
     * Processes the URL as an RSS feed.
     */
    private processAsRSS;
    /**
     * Processes the URL as a sitemap.
     */
    private processAsSitemap;
    /**
     * Processes the URL as an HTML page.
     */
    private processAsHTML;
    /**
     * Processes the URL with Playwright, for JavaScript-rendered pages.
     * Serves as the fallback when static HTML scraping fails.
     */
    private processAsPlaywright;
    /**
     * Applies path filtering based on allowPaths and denyPaths.
     * Non-English locale paths are filtered out as well.
     *
     * @param articles - Articles to filter
     * @param config - Source configuration
     * @param options - Filtering options
     * @param options.skipAllowFilters - Skip allow path filtering (useful for RSS which is already curated)
     */
    private applyPathFilters;
    /**
     * Tests whether a path matches a pattern (wildcards are supported).
     */
    private matchesPattern;
    /**
     * Builds the scraping configuration from the source config.
     */
    private buildScrapingConfig;
    /**
     * Fallback: derives a title from the URL.
     */
    private extractTitleFromUrl;
    /**
     * Creates a consistent GUID for an article.
     */
    private createGuid;
    /**
     * Finalizes the processing result.
     */
    private finalizeResult;
    /**
     * Optional enhancement step: extracts full content for articles.
     */
    enhanceWithFullContent(
        articles: CandidateArticle[],
        maxArticles?: number,
        options?: {
            concurrency?: number;
            onProgress?: (completed: number, total: number) => void;
        }
    ): Promise<CandidateArticle[]>;
    /**
     * Validates an orchestrator configuration and returns it as a SourceConfig.
     */
    static validateConfig(config: any): SourceConfig;
    /**
     * Gathers statistics about a source.
     */
    getSourceStats(url: string): Promise<{
        robotsCompliant: boolean;
        hasRSSFeed: boolean;
        hasSitemap: boolean;
        detectedType: string;
        estimatedArticleCount: number;
    }>;
}
|
|
305
|
+
export declare const globalSourceOrchestrator: SourceOrchestrator;
|
|
306
|
+
//# sourceMappingURL=source-orchestrator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"source-orchestrator.d.ts","sourceRoot":"","sources":["../../lib/source-orchestrator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,EAAsB,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAelF,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAWjC,CAAC;AAEH,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAEtE,eAAO,MAAM,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAwB7B,CAAC;AAEH,MAAM,MAAM,YAAY,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,GAAG;IAC9D,cAAc,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAA;KAAE,CAAC;CAC1E,CAAC;AAEF,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,gBAAgB,EAAE,CAAC;IAC7B,UAAU,EAAE;QACV,YAAY,EAAE,KAAK,GAAG,SAAS,GAAG,MAAM,CAAC;QACzC,eAAe,CAAC,EAAE,cAAc,EAAE,CAAC;QACnC,kBAAkB,CAAC,EAAE,MAAM,EAAE,CAAC;QAC9B,eAAe,EAAE;YACf,SAAS,EAAE,MAAM,CAAC;YAClB,UAAU,EAAE,MAAM,CAAC;YACnB,MAAM,EAAE,MAAM,CAAC;YACf,QAAQ,EAAE,MAAM,CAAC;SAClB,CAAC;KACH,CAAC;IACF,cAAc,EAAE,MAAM,CAAC;IACvB,MAAM,EAAE,MAAM,EAAE,CAAC;CAClB;AAED,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAQ;IAC7C,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAuB;IAEvD;;OAEG;IACH,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAIlC;IAEF;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,cAAc,CAG7B;IAEF;;;OAGG;IACH,OAAO,CAAC,uBAAuB;IAqC/B;;;OAGG;IACH,OAAO,CAAC,kCAAkC;IAmC1C;;;OAGG;IACH,OAAO,CAAC,uBAAuB;IA4B/B;;;OAGG;YACW,sBAAsB;IAsCpC;;OAEG;IACG,aAAa,CACjB,GAAG,EAAE,MAAM,EACX,MAAM,GAAE,YAAqC,GAC5C,OAAO,CAAC,mBAAmB,CAAC;IA0C/B;;OAEG;YACW,oBAAoB;IAqLlC;;OAEG;YACW,gBAAgB;IAsC9B;;OAEG;YACW,YAAY;IAmC1B;;OAEG;YACW,gBAAgB;IAuC9B;;OAEG;YACW,aAAa;IAoC3B;;;OAGG;YACW,mBAAmB;IAwCjC;;;;;;;;OAQG;IACH,OAAO,CAAC,gBAAgB;IAiDxB;;OAEG;IACH,OAAO,CAAC,cAAc;IAwBtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA4B3B;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAiB3B;;OAEG;IACH,OAAO,CAAC,UAAU;IAIlB;;OAEG;IACH,OAAO,CAAC,cAAc;IA0BtB;;OAEG;IACG,sBAAsB,CAC1B,QAAQ,EAAE,gBAAgB,EAAE,EAC5B,WAAW,GAAE,MAAW,EACxB,OAAO,GAAE;QACP,WAAW,CAAC,EAAE,MAAM,CAAC;
QACrB,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;KACpD,GACL,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAgD9B;;OAEG;IACH,MAAM,CAAC,cAAc,CAAC,MAAM,EAAE,GAAG,GAAG,YAAY;IAWhD;;OAEG;IACG,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;QACzC,eAAe,EAAE,OAAO,CAAC;QACzB,UAAU,EAAE,OAAO,CAAC;QACpB,UAAU,EAAE,OAAO,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,qBAAqB,EAAE,MAAM,CAAC;KAC/B,CAAC;CA4BH;AAGD,eAAO,MAAM,wBAAwB,oBAA2B,CAAC"}
|