@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted interop helper: exposes property `k` of module object `m`
// on `o` under the name `k2` (defaults to `k`). Reuses an already-installed
// helper from `this` when one exists.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        // Decide whether the source descriptor can be copied verbatim or must
        // be replaced by a forwarding getter.
        var useGetter;
        if (!descriptor) {
            useGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessor property: only reused as-is when `m` is flagged __esModule.
            useGetter = !m.__esModule;
        }
        else {
            // Data property: only reused as-is when it is neither writable nor configurable.
            useGetter = descriptor.writable || descriptor.configurable;
        }
        if (useGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return m[k];
                }
            };
        }
        Object.defineProperty(o, exportName, descriptor);
    }
    // Fallback used when Object.create is unavailable: plain value copy.
    : function (o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        o[exportName] = m[k];
    });
|
|
13
|
+
// TypeScript-emitted interop helper: stores `v` as the "default" member of `o`.
// Reuses an already-installed helper from `this` when one exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (o, v) {
        var defaultDescriptor = { enumerable: true, value: v };
        Object.defineProperty(o, "default", defaultDescriptor);
    }
    // Fallback used when Object.create is unavailable: plain assignment.
    : function (o, v) {
        o["default"] = v;
    });
|
|
18
|
+
// TypeScript-emitted interop helper backing `import * as ns from "..."`.
// Modules flagged __esModule are returned untouched; anything else is wrapped
// in a fresh object that re-exposes every own property except "default" and
// carries the original module value as its "default" member.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use: Object.getOwnPropertyNames when present,
    // otherwise a for-in scan restricted to own properties.
    var listOwnKeys = null;
    function ownKeys(o) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || function (o) {
                var collected = [];
                for (var name in o) {
                    if (Object.prototype.hasOwnProperty.call(o, name)) {
                        collected[collected.length] = name;
                    }
                }
                return collected;
            };
        }
        return listOwnKeys(o);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var result = {};
        if (mod != null) {
            var names = ownKeys(mod);
            for (var i = 0; i < names.length; i++) {
                if (names[i] !== "default") {
                    __createBinding(result, mod, names[i]);
                }
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
35
|
+
// TypeScript-emitted interop helper backing `import x from "..."`.
// Modules flagged __esModule are returned as-is; any other value is wrapped
// so that it is reachable as `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.globalSitemapParser = exports.SitemapParser = void 0;
|
|
40
|
+
const cheerio = __importStar(require("cheerio"));
|
|
41
|
+
const p_limit_1 = __importDefault(require("p-limit"));
|
|
42
|
+
const scraping_rate_limiter_1 = require("../scraping-rate-limiter");
|
|
43
|
+
const robots_checker_1 = require("./robots-checker");
|
|
44
|
+
/**
 * Discovers, downloads, parses and validates XML sitemaps.
 *
 * Network access goes through the shared rate limiter and `parseSitemap`
 * consults the shared robots.txt checker first. Public methods are
 * best-effort: failures are logged and reported as an empty result
 * instead of being thrown.
 */
class SitemapParser {
    constructor() {
        this.userAgent = 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)';
        this.timeout = 15000; // 15 seconds for sitemaps
        this.maxSitemapSize = 50 * 1024 * 1024; // 50MB max
        this.maxEntries = 50000; // Max entries per sitemap
        this.recentTimeframe = 48 * 60 * 60 * 1000; // 48 hours in ms
    }
    /**
     * Parse sitemap from URL and return entries.
     *
     * @param {string} url - Absolute URL of a sitemap or sitemap index.
     * @param {object} [options]
     * @param {boolean} [options.filterRecent] - Drop entries whose lastmod is older than the recency window.
     * @param {number} [options.recentWindowMs] - Recency window in ms (defaults to `this.recentTimeframe`).
     * @param {number} [options.maxEntries] - Maximum number of entries to return.
     * @param {boolean} [options.includeImages] - Also collect `image:image` children.
     * @param {boolean} [options.includeNews] - Also collect `news:news` children.
     * @returns {Promise<object[]>} Parsed entries; `[]` when blocked, unreachable or on error.
     */
    async parseSitemap(url, options = {}) {
        console.log(`🗺️ [Sitemap] Starting to parse ${url}`);
        try {
            // Check robots.txt compliance
            const robotsCheck = await robots_checker_1.globalRobotsChecker.isAllowed(url);
            if (!robotsCheck.allowed) {
                console.warn(`🤖 [Sitemap] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
                return [];
            }
            const xml = await this.fetchSitemap(url);
            if (!xml) {
                return [];
            }
            // Detect if this is a sitemap index or regular sitemap
            if (this.isSitemapIndex(xml)) {
                return await this.parseSitemapIndex(xml, options);
            }
            return this.parseRegularSitemap(xml, options);
        }
        catch (error) {
            console.error(`❌ [Sitemap] Error parsing sitemap ${url}:`, error);
            return [];
        }
    }
    /**
     * Discover sitemaps from domain.
     *
     * Combines the sitemaps declared in robots.txt with a probe of well-known
     * sitemap paths on `https://<domain>`.
     *
     * @param {string} domain - Host name without scheme.
     * @returns {Promise<string[]>} Unique sitemap URLs (robots.txt entries first); `[]` on error.
     */
    async discoverSitemaps(domain) {
        try {
            // First, check robots.txt for sitemap declarations
            const robotsSitemaps = await robots_checker_1.globalRobotsChecker.getSitemaps(domain);
            // A Set keeps insertion order and de-duplicates as we go, so the
            // logged count matches what is actually returned.
            const found = new Set(robotsSitemaps);
            // Try common sitemap paths
            const commonPaths = [
                '/sitemap.xml',
                '/sitemap_index.xml',
                '/sitemaps.xml',
                '/sitemap/',
                '/news-sitemap.xml'
            ];
            for (const path of commonPaths) {
                const sitemapUrl = `https://${domain}${path}`;
                // Skip if already found in robots.txt
                if (found.has(sitemapUrl)) {
                    continue;
                }
                if (await this.checkSitemapExists(sitemapUrl)) {
                    found.add(sitemapUrl);
                }
            }
            const sitemaps = Array.from(found);
            console.log(`🗺️ [Sitemap] Discovered ${sitemaps.length} sitemaps for ${domain}`);
            return sitemaps;
        }
        catch (error) {
            console.error(`❌ [Sitemap] Error discovering sitemaps for ${domain}:`, error);
            return [];
        }
    }
    /**
     * Get recent entries from all sitemaps for a domain.
     *
     * @param {string} domain - Host name without scheme.
     * @param {object} [options]
     * @param {number} [options.hoursBack=48] - Only keep entries modified within this many hours.
     * @param {number} [options.maxEntries=1000] - Maximum number of entries to return.
     * @returns {Promise<object[]>} Entries with a lastmod inside the window, newest first.
     */
    async getRecentEntries(domain, options = {}) {
        const hoursBack = positiveNumberOr(options.hoursBack, 48);
        const maxEntries = positiveNumberOr(options.maxEntries, 1000);
        const recentWindowMs = hoursBack * 60 * 60 * 1000;
        const sitemaps = await this.discoverSitemaps(domain);
        // Distribute quota. Never hand out 0: a zero quota is treated as
        // "not set" downstream and would expand to the 50,000 default.
        const perSitemapQuota = splitQuota(maxEntries, sitemaps.length);
        const allEntries = [];
        for (const sitemapUrl of sitemaps) {
            try {
                const entries = await this.parseSitemap(sitemapUrl, {
                    filterRecent: true,
                    // Parse-time filter must use the caller's window; otherwise
                    // hoursBack > 48 would be silently capped by the default.
                    recentWindowMs,
                    maxEntries: perSitemapQuota,
                    includeNews: true
                });
                allEntries.push(...entries);
            }
            catch (error) {
                console.warn(`⚠️ [Sitemap] Error parsing ${sitemapUrl}:`, error);
                continue;
            }
        }
        // Filter by time and sort by lastmod
        const cutoffTime = new Date(Date.now() - recentWindowMs);
        const recentEntries = allEntries
            .filter(entry => entry.lastmod && entry.lastmod >= cutoffTime)
            .sort((a, b) => b.lastmod.getTime() - a.lastmod.getTime())
            .slice(0, maxEntries);
        console.log(`🗺️ [Sitemap] Found ${recentEntries.length} recent entries from ${domain}`);
        return recentEntries;
    }
    /**
     * Download a sitemap document through the shared rate limiter.
     *
     * @param {string} url - Sitemap URL.
     * @returns {Promise<string|null>} The response body, or `null` on any failure
     *   (HTTP error, timeout, oversized document).
     */
    async fetchSitemap(url) {
        try {
            return await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
                const controller = new AbortController();
                const timeoutId = setTimeout(() => controller.abort(), this.timeout);
                try {
                    const response = await fetch(url, {
                        headers: {
                            'User-Agent': this.userAgent,
                            'Accept': 'application/xml, text/xml, */*',
                        },
                        signal: controller.signal,
                    });
                    if (!response.ok) {
                        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
                    }
                    const contentLength = response.headers.get('content-length');
                    if (contentLength && Number.parseInt(contentLength, 10) > this.maxSitemapSize) {
                        throw new Error(`Sitemap too large: ${contentLength} bytes`);
                    }
                    // The timer is still armed here on purpose: the body read
                    // must be covered by the timeout too, or a stalled
                    // download would never settle.
                    const xml = await response.text();
                    if (xml.length > this.maxSitemapSize) {
                        throw new Error(`Sitemap too large: ${xml.length} bytes`);
                    }
                    return xml;
                }
                finally {
                    clearTimeout(timeoutId);
                }
            });
        }
        catch (error) {
            console.error(`❌ [Sitemap] Error fetching ${url}:`, error);
            return null;
        }
    }
    /**
     * Probe a URL with a HEAD request (5 second timeout).
     *
     * @param {string} url - Candidate sitemap URL.
     * @returns {Promise<boolean>} `true` only when the server answers with a 2xx status.
     */
    async checkSitemapExists(url) {
        try {
            return await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
                const controller = new AbortController();
                const timeoutId = setTimeout(() => controller.abort(), 5000);
                try {
                    const response = await fetch(url, {
                        method: 'HEAD',
                        headers: { 'User-Agent': this.userAgent },
                        signal: controller.signal,
                    });
                    return response.ok;
                }
                catch (error) {
                    // Unreachable or timed out: treat as "does not exist".
                    return false;
                }
                finally {
                    clearTimeout(timeoutId);
                }
            });
        }
        catch (error) {
            return false;
        }
    }
    /**
     * Cheap textual check for a sitemap index document.
     *
     * @param {string} xml - Raw document text.
     * @returns {boolean} `true` when the text contains a `sitemapindex` tag.
     */
    isSitemapIndex(xml) {
        return xml.includes('<sitemapindex') || xml.includes('</sitemapindex>');
    }
    /**
     * Parse a sitemap index: fetch up to 10 child sitemaps (English locales
     * first) with a concurrency of 5 and merge their entries.
     *
     * @param {string} xml - Raw sitemap index document.
     * @param {object} [options] - Same options as `parseSitemap`; `maxEntries` is the total budget.
     * @returns {Promise<object[]>} Merged entries, at most the total budget.
     */
    async parseSitemapIndex(xml, options = {}) {
        console.log(`🗺️ [Sitemap] Parsing sitemap index`);
        const $ = cheerio.load(xml, { xmlMode: true });
        const sitemaps = [];
        // Extract sitemap URLs from index
        $('sitemap').each((_, element) => {
            const loc = $(element).find('loc').first().text().trim();
            if (loc) {
                sitemaps.push(loc);
            }
        });
        console.log(`🗺️ [Sitemap] Found ${sitemaps.length} sitemaps in index`);
        // Prioritize US English sitemaps, then other English, then rest.
        // Array.prototype.sort is stable, so document order is kept inside each group.
        const otherEnglishLocale = /\/en[-_][a-z]{2}\//;
        const localeRank = (sitemapUrl) => {
            const lower = sitemapUrl.toLowerCase();
            if (lower.includes('/en-us/') || lower.includes('/en_us/')) {
                return 0;
            }
            return otherEnglishLocale.test(lower) ? 1 : 2;
        };
        const prioritizedSitemaps = sitemaps.sort((a, b) => localeRank(a) - localeRank(b));
        // Parse each individual sitemap in PARALLEL (prioritize en-us, limit to 10)
        const sitemapsToProcess = prioritizedSitemaps.slice(0, 10);
        const totalQuota = positiveNumberOr(options.maxEntries, this.maxEntries);
        // At least 1 per sitemap: a quota of 0 would be read as "unset" by
        // parseRegularSitemap and balloon to the 50,000 default.
        const entriesPerSitemap = splitQuota(totalQuota, sitemapsToProcess.length);
        // Parallel fetching with concurrency limit of 5
        const limit = (0, p_limit_1.default)(5);
        console.log(`🗺️ [Sitemap] Fetching ${sitemapsToProcess.length} sitemaps in parallel (concurrency: 5)`);
        const results = await Promise.allSettled(sitemapsToProcess.map(sitemapUrl => limit(async () => {
            const sitemapXml = await this.fetchSitemap(sitemapUrl);
            if (!sitemapXml) {
                return [];
            }
            return this.parseRegularSitemap(sitemapXml, {
                ...options,
                maxEntries: entriesPerSitemap
            });
        })));
        // Collect successful results
        const allEntries = [];
        for (const result of results) {
            if (result.status === 'fulfilled') {
                allEntries.push(...result.value);
            }
            else {
                console.warn(`⚠️ [Sitemap] Error parsing sitemap:`, result.reason);
            }
        }
        console.log(`🗺️ [Sitemap] Parallel fetch complete: ${allEntries.length} entries from ${sitemapsToProcess.length} sitemaps`);
        // The 1-entry minimum can overshoot a very small budget; trim back.
        return allEntries.length > totalQuota ? allEntries.slice(0, totalQuota) : allEntries;
    }
    /**
     * Parse a `<urlset>` document into entry objects.
     *
     * Each entry has `url` and, when present and valid, `lastmod` (Date),
     * `changefreq`, `priority` (number), `images` and `news`.
     * With `filterRecent`, entries whose lastmod is older than the window are
     * skipped; entries without a lastmod are kept.
     *
     * @param {string} xml - Raw sitemap document.
     * @param {object} [options] - See `parseSitemap`.
     * @returns {object[]} Parsed entries, at most `maxEntries`.
     */
    parseRegularSitemap(xml, options = {}) {
        console.log(`🗺️ [Sitemap] Parsing regular sitemap`);
        const $ = cheerio.load(xml, { xmlMode: true });
        const entries = [];
        const maxEntries = positiveNumberOr(options.maxEntries, this.maxEntries);
        const recentWindowMs = positiveNumberOr(options.recentWindowMs, this.recentTimeframe);
        const cutoffTime = options.filterRecent
            ? new Date(Date.now() - recentWindowMs)
            : null;
        // Text of the first matching descendant, trimmed ('' when absent).
        const firstText = ($scope, selector) => $scope.find(selector).first().text().trim();
        $('url').each((index, element) => {
            if (entries.length >= maxEntries) {
                return false; // Break the loop
            }
            const $element = $(element);
            const loc = firstText($element, 'loc');
            if (!loc) {
                return;
            }
            const entry = { url: loc };
            // Parse lastmod
            const lastmodText = firstText($element, 'lastmod');
            if (lastmodText) {
                const lastmod = new Date(lastmodText);
                if (!Number.isNaN(lastmod.getTime())) {
                    entry.lastmod = lastmod;
                }
            }
            // Filter by recency if requested
            if (cutoffTime && entry.lastmod && entry.lastmod < cutoffTime) {
                return; // Skip this entry
            }
            // Parse changefreq
            const changefreq = firstText($element, 'changefreq');
            if (changefreq) {
                entry.changefreq = changefreq;
            }
            // Parse priority
            const priorityText = firstText($element, 'priority');
            if (priorityText) {
                const priority = parseFloat(priorityText);
                if (!Number.isNaN(priority)) {
                    entry.priority = priority;
                }
            }
            // Parse images if requested
            if (options.includeImages) {
                const images = [];
                $element.find('image\\:image').each((_, imgElement) => {
                    const $img = $(imgElement);
                    const imgLoc = firstText($img, 'image\\:loc');
                    if (imgLoc) {
                        images.push({
                            loc: imgLoc,
                            caption: firstText($img, 'image\\:caption') || undefined,
                            title: firstText($img, 'image\\:title') || undefined,
                        });
                    }
                });
                if (images.length > 0) {
                    entry.images = images;
                }
            }
            // Parse news if requested
            if (options.includeNews) {
                const $news = $element.find('news\\:news');
                const title = $news.length > 0 ? firstText($news, 'news\\:title') : '';
                if (title) {
                    entry.news = { title };
                    const pubDateText = firstText($news, 'news\\:publication_date');
                    if (pubDateText) {
                        const pubDate = new Date(pubDateText);
                        if (!Number.isNaN(pubDate.getTime())) {
                            entry.news.publishedDate = pubDate;
                        }
                    }
                    const keywords = firstText($news, 'news\\:keywords');
                    if (keywords) {
                        // Drop empty items produced by trailing or doubled commas.
                        entry.news.keywords = keywords
                            .split(',')
                            .map(k => k.trim())
                            .filter(Boolean);
                    }
                }
            }
            entries.push(entry);
        });
        console.log(`🗺️ [Sitemap] Parsed ${entries.length} entries from sitemap`);
        return entries;
    }
    /**
     * Validate sitemap format.
     *
     * Checks for a `<urlset>`/`<sitemapindex>` root, the 50,000 URL limit and,
     * per `<url>`, a parseable `<loc>`, a parseable `<lastmod>` and a
     * `<priority>` within 0-1.
     *
     * @param {string} xml - Raw sitemap document.
     * @returns {{valid: boolean, errors: string[]}} `valid` is true when no problems were found.
     */
    validateSitemapFormat(xml) {
        const errors = [];
        try {
            const $ = cheerio.load(xml, { xmlMode: true });
            // Check for root element
            const hasUrlset = $('urlset').length > 0;
            const hasSitemapIndex = $('sitemapindex').length > 0;
            if (!hasUrlset && !hasSitemapIndex) {
                errors.push('Missing required root element: <urlset> or <sitemapindex>');
            }
            // Check URL count (for regular sitemaps)
            if (hasUrlset) {
                const urlCount = $('url').length;
                if (urlCount > 50000) {
                    errors.push(`Too many URLs: ${urlCount} (max: 50,000)`);
                }
            }
            // Validate URL entries
            $('url').each((index, element) => {
                const $element = $(element);
                const loc = $element.find('loc').first().text().trim();
                if (!loc) {
                    errors.push(`URL entry ${index + 1} missing <loc> element`);
                }
                else {
                    try {
                        new URL(loc);
                    }
                    catch {
                        errors.push(`Invalid URL in entry ${index + 1}: ${loc}`);
                    }
                }
                // Validate lastmod format
                const lastmod = $element.find('lastmod').first().text().trim();
                if (lastmod) {
                    const date = new Date(lastmod);
                    if (Number.isNaN(date.getTime())) {
                        errors.push(`Invalid lastmod date in entry ${index + 1}: ${lastmod}`);
                    }
                }
                // Validate priority
                const priority = $element.find('priority').first().text().trim();
                if (priority) {
                    const priorityNum = parseFloat(priority);
                    if (Number.isNaN(priorityNum) || priorityNum < 0 || priorityNum > 1) {
                        errors.push(`Invalid priority in entry ${index + 1}: ${priority} (must be 0-1)`);
                    }
                }
            });
        }
        catch (error) {
            errors.push(`XML parsing error: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        return {
            valid: errors.length === 0,
            errors
        };
    }
}
// Returns `value` as a number when it is finite and > 0, otherwise `fallback`
// (so undefined, null, 0, NaN and negatives all select the fallback).
function positiveNumberOr(value, fallback) {
    const parsed = Number(value);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}
// Splits a total entry budget evenly across `parts` sources, never returning
// less than 1; with no sources the whole budget is returned unchanged.
function splitQuota(total, parts) {
    if (!(parts > 0)) {
        return total;
    }
    return Math.max(1, Math.floor(total / parts));
}
|
|
427
|
+
exports.SitemapParser = SitemapParser;
|
|
428
|
+
// Default global instance
|
|
429
|
+
exports.globalSitemapParser = new SitemapParser();
|
|
430
|
+
//# sourceMappingURL=sitemap-parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap-parser.js","sourceRoot":"","sources":["../../../lib/web-scrapers/sitemap-parser.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iDAAmC;AACnC,sDAA6B;AAC7B,oEAA6D;AAC7D,qDAAuD;AA8BvD,MAAa,aAAa;IAA1B;QACmB,cAAS,GAAG,6EAA6E,CAAC;QAC1F,YAAO,GAAG,KAAK,CAAC,CAAC,0BAA0B;QAC3C,mBAAc,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,WAAW;QAC9C,eAAU,GAAG,KAAK,CAAC,CAAC,0BAA0B;QAC9C,oBAAe,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,iBAAiB;IAuc3E,CAAC;IArcC;;OAEG;IACH,KAAK,CAAC,YAAY,CAChB,GAAW,EACX,UAKI,EAAE;QAEN,OAAO,CAAC,GAAG,CAAC,mCAAmC,GAAG,EAAE,CAAC,CAAC;QAEtD,IAAI,CAAC;YACH,8BAA8B;YAC9B,MAAM,WAAW,GAAG,MAAM,oCAAmB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC7D,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,IAAI,CAAC,2CAA2C,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;gBACvF,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YACzC,IAAI,CAAC,GAAG,EAAE,CAAC;gBACT,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,uDAAuD;YACvD,IAAI,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC7B,OAAO,MAAM,IAAI,CAAC,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;YACpD,CAAC;iBAAM,CAAC;gBACN,OAAO,IAAI,CAAC,mBAAmB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;YAChD,CAAC;QAEH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,qCAAqC,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YAClE,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CAAC,MAAc;QACnC,MAAM,QAAQ,GAAa,EAAE,CAAC;QAE9B,IAAI,CAAC;YACH,mDAAmD;YACnD,MAAM,cAAc,GAAG,MAAM,oCAAmB,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;YACrE,QAAQ,CAAC,IAAI,CAAC,GAAG,cAAc,CAAC,CAAC;YAEjC,2BAA2B;YAC3B,MAAM,WAAW,GAAG;gBAClB,cAAc;gBACd,oBAAoB;gBACpB,eAAe;gBACf,WAAW;gBACX,mBAAmB;aACpB,CAAC;YAEF,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;gBAC/B,MAAM,UAAU,GAAG,WAAW,MAAM,GAAG,IAAI,EAAE,CAAC;gBAE9C,sCAAsC;gBACtC,IAAI,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;oBAClC,SAAS;gBACX,CAAC;gBAED,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,UAAU,CAAC,CAAC;gBACzD,IAAI,MAAM,EAAE,CAAC;oBACX,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBAC5B,CAAC;YACH,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,4BAA4B
,QAAQ,CAAC,MAAM,iBAAiB,MAAM,EAAE,CAAC,CAAC;YAClF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,oBAAoB;QAE5D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,8CAA8C,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;YAC9E,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CACpB,MAAc,EACd,UAAuD,EAAE;QAEzD,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC;QAE9C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,UAAU,GAAmB,EAAE,CAAC;QAEtC,KAAK,MAAM,UAAU,IAAI,QAAQ,EAAE,CAAC;YAClC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,UAAU,EAAE;oBAClD,YAAY,EAAE,IAAI;oBAClB,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,EAAE,mBAAmB;oBACzE,WAAW,EAAE,IAAI;iBAClB,CAAC,CAAC;gBACH,UAAU,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC;YAC9B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,8BAA8B,UAAU,GAAG,EAAE,KAAK,CAAC,CAAC;gBACjE,SAAS;YACX,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,MAAM,UAAU,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,SAAS,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC;QACvE,MAAM,aAAa,GAAG,UAAU;aAC7B,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,IAAI,KAAK,CAAC,OAAO,IAAI,UAAU,CAAC;aAC7D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACb,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,OAAO;gBAAE,OAAO,CAAC,CAAC;YACvC,OAAO,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;QACnD,CAAC,CAAC;aACD,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;QAExB,OAAO,CAAC,GAAG,CAAC,uBAAuB,aAAa,CAAC,MAAM,wBAAwB,MAAM,EAAE,CAAC,CAAC;QACzF,OAAO,aAAa,CAAC;IACvB,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,GAAW;QACpC,IAAI,CAAC;YACH,OAAO,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAErE,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAChC,OAAO,EAAE;4BACP,YAAY,EAAE,IAAI,CAAC,SAAS;4BAC5B,QAAQ,EAAE,gCAAgC;yBAC3C;wBACD,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBAEH,YAAY
,CAAC,SAAS,CAAC,CAAC;oBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;wBACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;oBACrE,CAAC;oBAED,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;oBAC7D,IAAI,aAAa,IAAI,QAAQ,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;wBACnE,MAAM,IAAI,KAAK,CAAC,sBAAsB,aAAa,QAAQ,CAAC,CAAC;oBAC/D,CAAC;oBAED,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;oBAElC,IAAI,GAAG,CAAC,MAAM,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;wBACrC,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,CAAC,MAAM,QAAQ,CAAC,CAAC;oBAC5D,CAAC;oBAED,OAAO,GAAG,CAAC;gBAEb,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,8BAA8B,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YAC3D,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,kBAAkB,CAAC,GAAW;QAC1C,IAAI,CAAC;YACH,OAAO,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBAE7D,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAChC,MAAM,EAAE,MAAM;wBACd,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;wBACzC,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,OAAO,QAAQ,CAAC,EAAE,CAAC;gBAErB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,OAAO,KAAK,CAAC;gBACf,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAEO,cAAc,CAAC,GAAW;QAChC,OAAO,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAC1E,CAAC;IAEO,KAAK,CAAC,iBAAiB,CAC7B,GAAW,EACX,OAAY;QAEZ,OAAO,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC;QAEnD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAmB,EAAE,CAAC;QAEtC,kCAAkC;QAClC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;YAC/B,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YAC5B,MAAM,
GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACvD,IAAI,GAAG,EAAE,CAAC;gBACR,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,uBAAuB,QAAQ,CAAC,MAAM,oBAAoB,CAAC,CAAC;QAExE,gEAAgE;QAChE,MAAM,mBAAmB,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACjD,MAAM,MAAM,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC;YAE/B,sBAAsB;YACtB,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;YACzE,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;YACzE,IAAI,OAAO,IAAI,CAAC,OAAO;gBAAE,OAAO,CAAC,CAAC,CAAC;YACnC,IAAI,OAAO,IAAI,CAAC,OAAO;gBAAE,OAAO,CAAC,CAAC;YAElC,6BAA6B;YAC7B,MAAM,UAAU,GAAG,qBAAqB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACtD,MAAM,UAAU,GAAG,qBAAqB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACtD,IAAI,UAAU,IAAI,CAAC,UAAU;gBAAE,OAAO,CAAC,CAAC,CAAC;YACzC,IAAI,UAAU,IAAI,CAAC,UAAU;gBAAE,OAAO,CAAC,CAAC;YAExC,OAAO,CAAC,CAAC;QACX,CAAC,CAAC,CAAC;QAEH,4EAA4E;QAC5E,MAAM,iBAAiB,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC3D,MAAM,iBAAiB,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC;QAE9G,gDAAgD;QAChD,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,CAAC,CAAC,CAAC;QACxB,OAAO,CAAC,GAAG,CAAC,0BAA0B,iBAAiB,CAAC,MAAM,wCAAwC,CAAC,CAAC;QAExG,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CACtC,iBAAiB,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CACjC,KAAK,CAAC,KAAK,IAAI,EAAE;YACf,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;YACvD,IAAI,UAAU,EAAE,CAAC;gBACf,OAAO,IAAI,CAAC,mBAAmB,CAAC,UAAU,EAAE;oBAC1C,GAAG,OAAO;oBACV,UAAU,EAAE,iBAAiB;iBAC9B,CAAC,CAAC;YACL,CAAC;YACD,OAAO,EAAE,CAAC;QACZ,CAAC,CAAC,CACH,CACF,CAAC;QAEF,6BAA6B;QAC7B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,IAAI,MAAM,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;gBAClC,UAAU,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,qCAAqC,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YACrE,CAAC;QACH,CA
AC;QAED,OAAO,CAAC,GAAG,CAAC,0CAA0C,UAAU,CAAC,MAAM,iBAAiB,iBAAiB,CAAC,MAAM,WAAW,CAAC,CAAC;QAC7H,OAAO,UAAU,CAAC;IACpB,CAAC;IAEO,mBAAmB,CACzB,GAAW,EACX,OAKC;QAED,OAAO,CAAC,GAAG,CAAC,uCAAuC,CAAC,CAAC;QAErD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAmB,EAAE,CAAC;QACnC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC;QACzD,MAAM,UAAU,GAAG,OAAO,CAAC,YAAY;YACrC,CAAC,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC;YAC7C,CAAC,CAAC,IAAI,CAAC;QAET,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;YAC/B,IAAI,OAAO,CAAC,MAAM,IAAI,UAAU,EAAE,CAAC;gBACjC,OAAO,KAAK,CAAC,CAAC,iBAAiB;YACjC,CAAC;YAED,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YAC5B,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAEvD,IAAI,CAAC,GAAG;gBAAE,OAAO;YAEjB,MAAM,KAAK,GAAiB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;YAEzC,gBAAgB;YAChB,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACnE,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,OAAO,GAAG,IAAI,IAAI,CAAC,WAAW,CAAC,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;oBAC9B,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;gBAC1B,CAAC;YACH,CAAC;YAED,iCAAiC;YACjC,IAAI,UAAU,IAAI,KAAK,CAAC,OAAO,IAAI,KAAK,CAAC,OAAO,GAAG,UAAU,EAAE,CAAC;gBAC9D,OAAO,CAAC,kBAAkB;YAC5B,CAAC;YAED,mBAAmB;YACnB,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACrE,IAAI,UAAU,EAAE,CAAC;gBACf,KAAK,CAAC,UAAU,GAAG,UAAiB,CAAC;YACvC,CAAC;YAED,iBAAiB;YACjB,MAAM,YAAY,GAAG,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACrE,IAAI,YAAY,EAAE,CAAC;gBACjB,MAAM,QAAQ,GAAG,UAAU,CAAC,YAAY,CAAC,CAAC;gBAC1C,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC;oBACrB,KAAK,CAAC,QAAQ,GAAG,QAAQ,CAAC;gBAC5B,CAAC;YACH,CAAC;YAED,4BAA4B;YAC5B,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC1B,MAAM,MAAM,GAAmB,EAAE,CAAC;gBAClC,QAAQ,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,EAAE;oBACpD,MAAM,IAAI,G
AAG,CAAC,CAAC,UAAU,CAAC,CAAC;oBAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;oBAC9D,IAAI,MAAM,EAAE,CAAC;wBACX,MAAM,CAAC,IAAI,CAAC;4BACV,GAAG,EAAE,MAAM;4BACX,OAAO,EAAE,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,SAAS;4BACxE,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,SAAS;yBACrE,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC,CAAC,CAAC;gBACH,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACtB,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC;gBACxB,CAAC;YACH,CAAC;YAED,0BAA0B;YAC1B,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;gBACxB,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;gBAC3C,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACrB,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;oBAC/D,IAAI,KAAK,EAAE,CAAC;wBACV,KAAK,CAAC,IAAI,GAAG,EAAE,KAAK,EAAE,CAAC;wBAEvB,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,yBAAyB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;wBAChF,IAAI,WAAW,EAAE,CAAC;4BAChB,MAAM,OAAO,GAAG,IAAI,IAAI,CAAC,WAAW,CAAC,CAAC;4BACtC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;gCAC9B,KAAK,CAAC,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC;4BACrC,CAAC;wBACH,CAAC;wBAED,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;wBACrE,IAAI,QAAQ,EAAE,CAAC;4BACb,KAAK,CAAC,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;wBAC/D,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,wBAAwB,OAAO,CAAC,MAAM,uBAAuB,CAAC,CAAC;QAC3E,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,qBAAqB,CAAC,GAAW;QAC/B,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;YAE/C,yBAAyB;YACzB,MAAM,SAAS,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YACzC,MAAM,eAAe,GAAG,CAAC,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAErD,IAAI,CAAC,SAAS,I
AAI,CAAC,eAAe,EAAE,CAAC;gBACnC,MAAM,CAAC,IAAI,CAAC,2DAA2D,CAAC,CAAC;YAC3E,CAAC;YAED,yCAAyC;YACzC,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;gBACjC,IAAI,QAAQ,GAAG,KAAK,EAAE,CAAC;oBACrB,MAAM,CAAC,IAAI,CAAC,kBAAkB,QAAQ,gBAAgB,CAAC,CAAC;gBAC1D,CAAC;YACH,CAAC;YAED,uBAAuB;YACvB,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;gBAC/B,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;gBAC5B,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBAEvD,IAAI,CAAC,GAAG,EAAE,CAAC;oBACT,MAAM,CAAC,IAAI,CAAC,aAAa,KAAK,GAAG,CAAC,wBAAwB,CAAC,CAAC;gBAC9D,CAAC;qBAAM,CAAC;oBACN,IAAI,CAAC;wBACH,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBACf,CAAC;oBAAC,MAAM,CAAC;wBACP,MAAM,CAAC,IAAI,CAAC,wBAAwB,KAAK,GAAG,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC;oBAC3D,CAAC;gBACH,CAAC;gBAED,0BAA0B;gBAC1B,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBAC/D,IAAI,OAAO,EAAE,CAAC;oBACZ,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC;oBAC/B,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;wBAC1B,MAAM,CAAC,IAAI,CAAC,iCAAiC,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC;oBACxE,CAAC;gBACH,CAAC;gBAED,oBAAoB;gBACpB,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACjE,IAAI,QAAQ,EAAE,CAAC;oBACb,MAAM,WAAW,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;oBACzC,IAAI,KAAK,CAAC,WAAW,CAAC,IAAI,WAAW,GAAG,CAAC,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;wBAC7D,MAAM,CAAC,IAAI,CAAC,6BAA6B,KAAK,GAAG,CAAC,KAAK,QAAQ,gBAAgB,CAAC,CAAC;oBACnF,CAAC;gBACH,CAAC;YACH,CAAC,CAAC,CAAC;QAEL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,IAAI,CAAC,sBAAsB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,MAAM,KAAK,CAAC;YAC1B,MAAM;SACP,CAAC;IACJ,CAAC;CACF;AA5cD,sCA4cC;AAED,0BAA0B;AACb,QAAA,mBAAmB,GAAG,IAAI,aAAa,EAAE,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,69 +1,90 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tyroneross/blog-scraper",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "Intelligent web scraper SDK for extracting blog/news content. Supports RSS, sitemaps, and HTML with automatic detection.",
|
|
5
|
+
"main": "./dist/lib/index.js",
|
|
6
|
+
"types": "./dist/lib/index.d.ts",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"types": "./dist/lib/index.d.ts",
|
|
10
|
+
"import": "./dist/lib/index.js",
|
|
11
|
+
"require": "./dist/lib/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"files": [
|
|
15
|
+
"dist/lib",
|
|
16
|
+
"README.md",
|
|
17
|
+
"LICENSE"
|
|
18
|
+
],
|
|
19
|
+
"sideEffects": false,
|
|
5
20
|
"keywords": [
|
|
6
21
|
"scraper",
|
|
7
|
-
"web-
|
|
8
|
-
"content-extraction",
|
|
22
|
+
"web-scraper",
|
|
9
23
|
"blog",
|
|
10
|
-
"articles",
|
|
11
24
|
"rss",
|
|
12
25
|
"sitemap",
|
|
26
|
+
"content-extraction",
|
|
13
27
|
"readability",
|
|
14
|
-
"
|
|
28
|
+
"news",
|
|
29
|
+
"article"
|
|
15
30
|
],
|
|
16
31
|
"author": "Tyrone Ross",
|
|
17
32
|
"license": "MIT",
|
|
18
33
|
"repository": {
|
|
19
34
|
"type": "git",
|
|
20
|
-
"url": "https://github.com/tyroneross/blog-
|
|
21
|
-
"directory": "packages/sdk"
|
|
35
|
+
"url": "git+https://github.com/tyroneross/blog-scraper.git"
|
|
22
36
|
},
|
|
23
|
-
"homepage": "https://github.com/tyroneross/blog-
|
|
37
|
+
"homepage": "https://github.com/tyroneross/blog-scraper#readme",
|
|
24
38
|
"bugs": {
|
|
25
|
-
"url": "https://github.com/tyroneross/blog-
|
|
39
|
+
"url": "https://github.com/tyroneross/blog-scraper/issues"
|
|
26
40
|
},
|
|
27
|
-
"main": "./dist/index.js",
|
|
28
|
-
"module": "./dist/index.mjs",
|
|
29
|
-
"types": "./dist/index.d.ts",
|
|
30
|
-
"exports": {
|
|
31
|
-
".": {
|
|
32
|
-
"types": "./dist/index.d.ts",
|
|
33
|
-
"import": "./dist/index.mjs",
|
|
34
|
-
"require": "./dist/index.js"
|
|
35
|
-
}
|
|
36
|
-
},
|
|
37
|
-
"files": [
|
|
38
|
-
"dist",
|
|
39
|
-
"README.md",
|
|
40
|
-
"LICENSE"
|
|
41
|
-
],
|
|
42
41
|
"scripts": {
|
|
43
|
-
"
|
|
44
|
-
"
|
|
42
|
+
"dev": "next dev",
|
|
43
|
+
"build": "next build",
|
|
44
|
+
"build:sdk": "tsc -p tsconfig.build.json",
|
|
45
|
+
"start": "next start",
|
|
46
|
+
"lint": "next lint",
|
|
47
|
+
"pre-deploy": "./scripts/pre-deploy.sh",
|
|
45
48
|
"typecheck": "tsc --noEmit",
|
|
46
|
-
"
|
|
49
|
+
"test:f1": "tsx tests/run-f1-tests.ts",
|
|
50
|
+
"test:f1:dragnet": "tsx tests/dragnet-f1-test.ts",
|
|
51
|
+
"prepublishOnly": "npm run build:sdk"
|
|
47
52
|
},
|
|
48
53
|
"dependencies": {
|
|
49
|
-
"@mozilla/readability": "^0.
|
|
54
|
+
"@mozilla/readability": "^0.6.0",
|
|
50
55
|
"cheerio": "^1.1.2",
|
|
56
|
+
"global-agent": "^3.0.0",
|
|
51
57
|
"jsdom": "^24.0.0",
|
|
58
|
+
"p-limit": "^7.2.0",
|
|
59
|
+
"playwright": "^1.57.0",
|
|
60
|
+
"remark-gfm": "^4.0.1",
|
|
52
61
|
"rss-parser": "^3.13.0",
|
|
53
62
|
"turndown": "^7.2.1",
|
|
54
63
|
"zod": "^3.22.0"
|
|
55
64
|
},
|
|
56
65
|
"devDependencies": {
|
|
66
|
+
"@tailwindcss/postcss": "^4.1.0",
|
|
67
|
+
"@tailwindcss/typography": "^0.5.19",
|
|
68
|
+
"@tyroneross/claude-code-debugger": "^1.3.0",
|
|
69
|
+
"@types/global-agent": "^3.0.0",
|
|
57
70
|
"@types/jsdom": "^21.1.7",
|
|
58
71
|
"@types/node": "^20.0.0",
|
|
72
|
+
"@types/react": "^19.0.0",
|
|
73
|
+
"@types/react-dom": "^19.0.0",
|
|
59
74
|
"@types/turndown": "^5.0.5",
|
|
60
|
-
"
|
|
75
|
+
"eslint": "^8.0.0",
|
|
76
|
+
"eslint-config-next": "^15.0.0",
|
|
77
|
+
"lucide-react": "^0.400.0",
|
|
78
|
+
"next": "^16.1.1",
|
|
79
|
+
"postcss": "^8.4.0",
|
|
80
|
+
"react": "^19.0.0",
|
|
81
|
+
"react-dom": "^19.0.0",
|
|
82
|
+
"react-markdown": "^10.1.0",
|
|
83
|
+
"tailwindcss": "^4.1.0",
|
|
84
|
+
"tsx": "^4.7.0",
|
|
61
85
|
"typescript": "^5.0.0"
|
|
62
86
|
},
|
|
63
87
|
"engines": {
|
|
64
88
|
"node": ">=18.0.0"
|
|
65
|
-
},
|
|
66
|
-
"publishConfig": {
|
|
67
|
-
"access": "public"
|
|
68
89
|
}
|
|
69
90
|
}
|