recker 1.0.33-next.fbc1682 → 1.0.34
- package/dist/scrape/spider.d.ts +37 -0
- package/dist/scrape/spider.js +187 -1
- package/package.json +1 -1
package/dist/scrape/spider.d.ts
CHANGED

@@ -1,4 +1,5 @@
 import type { ExtractedLink } from './types.js';
+import { type SitemapUrl } from '../seo/validators/sitemap.js';
 export interface SpiderOptions {
     maxDepth?: number;
     maxPages?: number;
@@ -10,6 +11,8 @@ export interface SpiderOptions {
     include?: RegExp[];
     userAgent?: string;
     respectRobotsTxt?: boolean;
+    useSitemap?: boolean;
+    sitemapUrl?: string;
     onPage?: (result: SpiderPageResult) => void;
     onProgress?: (progress: SpiderProgress) => void;
 }
@@ -29,6 +32,29 @@ export interface SpiderProgress {
     currentUrl: string;
     depth: number;
 }
+export interface SitemapAnalysis {
+    found: boolean;
+    url?: string;
+    totalUrls: number;
+    crawledFromSitemap: number;
+    orphanUrls: string[];
+    missingFromSitemap: string[];
+    blockedBySitemapRobots: string[];
+    validationIssues: Array<{
+        type: string;
+        message: string;
+    }>;
+    sitemapUrls: SitemapUrl[];
+}
+export interface RobotsAnalysis {
+    found: boolean;
+    sitemaps: string[];
+    blocksAll: boolean;
+    issues: Array<{
+        type: string;
+        message: string;
+    }>;
+}
 export interface SpiderResult {
     startUrl: string;
     pages: SpiderPageResult[];
@@ -38,6 +64,8 @@ export interface SpiderResult {
         url: string;
         error: string;
     }>;
+    sitemap?: SitemapAnalysis;
+    robots?: RobotsAnalysis;
 }
 export declare class Spider {
     private options;
@@ -51,8 +79,17 @@ export declare class Spider {
     private running;
     private aborted;
     private pendingCount;
+    private sitemapUrls;
+    private sitemapUrlSet;
+    private robotsData;
+    private sitemapValidation;
+    private robotsValidation;
     constructor(options?: SpiderOptions);
     crawl(startUrl: string): Promise<SpiderResult>;
+    private fetchRobotsTxt;
+    private fetchSitemaps;
+    private buildSitemapAnalysis;
+    private buildRobotsAnalysis;
     private crawlPage;
     abort(): void;
     isRunning(): boolean;
package/dist/scrape/spider.js
CHANGED

@@ -1,6 +1,8 @@
 import { createClient } from '../core/client.js';
 import { ScrapeDocument } from './document.js';
 import { RequestPool } from '../utils/request-pool.js';
+import { discoverSitemaps, fetchAndValidateSitemap, } from '../seo/validators/sitemap.js';
+import { fetchAndValidateRobotsTxt, isPathAllowed, } from '../seo/validators/robots.js';
 const TRACKING_PARAMS = new Set([
     'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
     'gclid', 'gclsrc', 'dclid',
@@ -84,6 +86,11 @@ export class Spider {
     running = false;
     aborted = false;
     pendingCount = 0;
+    sitemapUrls = [];
+    sitemapUrlSet = new Set();
+    robotsData = null;
+    sitemapValidation = null;
+    robotsValidation = null;
     constructor(options = {}) {
         this.options = {
             maxDepth: options.maxDepth ?? 5,
@@ -94,6 +101,8 @@ export class Spider {
             delay: options.delay ?? 100,
             userAgent: options.userAgent ?? 'Recker Spider/1.0',
             respectRobotsTxt: options.respectRobotsTxt ?? true,
+            useSitemap: options.useSitemap ?? false,
+            sitemapUrl: options.sitemapUrl,
             exclude: options.exclude,
             include: options.include,
             onPage: options.onPage,
@@ -117,6 +126,7 @@ export class Spider {
     async crawl(startUrl) {
         const startTime = performance.now();
         const normalizedStart = normalizeUrl(startUrl);
+        const baseUrl = new URL(normalizedStart).origin;
         this.baseHost = new URL(normalizedStart).hostname;
         this.visited.clear();
         this.queue = [];
@@ -125,8 +135,19 @@ export class Spider {
         this.running = true;
         this.aborted = false;
         this.pendingCount = 0;
+        this.sitemapUrls = [];
+        this.sitemapUrlSet.clear();
+        this.robotsData = null;
+        this.sitemapValidation = null;
+        this.robotsValidation = null;
+        if (this.options.respectRobotsTxt || this.options.useSitemap) {
+            await this.fetchRobotsTxt(baseUrl);
+        }
+        if (this.options.useSitemap) {
+            await this.fetchSitemaps(baseUrl);
+        }
         const pending = new Map();
-        const scheduleUrl = (item) => {
+        const scheduleUrl = (item, fromSitemap = false) => {
             const normalized = normalizeUrl(item.url);
             if (this.visited.has(normalized))
                 return;
@@ -136,6 +157,17 @@ export class Spider {
                 return;
             if (this.results.length + pending.size >= this.options.maxPages)
                 return;
+            if (this.options.respectRobotsTxt && this.robotsData) {
+                try {
+                    const urlPath = new URL(normalized).pathname;
+                    if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
+                        return;
+                    }
+                }
+                catch {
+                    return;
+                }
+            }
             this.visited.add(normalized);
             this.pendingCount++;
             const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
@@ -146,6 +178,18 @@ export class Spider {
             pending.set(normalized, promise);
         };
         scheduleUrl({ url: normalizedStart, depth: 0 });
+        if (this.options.useSitemap && this.sitemapUrls.length > 0) {
+            for (const sitemapUrl of this.sitemapUrls) {
+                try {
+                    const urlHost = new URL(sitemapUrl.loc).hostname;
+                    if (urlHost === this.baseHost) {
+                        scheduleUrl({ url: sitemapUrl.loc, depth: 1 }, true);
+                    }
+                }
+                catch {
+                }
+            }
+        }
         while ((pending.size > 0 || this.queue.length > 0) && !this.aborted) {
             while (this.queue.length > 0 && !this.aborted) {
                 const item = this.queue.shift();
@@ -161,12 +205,154 @@ export class Spider {
             await Promise.all(pending.values());
         }
         this.running = false;
+        const sitemapAnalysis = this.buildSitemapAnalysis();
+        const robotsAnalysis = this.buildRobotsAnalysis();
         return {
             startUrl: normalizedStart,
             pages: this.results,
             visited: this.visited,
             duration: Math.round(performance.now() - startTime),
             errors: this.errors,
+            sitemap: this.options.useSitemap ? sitemapAnalysis : undefined,
+            robots: robotsAnalysis,
+        };
+    }
+    async fetchRobotsTxt(baseUrl) {
+        try {
+            const fetcher = async (url) => {
+                const response = await this.client.get(url);
+                return {
+                    status: response.status,
+                    text: await response.text(),
+                };
+            };
+            const result = await fetchAndValidateRobotsTxt(baseUrl, fetcher);
+            if (result.exists) {
+                this.robotsData = result.parseResult;
+                this.robotsValidation = {
+                    found: true,
+                    issues: result.issues.map(i => ({ type: i.type, message: i.message })),
+                };
+            }
+            else {
+                this.robotsValidation = {
+                    found: false,
+                    issues: [{ type: 'info', message: 'robots.txt not found' }],
+                };
+            }
+        }
+        catch (error) {
+            this.robotsValidation = {
+                found: false,
+                issues: [{ type: 'error', message: `Failed to fetch robots.txt: ${error instanceof Error ? error.message : 'Unknown error'}` }],
+            };
+        }
+    }
+    async fetchSitemaps(baseUrl) {
+        const fetcher = async (url) => {
+            const response = await this.client.get(url);
+            return {
+                status: response.status,
+                text: await response.text(),
+                headers: Object.fromEntries([...response.headers.entries()]),
+            };
+        };
+        try {
+            let sitemapUrls = [];
+            if (this.options.sitemapUrl) {
+                sitemapUrls = [this.options.sitemapUrl];
+            }
+            else if (this.robotsData?.sitemaps.length) {
+                sitemapUrls = this.robotsData.sitemaps;
+            }
+            else {
+                sitemapUrls = await discoverSitemaps(baseUrl, undefined, fetcher);
+            }
+            for (const sitemapUrl of sitemapUrls) {
+                try {
+                    const result = await fetchAndValidateSitemap(sitemapUrl, fetcher);
+                    if (result.exists && result.parseResult.valid) {
+                        this.sitemapValidation = result;
+                        if (result.parseResult.type === 'sitemapindex') {
+                            for (const childSitemap of result.parseResult.sitemaps) {
+                                try {
+                                    const childResult = await fetchAndValidateSitemap(childSitemap.loc, fetcher);
+                                    if (childResult.exists && childResult.parseResult.urls.length > 0) {
+                                        this.sitemapUrls.push(...childResult.parseResult.urls);
+                                    }
+                                }
+                                catch {
+                                }
+                            }
+                        }
+                        else {
+                            this.sitemapUrls.push(...result.parseResult.urls);
+                        }
+                    }
+                }
+                catch {
+                }
+            }
+            for (const url of this.sitemapUrls) {
+                this.sitemapUrlSet.add(normalizeUrl(url.loc));
+            }
+        }
+        catch (error) {
+        }
+    }
+    buildSitemapAnalysis() {
+        const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
+        const crawledFromSitemap = this.sitemapUrls.filter(u => crawledUrls.has(normalizeUrl(u.loc))).length;
+        const linkedUrls = new Set();
+        for (const page of this.results) {
+            for (const link of page.links) {
+                if (link.href) {
+                    linkedUrls.add(normalizeUrl(link.href));
+                }
+            }
+        }
+        const orphanUrls = this.sitemapUrls
+            .filter(u => {
+                const normalized = normalizeUrl(u.loc);
+                return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
+            })
+            .map(u => u.loc);
+        const missingFromSitemap = Array.from(crawledUrls)
+            .filter(url => !this.sitemapUrlSet.has(url));
+        const blockedBySitemapRobots = [];
+        if (this.robotsData) {
+            for (const sitemapUrl of this.sitemapUrls) {
+                try {
+                    const urlPath = new URL(sitemapUrl.loc).pathname;
+                    if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
+                        blockedBySitemapRobots.push(sitemapUrl.loc);
+                    }
+                }
+                catch {
+                }
+            }
+        }
+        return {
+            found: this.sitemapUrls.length > 0,
+            url: this.sitemapValidation?.parseResult ? undefined : undefined,
+            totalUrls: this.sitemapUrls.length,
+            crawledFromSitemap,
+            orphanUrls,
+            missingFromSitemap,
+            blockedBySitemapRobots,
+            validationIssues: this.sitemapValidation?.issues.map(i => ({
+                type: i.type,
+                message: i.message,
+            })) || [],
+            sitemapUrls: this.sitemapUrls,
+        };
+    }
+    buildRobotsAnalysis() {
+        return {
+            found: this.robotsValidation?.found ?? false,
+            sitemaps: this.robotsData?.sitemaps ?? [],
+            blocksAll: this.robotsData?.blocksAllRobots ?? false,
+            issues: this.robotsValidation?.issues ?? [],
         };
     }
     async crawlPage(item) {