recker 1.0.33-next.fbc1682 → 1.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  import type { ExtractedLink } from './types.js';
+ import { type SitemapUrl } from '../seo/validators/sitemap.js';
  export interface SpiderOptions {
  maxDepth?: number;
  maxPages?: number;
@@ -10,6 +11,8 @@ export interface SpiderOptions {
  include?: RegExp[];
  userAgent?: string;
  respectRobotsTxt?: boolean;
+ useSitemap?: boolean;
+ sitemapUrl?: string;
  onPage?: (result: SpiderPageResult) => void;
  onProgress?: (progress: SpiderProgress) => void;
  }
@@ -29,6 +32,29 @@ export interface SpiderProgress {
  currentUrl: string;
  depth: number;
  }
+ export interface SitemapAnalysis {
+ found: boolean;
+ url?: string;
+ totalUrls: number;
+ crawledFromSitemap: number;
+ orphanUrls: string[];
+ missingFromSitemap: string[];
+ blockedBySitemapRobots: string[];
+ validationIssues: Array<{
+ type: string;
+ message: string;
+ }>;
+ sitemapUrls: SitemapUrl[];
+ }
+ export interface RobotsAnalysis {
+ found: boolean;
+ sitemaps: string[];
+ blocksAll: boolean;
+ issues: Array<{
+ type: string;
+ message: string;
+ }>;
+ }
  export interface SpiderResult {
  startUrl: string;
  pages: SpiderPageResult[];
@@ -38,6 +64,8 @@ export interface SpiderResult {
  url: string;
  error: string;
  }>;
+ sitemap?: SitemapAnalysis;
+ robots?: RobotsAnalysis;
  }
  export declare class Spider {
  private options;
@@ -51,8 +79,17 @@ export declare class Spider {
  private running;
  private aborted;
  private pendingCount;
+ private sitemapUrls;
+ private sitemapUrlSet;
+ private robotsData;
+ private sitemapValidation;
+ private robotsValidation;
  constructor(options?: SpiderOptions);
  crawl(startUrl: string): Promise<SpiderResult>;
+ private fetchRobotsTxt;
+ private fetchSitemaps;
+ private buildSitemapAnalysis;
+ private buildRobotsAnalysis;
  private crawlPage;
  abort(): void;
  isRunning(): boolean;
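
The declarations above add two spider options (`useSitemap`, `sitemapUrl`) and two optional analysis blocks on the crawl result (`sitemap`, `robots`). A minimal usage sketch follows; it assumes `Spider` is reachable from the package's public entry point, which this diff does not show.

```ts
// Sketch only: assumes Spider is importable from the 'recker' entry point.
import { Spider } from 'recker';

const spider = new Spider({
  maxDepth: 3,
  respectRobotsTxt: true,                        // existing option, defaults to true
  useSitemap: true,                              // new in 1.0.34, defaults to false
  sitemapUrl: 'https://example.com/sitemap.xml', // optional explicit sitemap location
});

const result = await spider.crawl('https://example.com');

// The new optional analysis blocks on SpiderResult:
if (result.sitemap) {
  console.log(`${result.sitemap.crawledFromSitemap}/${result.sitemap.totalUrls} sitemap URLs crawled`);
  console.log('Orphans (in sitemap, not internally linked):', result.sitemap.orphanUrls);
}
if (result.robots?.blocksAll) {
  console.warn('robots.txt disallows all crawling');
}
```
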
@@ -1,6 +1,8 @@
  import { createClient } from '../core/client.js';
  import { ScrapeDocument } from './document.js';
  import { RequestPool } from '../utils/request-pool.js';
+ import { discoverSitemaps, fetchAndValidateSitemap, } from '../seo/validators/sitemap.js';
+ import { fetchAndValidateRobotsTxt, isPathAllowed, } from '../seo/validators/robots.js';
  const TRACKING_PARAMS = new Set([
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  'gclid', 'gclsrc', 'dclid',
@@ -84,6 +86,11 @@ export class Spider {
  running = false;
  aborted = false;
  pendingCount = 0;
+ sitemapUrls = [];
+ sitemapUrlSet = new Set();
+ robotsData = null;
+ sitemapValidation = null;
+ robotsValidation = null;
  constructor(options = {}) {
  this.options = {
  maxDepth: options.maxDepth ?? 5,
@@ -94,6 +101,8 @@ export class Spider {
  delay: options.delay ?? 100,
  userAgent: options.userAgent ?? 'Recker Spider/1.0',
  respectRobotsTxt: options.respectRobotsTxt ?? true,
+ useSitemap: options.useSitemap ?? false,
+ sitemapUrl: options.sitemapUrl,
  exclude: options.exclude,
  include: options.include,
  onPage: options.onPage,
@@ -117,6 +126,7 @@ export class Spider {
  async crawl(startUrl) {
  const startTime = performance.now();
  const normalizedStart = normalizeUrl(startUrl);
+ const baseUrl = new URL(normalizedStart).origin;
  this.baseHost = new URL(normalizedStart).hostname;
  this.visited.clear();
  this.queue = [];
@@ -125,8 +135,19 @@ export class Spider {
  this.running = true;
  this.aborted = false;
  this.pendingCount = 0;
+ this.sitemapUrls = [];
+ this.sitemapUrlSet.clear();
+ this.robotsData = null;
+ this.sitemapValidation = null;
+ this.robotsValidation = null;
+ if (this.options.respectRobotsTxt || this.options.useSitemap) {
+ await this.fetchRobotsTxt(baseUrl);
+ }
+ if (this.options.useSitemap) {
+ await this.fetchSitemaps(baseUrl);
+ }
  const pending = new Map();
- const scheduleUrl = (item) => {
+ const scheduleUrl = (item, fromSitemap = false) => {
  const normalized = normalizeUrl(item.url);
  if (this.visited.has(normalized))
  return;
@@ -136,6 +157,17 @@ export class Spider {
  return;
  if (this.results.length + pending.size >= this.options.maxPages)
  return;
+ if (this.options.respectRobotsTxt && this.robotsData) {
+ try {
+ const urlPath = new URL(normalized).pathname;
+ if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
+ return;
+ }
+ }
+ catch {
+ return;
+ }
+ }
  this.visited.add(normalized);
  this.pendingCount++;
  const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
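
The hunk above adds a robots.txt gate to `scheduleUrl`: when `respectRobotsTxt` is enabled and a robots file was parsed, any URL whose path is disallowed for the configured user agent is silently skipped, as are URLs that fail to parse. The `isPathAllowed` helper lives in `../seo/validators/robots.js` and is not part of this diff; a simplified illustration of the usual longest-match robots semantics such a check tends to follow (wildcards omitted):

```ts
// Illustration only -- not the package's isPathAllowed implementation.
interface RobotsRule {
  allow: boolean; // true for Allow:, false for Disallow:
  path: string;   // path prefix taken from the rule
}

function isAllowedForPath(rules: RobotsRule[], path: string): boolean {
  let best: RobotsRule | undefined;
  for (const rule of rules) {
    if (rule.path && path.startsWith(rule.path)) {
      // Longest matching prefix wins; Allow wins ties (Google-style).
      if (
        !best ||
        rule.path.length > best.path.length ||
        (rule.path.length === best.path.length && rule.allow)
      ) {
        best = rule;
      }
    }
  }
  return best ? best.allow : true; // no matching rule means the path is allowed
}
```
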
@@ -146,6 +178,18 @@ export class Spider {
  pending.set(normalized, promise);
  };
  scheduleUrl({ url: normalizedStart, depth: 0 });
+ if (this.options.useSitemap && this.sitemapUrls.length > 0) {
+ for (const sitemapUrl of this.sitemapUrls) {
+ try {
+ const urlHost = new URL(sitemapUrl.loc).hostname;
+ if (urlHost === this.baseHost) {
+ scheduleUrl({ url: sitemapUrl.loc, depth: 1 }, true);
+ }
+ }
+ catch {
+ }
+ }
+ }
  while ((pending.size > 0 || this.queue.length > 0) && !this.aborted) {
  while (this.queue.length > 0 && !this.aborted) {
  const item = this.queue.shift();
@@ -161,12 +205,154 @@ export class Spider {
  await Promise.all(pending.values());
  }
  this.running = false;
+ const sitemapAnalysis = this.buildSitemapAnalysis();
+ const robotsAnalysis = this.buildRobotsAnalysis();
  return {
  startUrl: normalizedStart,
  pages: this.results,
  visited: this.visited,
  duration: Math.round(performance.now() - startTime),
  errors: this.errors,
+ sitemap: this.options.useSitemap ? sitemapAnalysis : undefined,
+ robots: robotsAnalysis,
+ };
+ }
+ async fetchRobotsTxt(baseUrl) {
+ try {
+ const fetcher = async (url) => {
+ const response = await this.client.get(url);
+ return {
+ status: response.status,
+ text: await response.text(),
+ };
+ };
+ const result = await fetchAndValidateRobotsTxt(baseUrl, fetcher);
+ if (result.exists) {
+ this.robotsData = result.parseResult;
+ this.robotsValidation = {
+ found: true,
+ issues: result.issues.map(i => ({ type: i.type, message: i.message })),
+ };
+ }
+ else {
+ this.robotsValidation = {
+ found: false,
+ issues: [{ type: 'info', message: 'robots.txt not found' }],
+ };
+ }
+ }
+ catch (error) {
+ this.robotsValidation = {
+ found: false,
+ issues: [{ type: 'error', message: `Failed to fetch robots.txt: ${error instanceof Error ? error.message : 'Unknown error'}` }],
+ };
+ }
+ }
+ async fetchSitemaps(baseUrl) {
+ const fetcher = async (url) => {
+ const response = await this.client.get(url);
+ return {
+ status: response.status,
+ text: await response.text(),
+ headers: Object.fromEntries([...response.headers.entries()]),
+ };
+ };
+ try {
+ let sitemapUrls = [];
+ if (this.options.sitemapUrl) {
+ sitemapUrls = [this.options.sitemapUrl];
+ }
+ else if (this.robotsData?.sitemaps.length) {
+ sitemapUrls = this.robotsData.sitemaps;
+ }
+ else {
+ sitemapUrls = await discoverSitemaps(baseUrl, undefined, fetcher);
+ }
+ for (const sitemapUrl of sitemapUrls) {
+ try {
+ const result = await fetchAndValidateSitemap(sitemapUrl, fetcher);
+ if (result.exists && result.parseResult.valid) {
+ this.sitemapValidation = result;
+ if (result.parseResult.type === 'sitemapindex') {
+ for (const childSitemap of result.parseResult.sitemaps) {
+ try {
+ const childResult = await fetchAndValidateSitemap(childSitemap.loc, fetcher);
+ if (childResult.exists && childResult.parseResult.urls.length > 0) {
+ this.sitemapUrls.push(...childResult.parseResult.urls);
+ }
+ }
+ catch {
+ }
+ }
+ }
+ else {
+ this.sitemapUrls.push(...result.parseResult.urls);
+ }
+ }
+ }
+ catch {
+ }
+ }
+ for (const url of this.sitemapUrls) {
+ this.sitemapUrlSet.add(normalizeUrl(url.loc));
+ }
+ }
+ catch (error) {
+ }
+ }
+ buildSitemapAnalysis() {
+ const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
+ const crawledFromSitemap = this.sitemapUrls.filter(u => crawledUrls.has(normalizeUrl(u.loc))).length;
+ const linkedUrls = new Set();
+ for (const page of this.results) {
+ for (const link of page.links) {
+ if (link.href) {
+ linkedUrls.add(normalizeUrl(link.href));
+ }
+ }
+ }
+ const orphanUrls = this.sitemapUrls
+ .filter(u => {
+ const normalized = normalizeUrl(u.loc);
+ return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
+ })
+ .map(u => u.loc);
+ const missingFromSitemap = Array.from(crawledUrls)
+ .filter(url => !this.sitemapUrlSet.has(url));
+ const blockedBySitemapRobots = [];
+ if (this.robotsData) {
+ for (const sitemapUrl of this.sitemapUrls) {
+ try {
+ const urlPath = new URL(sitemapUrl.loc).pathname;
+ if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
+ blockedBySitemapRobots.push(sitemapUrl.loc);
+ }
+ }
+ catch {
+ }
+ }
+ }
+ return {
+ found: this.sitemapUrls.length > 0,
+ url: this.sitemapValidation?.parseResult ? undefined : undefined,
+ totalUrls: this.sitemapUrls.length,
+ crawledFromSitemap,
+ orphanUrls,
+ missingFromSitemap,
+ blockedBySitemapRobots,
+ validationIssues: this.sitemapValidation?.issues.map(i => ({
+ type: i.type,
+ message: i.message,
+ })) || [],
+ sitemapUrls: this.sitemapUrls,
+ };
+ }
+ buildRobotsAnalysis() {
+ return {
+ found: this.robotsValidation?.found ?? false,
+ sitemaps: this.robotsData?.sitemaps ?? [],
+ blocksAll: this.robotsData?.blocksAllRobots ?? false,
+ issues: this.robotsValidation?.issues ?? [],
  };
  }
  async crawlPage(item) {
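
Taken together, `fetchRobotsTxt`, `fetchSitemaps`, and the two `build*Analysis` methods give the crawl result three URL perspectives: pages actually crawled, pages linked from crawled pages, and pages listed in the sitemap. A hedged sketch of how the resulting `SitemapAnalysis`/`RobotsAnalysis` data might be turned into an audit summary (the reporting helper below is hypothetical; only the field names come from the declarations in this diff):

```ts
// Hypothetical reporting helper; only the SitemapAnalysis/RobotsAnalysis shapes come from this diff.
import type { SpiderResult } from 'recker'; // assumes the type is re-exported

function summarizeCoverage(result: SpiderResult): string[] {
  const notes: string[] = [];
  if (result.robots && !result.robots.found) notes.push('No robots.txt found.');
  if (result.robots?.blocksAll) notes.push('robots.txt blocks all user agents.');

  const sm = result.sitemap;
  if (!sm || !sm.found) return [...notes, 'No usable sitemap was found.'];

  notes.push(`Crawled ${sm.crawledFromSitemap} of ${sm.totalUrls} sitemap URLs.`);
  // In the sitemap and crawled, but never reached via an internal link.
  sm.orphanUrls.forEach(u => notes.push(`Orphan page (sitemap-only): ${u}`));
  // Crawled via links but absent from the sitemap.
  sm.missingFromSitemap.forEach(u => notes.push(`Missing from sitemap: ${u}`));
  // Listed in the sitemap yet disallowed by robots.txt -- usually a misconfiguration.
  sm.blockedBySitemapRobots.forEach(u => notes.push(`Sitemap URL blocked by robots.txt: ${u}`));
  return notes;
}
```
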
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "recker",
- "version": "1.0.33-next.fbc1682",
+ "version": "1.0.34",
  "description": "AI & DevX focused HTTP client for Node.js 18+",
  "main": "./dist/index.js",
  "types": "./dist/index.d.ts",