recker 1.0.27 → 1.0.28-next.3bf98c7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/dist/browser/scrape/extractors.js +2 -1
  2. package/dist/browser/scrape/types.d.ts +2 -1
  3. package/dist/cli/index.js +142 -3
  4. package/dist/cli/tui/shell.d.ts +2 -0
  5. package/dist/cli/tui/shell.js +492 -1
  6. package/dist/index.d.ts +1 -0
  7. package/dist/index.js +1 -0
  8. package/dist/scrape/extractors.js +2 -1
  9. package/dist/scrape/index.d.ts +2 -0
  10. package/dist/scrape/index.js +1 -0
  11. package/dist/scrape/spider.d.ts +61 -0
  12. package/dist/scrape/spider.js +250 -0
  13. package/dist/scrape/types.d.ts +2 -1
  14. package/dist/seo/analyzer.d.ts +42 -0
  15. package/dist/seo/analyzer.js +742 -0
  16. package/dist/seo/index.d.ts +7 -0
  17. package/dist/seo/index.js +3 -0
  18. package/dist/seo/rules/accessibility.d.ts +2 -0
  19. package/dist/seo/rules/accessibility.js +694 -0
  20. package/dist/seo/rules/best-practices.d.ts +2 -0
  21. package/dist/seo/rules/best-practices.js +188 -0
  22. package/dist/seo/rules/content.d.ts +2 -0
  23. package/dist/seo/rules/content.js +236 -0
  24. package/dist/seo/rules/crawl.d.ts +2 -0
  25. package/dist/seo/rules/crawl.js +307 -0
  26. package/dist/seo/rules/cwv.d.ts +2 -0
  27. package/dist/seo/rules/cwv.js +337 -0
  28. package/dist/seo/rules/ecommerce.d.ts +2 -0
  29. package/dist/seo/rules/ecommerce.js +252 -0
  30. package/dist/seo/rules/i18n.d.ts +2 -0
  31. package/dist/seo/rules/i18n.js +222 -0
  32. package/dist/seo/rules/images.d.ts +2 -0
  33. package/dist/seo/rules/images.js +180 -0
  34. package/dist/seo/rules/index.d.ts +52 -0
  35. package/dist/seo/rules/index.js +143 -0
  36. package/dist/seo/rules/internal-linking.d.ts +2 -0
  37. package/dist/seo/rules/internal-linking.js +375 -0
  38. package/dist/seo/rules/links.d.ts +2 -0
  39. package/dist/seo/rules/links.js +150 -0
  40. package/dist/seo/rules/local.d.ts +2 -0
  41. package/dist/seo/rules/local.js +265 -0
  42. package/dist/seo/rules/meta.d.ts +2 -0
  43. package/dist/seo/rules/meta.js +523 -0
  44. package/dist/seo/rules/mobile.d.ts +2 -0
  45. package/dist/seo/rules/mobile.js +71 -0
  46. package/dist/seo/rules/performance.d.ts +2 -0
  47. package/dist/seo/rules/performance.js +246 -0
  48. package/dist/seo/rules/pwa.d.ts +2 -0
  49. package/dist/seo/rules/pwa.js +302 -0
  50. package/dist/seo/rules/readability.d.ts +2 -0
  51. package/dist/seo/rules/readability.js +255 -0
  52. package/dist/seo/rules/schema.d.ts +2 -0
  53. package/dist/seo/rules/schema.js +54 -0
  54. package/dist/seo/rules/security.d.ts +2 -0
  55. package/dist/seo/rules/security.js +525 -0
  56. package/dist/seo/rules/social.d.ts +2 -0
  57. package/dist/seo/rules/social.js +373 -0
  58. package/dist/seo/rules/structural.d.ts +2 -0
  59. package/dist/seo/rules/structural.js +155 -0
  60. package/dist/seo/rules/technical.d.ts +2 -0
  61. package/dist/seo/rules/technical.js +223 -0
  62. package/dist/seo/rules/thresholds.d.ts +196 -0
  63. package/dist/seo/rules/thresholds.js +118 -0
  64. package/dist/seo/rules/types.d.ts +346 -0
  65. package/dist/seo/rules/types.js +11 -0
  66. package/dist/seo/seo-spider.d.ts +47 -0
  67. package/dist/seo/seo-spider.js +362 -0
  68. package/dist/seo/types.d.ts +184 -0
  69. package/dist/seo/types.js +1 -0
  70. package/dist/utils/columns.d.ts +14 -0
  71. package/dist/utils/columns.js +69 -0
  72. package/package.json +1 -1
@@ -0,0 +1,250 @@
+ import { createClient } from '../core/client.js';
+ import { ScrapeDocument } from './document.js';
+ import { RequestPool } from '../utils/request-pool.js';
+ const TRACKING_PARAMS = new Set([
+     'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
+     'gclid', 'gclsrc', 'dclid',
+     'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
+     'msclkid',
+     'twclid',
+     'ref', 'referer', 'referrer', 'source',
+     '_ga', '_gl', '_hsenc', '_hsmi',
+     'mc_cid', 'mc_eid',
+     'yclid', 'ymclid',
+     'igshid',
+     '_t', 't', 'timestamp', 'ts', 'nocache', 'cache',
+ ]);
+ function normalizeUrl(urlStr) {
+     try {
+         const url = new URL(urlStr);
+         url.hash = '';
+         const paramsToDelete = [];
+         url.searchParams.forEach((_, key) => {
+             if (TRACKING_PARAMS.has(key.toLowerCase())) {
+                 paramsToDelete.push(key);
+             }
+         });
+         paramsToDelete.forEach(key => url.searchParams.delete(key));
+         url.searchParams.sort();
+         if (url.pathname !== '/' && url.pathname.endsWith('/')) {
+             url.pathname = url.pathname.slice(0, -1);
+         }
+         return url.toString();
+     }
+     catch {
+         return urlStr;
+     }
+ }
+ function shouldCrawl(url, baseHost, options) {
+     try {
+         const parsed = new URL(url);
+         if (!['http:', 'https:'].includes(parsed.protocol)) {
+             return false;
+         }
+         if (options.sameDomain !== false && parsed.hostname !== baseHost) {
+             return false;
+         }
+         const skipExtensions = [
+             '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
+             '.pdf', '.zip', '.tar', '.gz', '.rar',
+             '.mp3', '.mp4', '.avi', '.mov', '.webm',
+             '.css', '.js', '.json', '.xml', '.rss',
+             '.woff', '.woff2', '.ttf', '.eot',
+         ];
+         const pathname = parsed.pathname.toLowerCase();
+         if (skipExtensions.some(ext => pathname.endsWith(ext))) {
+             return false;
+         }
+         if (options.exclude?.some(pattern => pattern.test(url))) {
+             return false;
+         }
+         if (options.include?.length) {
+             if (!options.include.some(pattern => pattern.test(url))) {
+                 return false;
+             }
+         }
+         return true;
+     }
+     catch {
+         return false;
+     }
+ }
+ function sleep(ms) {
+     return new Promise(resolve => setTimeout(resolve, ms));
+ }
+ export class Spider {
+     options;
+     client;
+     pool;
+     visited = new Set();
+     queue = [];
+     results = [];
+     errors = [];
+     baseHost = '';
+     running = false;
+     aborted = false;
+     pendingCount = 0;
+     constructor(options = {}) {
+         this.options = {
+             maxDepth: options.maxDepth ?? 4,
+             maxPages: options.maxPages ?? 100,
+             sameDomain: options.sameDomain ?? true,
+             concurrency: options.concurrency ?? 5,
+             timeout: options.timeout ?? 10000,
+             delay: options.delay ?? 100,
+             userAgent: options.userAgent ?? 'Recker Spider/1.0',
+             respectRobotsTxt: options.respectRobotsTxt ?? true,
+             exclude: options.exclude,
+             include: options.include,
+             onPage: options.onPage,
+             onProgress: options.onProgress,
+         };
+         this.client = createClient({
+             baseUrl: 'http://localhost',
+             timeout: this.options.timeout,
+             headers: {
+                 'User-Agent': this.options.userAgent,
+             },
+         });
+         this.pool = new RequestPool({
+             concurrency: this.options.concurrency,
+             ...(this.options.delay > 0 ? {
+                 requestsPerInterval: 1,
+                 interval: this.options.delay,
+             } : {}),
+         });
+     }
+     async crawl(startUrl) {
+         const startTime = performance.now();
+         const normalizedStart = normalizeUrl(startUrl);
+         this.baseHost = new URL(normalizedStart).hostname;
+         this.visited.clear();
+         this.queue = [];
+         this.results = [];
+         this.errors = [];
+         this.running = true;
+         this.aborted = false;
+         this.pendingCount = 0;
+         const pending = new Map();
+         const scheduleUrl = (item) => {
+             const normalized = normalizeUrl(item.url);
+             if (this.visited.has(normalized))
+                 return;
+             if (pending.has(normalized))
+                 return;
+             if (item.depth > this.options.maxDepth)
+                 return;
+             if (this.results.length + pending.size >= this.options.maxPages)
+                 return;
+             this.visited.add(normalized);
+             this.pendingCount++;
+             const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
+                 .finally(() => {
+                     pending.delete(normalized);
+                     this.pendingCount--;
+                 });
+             pending.set(normalized, promise);
+         };
+         scheduleUrl({ url: normalizedStart, depth: 0 });
+         while ((pending.size > 0 || this.queue.length > 0) && !this.aborted) {
+             while (this.queue.length > 0 && !this.aborted) {
+                 const item = this.queue.shift();
+                 if (this.results.length + pending.size >= this.options.maxPages)
+                     break;
+                 scheduleUrl(item);
+             }
+             if (pending.size > 0) {
+                 await Promise.race(pending.values());
+             }
+         }
+         if (pending.size > 0) {
+             await Promise.all(pending.values());
+         }
+         this.running = false;
+         return {
+             startUrl: normalizedStart,
+             pages: this.results,
+             visited: this.visited,
+             duration: Math.round(performance.now() - startTime),
+             errors: this.errors,
+         };
+     }
+     async crawlPage(item) {
+         const startTime = performance.now();
+         this.options.onProgress?.({
+             crawled: this.results.length,
+             queued: this.queue.length,
+             total: this.visited.size,
+             currentUrl: item.url,
+             depth: item.depth,
+         });
+         try {
+             const response = await this.client.get(item.url);
+             const status = response.status;
+             const contentType = response.headers.get('content-type') || '';
+             if (!contentType.includes('text/html')) {
+                 return;
+             }
+             const html = await response.text();
+             const doc = await ScrapeDocument.create(html, { baseUrl: item.url });
+             const title = doc.selectFirst('title').text() || '';
+             const links = doc.links({ absolute: true });
+             const result = {
+                 url: item.url,
+                 status,
+                 title,
+                 depth: item.depth,
+                 links,
+                 duration: Math.round(performance.now() - startTime),
+             };
+             this.results.push(result);
+             this.options.onPage?.(result);
+             for (const link of links) {
+                 if (!link.href)
+                     continue;
+                 const normalized = normalizeUrl(link.href);
+                 if (this.visited.has(normalized))
+                     continue;
+                 if (!shouldCrawl(normalized, this.baseHost, this.options))
+                     continue;
+                 this.queue.push({
+                     url: normalized,
+                     depth: item.depth + 1,
+                 });
+             }
+         }
+         catch (error) {
+             const errorResult = {
+                 url: item.url,
+                 status: 0,
+                 title: '',
+                 depth: item.depth,
+                 links: [],
+                 duration: Math.round(performance.now() - startTime),
+                 error: error.message,
+             };
+             this.results.push(errorResult);
+             this.errors.push({ url: item.url, error: error.message });
+             this.options.onPage?.(errorResult);
+         }
+     }
+     abort() {
+         this.aborted = true;
+     }
+     isRunning() {
+         return this.running;
+     }
+     getProgress() {
+         return {
+             crawled: this.results.length,
+             queued: this.queue.length,
+             total: this.visited.size,
+             currentUrl: '',
+             depth: 0,
+         };
+     }
+ }
+ export async function spider(url, options) {
+     const s = new Spider(options);
+     return s.crawl(url);
+ }
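
The new spider module above (the +250 hunk maps to package/dist/scrape/spider.js) exports both the Spider class and a spider() convenience wrapper. A minimal usage sketch, assuming the function is re-exported from the package root; the import path and example URL are placeholders, not confirmed by this diff:

import { spider } from 'recker'; // assumed entry point; may live under a scrape subpath

const result = await spider('https://example.com', {
    maxDepth: 2,   // diff defaults: maxDepth 4, maxPages 100, concurrency 5, delay 100ms
    maxPages: 50,
    onPage: (page) => {
        // each crawled page carries url, status, title, depth, links, duration (and error on failures)
        console.log(`[${page.status}] ${page.url} (depth ${page.depth})`);
    },
    onProgress: ({ crawled, queued }) => {
        console.log(`crawled ${crawled}, queued ${queued}`);
    },
});

console.log(`Crawled ${result.pages.length} pages in ${result.duration}ms, ${result.errors.length} errors`);

Note that normalizeUrl() strips tracking parameters, fragments, and trailing slashes before deduplication, so result.visited contains normalized URLs rather than the raw hrefs found on each page.
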
@@ -14,13 +14,14 @@ export interface ExtractedImage {
      height?: number;
      srcset?: string;
      loading?: 'lazy' | 'eager';
+     decoding?: 'async' | 'auto' | 'sync';
  }
  export interface ExtractedMeta {
      title?: string;
      description?: string;
      keywords?: string[];
      author?: string;
-     robots?: string;
+     robots?: string[];
      canonical?: string;
      viewport?: string;
      charset?: string;
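
Note the breaking change above: ExtractedMeta.robots goes from a single string to a string array, and ExtractedImage gains a decoding field. A hedged sketch of consumer code adapting to the new shape; the import path for the type is assumed:

import type { ExtractedMeta } from 'recker'; // assumed export location for the type

// robots?: string[] as of this release; callers that previously did
// meta.robots?.includes('noindex') on a string should scan the array instead.
function hasNoindex(meta: ExtractedMeta): boolean {
    return (meta.robots ?? []).some(directive => directive.toLowerCase().includes('noindex'));
}
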
@@ -0,0 +1,42 @@
+ import type { CheerioAPI } from 'cheerio';
+ import type { SeoReport, SeoAnalyzerOptions } from './types.js';
+ import { type RulesEngineOptions } from './rules/index.js';
+ export interface SeoAnalyzerFullOptions extends SeoAnalyzerOptions {
+     rules?: RulesEngineOptions;
+ }
+ export declare class SeoAnalyzer {
+     private $;
+     private options;
+     private rulesEngine;
+     constructor($: CheerioAPI, options?: SeoAnalyzerFullOptions);
+     static fromHtml(html: string, options?: SeoAnalyzerFullOptions): Promise<SeoAnalyzer>;
+     analyze(): SeoReport;
+     private buildRuleContext;
+     private analyzeUrlQuality;
+     private analyzeJsRendering;
+     private checkMixedContent;
+     private analyzeLinkSecurity;
+     private detectFavicon;
+     private analyzePerformanceHints;
+     private analyzeCWVHints;
+     private analyzeAccessibility;
+     private analyzeStructuralHtml;
+     private analyzeBreadcrumbs;
+     private analyzeMultimedia;
+     private analyzeTrustSignals;
+     private calculateTextHtmlRatio;
+     private convertToCheckResults;
+     private analyzeHeadings;
+     private analyzeContent;
+     private buildLinkAnalysis;
+     private buildImageAnalysis;
+     private buildSocialAnalysis;
+     private buildTechnicalAnalysis;
+     private calculateScore;
+     getRules(): import("./rules/types.js").SeoRule[];
+     getRulesByCategory(category: string): import("./rules/types.js").SeoRule[];
+     getCategories(): import("./rules/types.js").RuleCategory[];
+ }
+ export declare function analyzeSeo(html: string, options?: SeoAnalyzerFullOptions): Promise<SeoReport>;
+ export { SEO_THRESHOLDS, createRulesEngine, SeoRulesEngine } from './rules/index.js';
+ export type { RuleContext, RuleResult, RulesEngineOptions, RuleCategory, RuleSeverity, SeoRule } from './rules/index.js';
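
The analyzer declaration above (the +42 hunk maps to package/dist/seo/analyzer.d.ts) exposes a one-shot analyzeSeo() helper plus the SeoAnalyzer class with rule introspection. A brief sketch against that surface, assuming these names are re-exported from the package root; the shape of SeoReport lives in ./types.js and is not shown here, so the report is only logged:

import { analyzeSeo, SeoAnalyzer } from 'recker'; // assumed entry point; may live under a seo subpath

const html = '<html><head><title>Hello</title></head><body><h1>Hello</h1></body></html>';

// One-shot analysis
const report = await analyzeSeo(html);
console.log(report);

// Or construct an analyzer to inspect the rule set before running it
const analyzer = await SeoAnalyzer.fromHtml(html);
console.log(analyzer.getCategories().length, 'rule categories');
console.log(analyzer.getRules().length, 'rules');
const sameReport = analyzer.analyze(); // synchronous once the analyzer is built
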