recker 1.0.28 → 1.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/tui/shell.d.ts +1 -0
- package/dist/cli/tui/shell.js +339 -5
- package/dist/scrape/index.d.ts +2 -0
- package/dist/scrape/index.js +1 -0
- package/dist/scrape/spider.d.ts +61 -0
- package/dist/scrape/spider.js +250 -0
- package/dist/seo/analyzer.js +27 -0
- package/dist/seo/index.d.ts +3 -1
- package/dist/seo/index.js +1 -0
- package/dist/seo/rules/accessibility.js +620 -54
- package/dist/seo/rules/best-practices.d.ts +2 -0
- package/dist/seo/rules/best-practices.js +188 -0
- package/dist/seo/rules/crawl.d.ts +2 -0
- package/dist/seo/rules/crawl.js +307 -0
- package/dist/seo/rules/cwv.d.ts +2 -0
- package/dist/seo/rules/cwv.js +337 -0
- package/dist/seo/rules/ecommerce.d.ts +2 -0
- package/dist/seo/rules/ecommerce.js +252 -0
- package/dist/seo/rules/i18n.d.ts +2 -0
- package/dist/seo/rules/i18n.js +222 -0
- package/dist/seo/rules/index.d.ts +32 -0
- package/dist/seo/rules/index.js +71 -0
- package/dist/seo/rules/internal-linking.d.ts +2 -0
- package/dist/seo/rules/internal-linking.js +375 -0
- package/dist/seo/rules/local.d.ts +2 -0
- package/dist/seo/rules/local.js +265 -0
- package/dist/seo/rules/pwa.d.ts +2 -0
- package/dist/seo/rules/pwa.js +302 -0
- package/dist/seo/rules/readability.d.ts +2 -0
- package/dist/seo/rules/readability.js +255 -0
- package/dist/seo/rules/security.js +406 -28
- package/dist/seo/rules/social.d.ts +2 -0
- package/dist/seo/rules/social.js +373 -0
- package/dist/seo/rules/types.d.ts +155 -0
- package/dist/seo/seo-spider.d.ts +47 -0
- package/dist/seo/seo-spider.js +362 -0
- package/dist/seo/types.d.ts +24 -0
- package/package.json +1 -1
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import { createClient } from '../core/client.js';
|
|
2
|
+
import { ScrapeDocument } from './document.js';
|
|
3
|
+
import { RequestPool } from '../utils/request-pool.js';
|
|
4
|
+
// Query-string keys that normalizeUrl() removes before a URL is used as a
// crawl key. Lookups are done against the lower-cased parameter name, so
// every entry here must be lower case.
const TRACKING_PARAMS = new Set([
    // utm_* campaign tags
    'utm_source',
    'utm_medium',
    'utm_campaign',
    'utm_term',
    'utm_content',
    // click-id style parameters
    'gclid',
    'gclsrc',
    'dclid',
    'fbclid',
    'fb_action_ids',
    'fb_action_types',
    'fb_source',
    'fb_ref',
    'msclkid',
    'twclid',
    // referrer / source hints
    'ref',
    'referer',
    'referrer',
    'source',
    // underscore-prefixed analytics keys
    '_ga',
    '_gl',
    '_hsenc',
    '_hsmi',
    'mc_cid',
    'mc_eid',
    'yclid',
    'ymclid',
    'igshid',
    // timestamp / cache-busting keys
    '_t',
    't',
    'timestamp',
    'ts',
    'nocache',
    'cache',
]);
|
|
17
|
+
/**
 * Reduce a URL to a canonical string so equivalent links compare equal.
 *
 * Steps, in order: drop the fragment, remove every query parameter whose
 * lower-cased name is in TRACKING_PARAMS, sort the remaining parameters,
 * and strip a single trailing slash from any path other than "/".
 *
 * @param {string} urlStr - Absolute URL to normalize.
 * @returns {string} The normalized URL, or `urlStr` unchanged when it
 *   cannot be parsed (or normalization fails for any other reason).
 */
function normalizeUrl(urlStr) {
    try {
        const parsed = new URL(urlStr);
        const query = parsed.searchParams;
        parsed.hash = '';
        // Snapshot the keys first: deleting while iterating searchParams
        // would skip entries.
        const trackingKeys = [...query.keys()].filter((name) => TRACKING_PARAMS.has(name.toLowerCase()));
        for (const name of trackingKeys) {
            query.delete(name);
        }
        query.sort();
        const path = parsed.pathname;
        if (path.endsWith('/') && path !== '/') {
            parsed.pathname = path.slice(0, -1);
        }
        return parsed.toString();
    }
    catch {
        return urlStr;
    }
}
|
|
38
|
+
// Path suffixes that shouldCrawl() rejects outright (compared against the
// lower-cased pathname). Hoisted so the list is built once, not per call.
const SKIP_EXTENSIONS = [
    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
    '.pdf', '.zip', '.tar', '.gz', '.rar',
    '.mp3', '.mp4', '.avi', '.mov', '.webm',
    '.css', '.js', '.json', '.xml', '.rss',
    '.woff', '.woff2', '.ttf', '.eot',
];
/**
 * Decide whether a discovered link is eligible for crawling.
 *
 * A URL is rejected when it is not http(s), when it is on a different host
 * than `baseHost` (unless `options.sameDomain === false`), when its path
 * ends in one of SKIP_EXTENSIONS, when any `options.exclude` pattern
 * matches, or when `options.include` is non-empty and none of its patterns
 * match.
 *
 * @param {string} url - Absolute URL to evaluate.
 * @param {string} baseHost - Hostname the crawl started on.
 * @param {{sameDomain?: boolean, exclude?: RegExp[], include?: RegExp[]}} options
 *   Filtering options; `sameDomain` is treated as enabled unless it is
 *   exactly `false`.
 * @returns {boolean} `true` when the URL may be crawled; `false` otherwise,
 *   including when the URL cannot be parsed or a pattern throws.
 */
function shouldCrawl(url, baseHost, options) {
    // RegExp#test is stateful for /g and /y patterns: it resumes from
    // lastIndex, so reusing one pattern across URLs would give alternating
    // answers. Rewind before every test so each URL is matched from index 0.
    const matches = (pattern) => {
        if (pattern.global || pattern.sticky) {
            pattern.lastIndex = 0;
        }
        return pattern.test(url);
    };
    try {
        const parsed = new URL(url);
        if (!['http:', 'https:'].includes(parsed.protocol)) {
            return false;
        }
        if (options.sameDomain !== false && parsed.hostname !== baseHost) {
            return false;
        }
        const pathname = parsed.pathname.toLowerCase();
        if (SKIP_EXTENSIONS.some((ext) => pathname.endsWith(ext))) {
            return false;
        }
        if (options.exclude?.some(matches)) {
            return false;
        }
        if (options.include?.length && !options.include.some(matches)) {
            return false;
        }
        return true;
    }
    catch {
        return false;
    }
}
|
|
72
|
+
/**
 * Promise-based wrapper around setTimeout.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
75
|
+
/**
 * Link-following crawler.
 *
 * Starting from one URL it fetches pages through a shared client and a
 * RequestPool, records one result per fetched HTML page (or per failure),
 * and feeds newly discovered links through a FIFO queue until the depth or
 * page limits are reached, the frontier is empty, or abort() is called.
 *
 * NOTE(review): `respectRobotsTxt` is stored in `options` but nothing in
 * this class reads it — robots.txt is not consulted here.
 */
export class Spider {
    // Resolved options (caller values merged with defaults).
    options;
    // HTTP client created by createClient(); used for every page fetch.
    client;
    // RequestPool that runs crawlPage() tasks.
    pool;
    // Normalized URLs that have been scheduled (not necessarily finished).
    visited = new Set();
    // Discovered links waiting to be scheduled: { url, depth }.
    queue = [];
    // One entry per fetched HTML page or failed request.
    results = [];
    // { url, error } for every failed request.
    errors = [];
    // Hostname of the start URL; used for the same-domain filter.
    baseHost = '';
    running = false;
    aborted = false;
    // Number of scheduled-but-unfinished tasks (maintained, not read here).
    pendingCount = 0;
    /**
     * @param {object} [options]
     * @param {number} [options.maxDepth=4] - Links deeper than this are not scheduled.
     * @param {number} [options.maxPages=100] - Stop scheduling once results + in-flight reach this.
     * @param {boolean} [options.sameDomain=true] - Restrict links to the start URL's hostname.
     * @param {number} [options.concurrency=5] - Passed to RequestPool.
     * @param {number} [options.timeout=10000] - Passed to the client.
     * @param {number} [options.delay=100] - When > 0, passed to RequestPool as `interval`
     *   with `requestsPerInterval: 1`.
     * @param {string} [options.userAgent='Recker Spider/1.0'] - User-Agent header.
     * @param {boolean} [options.respectRobotsTxt=true] - Stored only; see class note.
     * @param {RegExp[]} [options.exclude] - URLs matching any of these are skipped.
     * @param {RegExp[]} [options.include] - When non-empty, only matching URLs are followed.
     * @param {(result: object) => void} [options.onPage] - Called once per recorded result.
     * @param {(progress: object) => void} [options.onProgress] - Called before each fetch.
     */
    constructor(options = {}) {
        this.options = {
            maxDepth: options.maxDepth ?? 4,
            maxPages: options.maxPages ?? 100,
            sameDomain: options.sameDomain ?? true,
            concurrency: options.concurrency ?? 5,
            timeout: options.timeout ?? 10000,
            delay: options.delay ?? 100,
            userAgent: options.userAgent ?? 'Recker Spider/1.0',
            respectRobotsTxt: options.respectRobotsTxt ?? true,
            exclude: options.exclude,
            include: options.include,
            onPage: options.onPage,
            onProgress: options.onProgress,
        };
        this.client = createClient({
            // Every request below uses an absolute URL; presumably this is
            // only a placeholder required by createClient — TODO confirm.
            baseUrl: 'http://localhost',
            timeout: this.options.timeout,
            headers: {
                'User-Agent': this.options.userAgent,
            },
        });
        this.pool = new RequestPool({
            concurrency: this.options.concurrency,
            // Presumably rate-limits to one request per `delay` ms; verify
            // against RequestPool.
            ...(this.options.delay > 0 ? {
                requestsPerInterval: 1,
                interval: this.options.delay,
            } : {}),
        });
    }
    /**
     * Crawl starting at `startUrl` and resolve with the collected results.
     *
     * All per-crawl state on the instance is reset first, so an instance
     * can be reused for sequential crawls.
     *
     * @param {string} startUrl - Absolute URL to start from.
     * @returns {Promise<{startUrl: string, pages: object[], visited: Set<string>, duration: number, errors: object[]}>}
     * @throws {TypeError} When `startUrl` cannot be parsed as a URL.
     * @throws Whatever an `onPage` / `onProgress` callback throws.
     */
    async crawl(startUrl) {
        const startTime = performance.now();
        const normalizedStart = normalizeUrl(startUrl);
        // Throws for an unparseable start URL before any state is modified.
        this.baseHost = new URL(normalizedStart).hostname;
        this.visited.clear();
        this.queue = [];
        this.results = [];
        this.errors = [];
        this.running = true;
        this.aborted = false;
        this.pendingCount = 0;
        // In-flight tasks keyed by normalized URL.
        const pending = new Map();
        const scheduleUrl = (item) => {
            const normalized = normalizeUrl(item.url);
            if (this.visited.has(normalized))
                return;
            if (pending.has(normalized))
                return;
            if (item.depth > this.options.maxDepth)
                return;
            if (this.results.length + pending.size >= this.options.maxPages)
                return;
            this.visited.add(normalized);
            this.pendingCount++;
            const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
                .finally(() => {
                pending.delete(normalized);
                this.pendingCount--;
            });
            pending.set(normalized, promise);
        };
        try {
            scheduleUrl({ url: normalizedStart, depth: 0 });
            while ((pending.size > 0 || this.queue.length > 0) && !this.aborted) {
                // Drain as much of the queue as the page budget allows.
                while (this.queue.length > 0 && !this.aborted) {
                    const item = this.queue.shift();
                    if (this.results.length + pending.size >= this.options.maxPages)
                        break;
                    scheduleUrl(item);
                }
                // Wait for any one task to finish; it may have queued links.
                if (pending.size > 0) {
                    await Promise.race(pending.values());
                }
            }
            // After abort(), still wait for requests that are already in flight.
            if (pending.size > 0) {
                await Promise.all(pending.values());
            }
        }
        finally {
            // Reset even when the crawl rejects, so isRunning() never
            // reports a crawl that has already failed.
            this.running = false;
        }
        return {
            startUrl: normalizedStart,
            pages: this.results,
            visited: this.visited,
            duration: Math.round(performance.now() - startTime),
            errors: this.errors,
        };
    }
    /**
     * Fetch one page, record its result and enqueue its links.
     *
     * Responses whose content-type does not contain "text/html" are ignored
     * (no result is recorded). A failed fetch or parse is recorded with
     * `status: 0` and an `error` message.
     *
     * @param {{url: string, depth: number}} item - Normalized URL and its depth.
     * @returns {Promise<void>}
     */
    async crawlPage(item) {
        const startTime = performance.now();
        this.options.onProgress?.({
            crawled: this.results.length,
            queued: this.queue.length,
            total: this.visited.size,
            currentUrl: item.url,
            depth: item.depth,
        });
        let result;
        // Only the fetch/parse work is guarded: a failure there is a page
        // error. Recording and callbacks happen afterwards so that a
        // throwing onPage cannot turn a successful page into a second,
        // failed entry.
        try {
            const response = await this.client.get(item.url);
            const status = response.status;
            const contentType = response.headers.get('content-type') || '';
            if (!contentType.includes('text/html')) {
                return;
            }
            const html = await response.text();
            const doc = await ScrapeDocument.create(html, { baseUrl: item.url });
            const title = doc.selectFirst('title').text() || '';
            const links = doc.links({ absolute: true });
            result = {
                url: item.url,
                status,
                title,
                depth: item.depth,
                links,
                duration: Math.round(performance.now() - startTime),
            };
        }
        catch (error) {
            // Rejections are not guaranteed to be Error instances.
            const message = error?.message ?? String(error);
            result = {
                url: item.url,
                status: 0,
                title: '',
                depth: item.depth,
                links: [],
                duration: Math.round(performance.now() - startTime),
                error: message,
            };
            this.errors.push({ url: item.url, error: message });
        }
        this.results.push(result);
        this.options.onPage?.(result);
        for (const link of result.links) {
            if (!link.href)
                continue;
            const normalized = normalizeUrl(link.href);
            if (this.visited.has(normalized))
                continue;
            if (!shouldCrawl(normalized, this.baseHost, this.options))
                continue;
            this.queue.push({
                url: normalized,
                depth: item.depth + 1,
            });
        }
    }
    /**
     * Ask the current crawl to stop scheduling new pages. Requests already
     * in flight are still awaited by crawl().
     */
    abort() {
        this.aborted = true;
    }
    /** @returns {boolean} Whether a crawl() call is currently in progress. */
    isRunning() {
        return this.running;
    }
    /**
     * Snapshot of crawl counters. `currentUrl` and `depth` are always the
     * placeholder values '' and 0 here.
     *
     * @returns {{crawled: number, queued: number, total: number, currentUrl: string, depth: number}}
     */
    getProgress() {
        return {
            crawled: this.results.length,
            queued: this.queue.length,
            total: this.visited.size,
            currentUrl: '',
            depth: 0,
        };
    }
}
|
|
247
|
+
/**
 * One-shot convenience wrapper: build a Spider with `options` and crawl
 * from `url`.
 *
 * @param {string} url - Start URL, forwarded to Spider#crawl.
 * @param {object} [options] - Forwarded unchanged to the Spider constructor.
 * @returns {Promise<object>} Whatever Spider#crawl resolves with.
 */
export async function spider(url, options) {
    return new Spider(options).crawl(url);
}
|
package/dist/seo/analyzer.js
CHANGED
|
@@ -57,6 +57,21 @@ export class SeoAnalyzer {
|
|
|
57
57
|
checks,
|
|
58
58
|
title: meta.title ? { text: meta.title, length: meta.title.length } : undefined,
|
|
59
59
|
metaDescription: meta.description ? { text: meta.description, length: meta.description.length } : undefined,
|
|
60
|
+
openGraph: Object.keys(og).length > 0 ? {
|
|
61
|
+
title: og.title,
|
|
62
|
+
description: og.description,
|
|
63
|
+
image: Array.isArray(og.image) ? og.image[0] : og.image,
|
|
64
|
+
url: og.url,
|
|
65
|
+
type: og.type,
|
|
66
|
+
siteName: og.siteName,
|
|
67
|
+
} : undefined,
|
|
68
|
+
twitterCard: Object.keys(twitter).length > 0 ? {
|
|
69
|
+
card: twitter.card,
|
|
70
|
+
title: twitter.title,
|
|
71
|
+
description: twitter.description,
|
|
72
|
+
image: Array.isArray(twitter.image) ? twitter.image[0] : twitter.image,
|
|
73
|
+
site: twitter.site,
|
|
74
|
+
} : undefined,
|
|
60
75
|
headings: headings,
|
|
61
76
|
content,
|
|
62
77
|
links: linkAnalysis,
|
|
@@ -72,6 +87,16 @@ export class SeoAnalyzer {
|
|
|
72
87
|
buildRuleContext(data) {
|
|
73
88
|
const { meta, og, twitter, jsonLd, headings, content, linkAnalysis, imageAnalysis, links } = data;
|
|
74
89
|
const htmlLang = this.$('html').attr('lang');
|
|
90
|
+
const hreflangTags = [];
|
|
91
|
+
this.$('link[rel="alternate"][hreflang]').each((_, el) => {
|
|
92
|
+
const $el = this.$(el);
|
|
93
|
+
const lang = $el.attr('hreflang');
|
|
94
|
+
const href = $el.attr('href');
|
|
95
|
+
if (lang && href) {
|
|
96
|
+
hreflangTags.push({ lang, href });
|
|
97
|
+
}
|
|
98
|
+
});
|
|
99
|
+
const ogLocale = this.$('meta[property="og:locale"]').attr('content');
|
|
75
100
|
const genericTexts = SEO_THRESHOLDS.links.genericTexts;
|
|
76
101
|
const genericTextLinks = links.filter((l) => {
|
|
77
102
|
const text = l.text?.toLowerCase().trim();
|
|
@@ -196,6 +221,8 @@ export class SeoAnalyzer {
|
|
|
196
221
|
titleMatchesH1: meta.title && h1Text ? meta.title.toLowerCase().trim() === h1Text.toLowerCase().trim() : undefined,
|
|
197
222
|
...this.analyzeUrlQuality(),
|
|
198
223
|
...this.analyzeJsRendering(content),
|
|
224
|
+
hreflangTags: hreflangTags.length > 0 ? hreflangTags : undefined,
|
|
225
|
+
ogLocale,
|
|
199
226
|
};
|
|
200
227
|
}
|
|
201
228
|
analyzeUrlQuality() {
|
package/dist/seo/index.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
export { SeoAnalyzer, analyzeSeo } from './analyzer.js';
|
|
2
|
+
export { SeoSpider, seoSpider } from './seo-spider.js';
|
|
3
|
+
export type { SeoSpiderOptions, SeoPageResult, SiteWideIssue, SeoSpiderResult, } from './seo-spider.js';
|
|
2
4
|
export { SeoRulesEngine, createRulesEngine, SEO_THRESHOLDS, ALL_SEO_RULES, } from './rules/index.js';
|
|
3
|
-
export type { SeoReport, SeoCheckResult, SeoStatus, HeadingAnalysis, HeadingInfo, ContentMetrics, LinkAnalysis, ImageAnalysis, SocialMetaAnalysis, TechnicalSeo, SeoAnalyzerOptions, } from './types.js';
|
|
5
|
+
export type { SeoReport, SeoCheckResult, SeoStatus, SeoTiming, HeadingAnalysis, HeadingInfo, ContentMetrics, LinkAnalysis, ImageAnalysis, SocialMetaAnalysis, TechnicalSeo, SeoAnalyzerOptions, } from './types.js';
|
|
4
6
|
export type { SeoRule, RuleContext, RuleResult, RuleEvidence, RuleCategory, RuleSeverity, RulesEngineOptions, } from './rules/index.js';
|
|
5
7
|
export type { SeoAnalyzerFullOptions } from './analyzer.js';
|
package/dist/seo/index.js
CHANGED