recker 1.0.28-next.32fe8ef → 1.0.28-next.4354f8c
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/tui/shell.d.ts +1 -0
- package/dist/cli/tui/shell.js +112 -1
- package/dist/scrape/index.d.ts +2 -0
- package/dist/scrape/index.js +1 -0
- package/dist/scrape/spider.d.ts +59 -0
- package/dist/scrape/spider.js +209 -0
- package/dist/seo/analyzer.js +12 -0
- package/dist/seo/rules/accessibility.js +620 -54
- package/dist/seo/rules/best-practices.d.ts +2 -0
- package/dist/seo/rules/best-practices.js +188 -0
- package/dist/seo/rules/crawl.d.ts +2 -0
- package/dist/seo/rules/crawl.js +307 -0
- package/dist/seo/rules/cwv.d.ts +2 -0
- package/dist/seo/rules/cwv.js +337 -0
- package/dist/seo/rules/ecommerce.d.ts +2 -0
- package/dist/seo/rules/ecommerce.js +252 -0
- package/dist/seo/rules/i18n.d.ts +2 -0
- package/dist/seo/rules/i18n.js +222 -0
- package/dist/seo/rules/index.d.ts +32 -0
- package/dist/seo/rules/index.js +71 -0
- package/dist/seo/rules/internal-linking.d.ts +2 -0
- package/dist/seo/rules/internal-linking.js +375 -0
- package/dist/seo/rules/local.d.ts +2 -0
- package/dist/seo/rules/local.js +265 -0
- package/dist/seo/rules/pwa.d.ts +2 -0
- package/dist/seo/rules/pwa.js +302 -0
- package/dist/seo/rules/readability.d.ts +2 -0
- package/dist/seo/rules/readability.js +255 -0
- package/dist/seo/rules/security.js +406 -28
- package/dist/seo/rules/social.d.ts +2 -0
- package/dist/seo/rules/social.js +373 -0
- package/dist/seo/rules/types.d.ts +155 -0
- package/package.json +1 -1
package/dist/cli/tui/shell.d.ts
CHANGED
package/dist/cli/tui/shell.js
CHANGED
@@ -10,6 +10,7 @@ import { inspectTLS } from '../../utils/tls-inspector.js';
 import { getSecurityRecords } from '../../utils/dns-toolkit.js';
 import { rdap } from '../../utils/rdap.js';
 import { ScrapeDocument } from '../../scrape/document.js';
+import { Spider } from '../../scrape/spider.js';
 import colors from '../../utils/colors.js';
 import { getShellSearch } from './shell-search.js';
 import { openSearchPanel } from './search-panel.js';
@@ -94,7 +95,7 @@ export class RekShell {
         'get', 'post', 'put', 'delete', 'patch', 'head', 'options',
         'ws', 'udp', 'load', 'chat', 'ai',
         'whois', 'tls', 'ssl', 'security', 'ip', 'dns', 'dns:propagate', 'dns:email', 'rdap', 'ping',
-        'scrap', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
+        'scrap', 'spider', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
         '?', 'search', 'suggest', 'example',
         'help', 'clear', 'exit', 'set', 'url', 'vars', 'env'
     ];
@@ -368,6 +369,9 @@ export class RekShell {
            case 'scrap':
                await this.runScrap(parts[1]);
                return;
+            case 'spider':
+                await this.runSpider(parts.slice(1));
+                return;
            case '$':
                await this.runSelect(parts.slice(1).join(' '));
                return;
@@ -1434,6 +1438,105 @@ ${colors.bold('Network:')}
        }
        console.log('');
    }
+    async runSpider(args) {
+        let url = '';
+        let maxDepth = 3;
+        let maxPages = 100;
+        let concurrency = 5;
+        for (let i = 0; i < args.length; i++) {
+            const arg = args[i];
+            if (arg.startsWith('--depth=') || arg.startsWith('-d=')) {
+                maxDepth = parseInt(arg.split('=')[1]) || 3;
+            }
+            else if (arg.startsWith('--limit=') || arg.startsWith('-l=')) {
+                maxPages = parseInt(arg.split('=')[1]) || 100;
+            }
+            else if (arg.startsWith('--concurrency=') || arg.startsWith('-c=')) {
+                concurrency = parseInt(arg.split('=')[1]) || 5;
+            }
+            else if (!arg.startsWith('-')) {
+                url = arg;
+            }
+        }
+        if (!url) {
+            if (!this.baseUrl) {
+                console.log(colors.yellow('Usage: spider <url> [options]'));
+                console.log(colors.gray(' Options:'));
+                console.log(colors.gray('   --depth=3         Max crawl depth'));
+                console.log(colors.gray('   --limit=100       Max pages to crawl'));
+                console.log(colors.gray('   --concurrency=5   Concurrent requests'));
+                console.log(colors.gray(' Examples:'));
+                console.log(colors.gray('   spider https://example.com'));
+                console.log(colors.gray('   spider https://example.com --depth=2 --limit=50'));
+                return;
+            }
+            url = this.baseUrl;
+        }
+        else if (!url.startsWith('http')) {
+            url = `https://${url}`;
+        }
+        console.log(colors.cyan(`\nSpider starting: ${url}`));
+        console.log(colors.gray(`  Depth: ${maxDepth} | Limit: ${maxPages} | Concurrency: ${concurrency}`));
+        console.log('');
+        const spider = new Spider({
+            maxDepth,
+            maxPages,
+            concurrency,
+            sameDomain: true,
+            delay: 100,
+            onProgress: (progress) => {
+                process.stdout.write(`\r${colors.gray('  Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth}  `);
+            },
+        });
+        try {
+            const result = await spider.crawl(url);
+            process.stdout.write('\r' + ' '.repeat(80) + '\r');
+            console.log(colors.green(`\n✔ Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
+            console.log(`  ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
+            console.log(`  ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
+            console.log(`  ${colors.cyan('Errors')}: ${result.errors.length}`);
+            const byDepth = new Map();
+            for (const page of result.pages) {
+                byDepth.set(page.depth, (byDepth.get(page.depth) || 0) + 1);
+            }
+            console.log(colors.bold('\n  Pages by depth:'));
+            for (const [depth, count] of Array.from(byDepth.entries()).sort((a, b) => a[0] - b[0])) {
+                const bar = '█'.repeat(Math.min(count, 40));
+                console.log(`    ${colors.gray(`d${depth}:`)} ${bar} ${count}`);
+            }
+            const topPages = [...result.pages]
+                .filter(p => !p.error)
+                .sort((a, b) => b.links.length - a.links.length)
+                .slice(0, 10);
+            if (topPages.length > 0) {
+                console.log(colors.bold('\n  Top pages by outgoing links:'));
+                for (const page of topPages) {
+                    const title = page.title.slice(0, 40) || new URL(page.url).pathname;
+                    console.log(`    ${colors.cyan(page.links.length.toString().padStart(3))} ${title}`);
+                }
+            }
+            if (result.errors.length > 0 && result.errors.length <= 10) {
+                console.log(colors.bold('\n  Errors:'));
+                for (const err of result.errors) {
+                    const path = new URL(err.url).pathname;
+                    console.log(`    ${colors.red('✗')} ${path.slice(0, 40)} ${colors.gray('→')} ${err.error.slice(0, 30)}`);
+                }
+            }
+            else if (result.errors.length > 10) {
+                console.log(colors.yellow(`\n  ${result.errors.length} errors (showing first 10):`));
+                for (const err of result.errors.slice(0, 10)) {
+                    const path = new URL(err.url).pathname;
+                    console.log(`    ${colors.red('✗')} ${path.slice(0, 40)} ${colors.gray('→')} ${err.error.slice(0, 30)}`);
+                }
+            }
+            this.lastResponse = result;
+            console.log(colors.gray('\n  Result stored in lastResponse. Use $links to explore.'));
+        }
+        catch (error) {
+            console.error(colors.red(`Spider failed: ${error.message}`));
+        }
+        console.log('');
+    }
    async runSelect(selector) {
        if (!this.currentDoc) {
            console.log(colors.yellow('No document loaded. Use "scrap <url>" first.'));
@@ -2358,6 +2461,13 @@ ${colors.bold('Network:')}
 ${colors.green('$beautify:save [f]')} Save beautified code to file.
 ${colors.green('$table <selector>')} Extract table as data.
 
+${colors.bold('Web Crawler:')}
+${colors.green('spider <url>')} Crawl website following internal links.
+${colors.gray('Options:')}
+${colors.white('--depth=3')} ${colors.gray('Maximum depth to crawl')}
+${colors.white('--limit=100')} ${colors.gray('Maximum pages to crawl')}
+${colors.white('--concurrency=5')} ${colors.gray('Parallel requests')}
+
 ${colors.bold('Documentation:')}
 ${colors.green('? <query>')} Search Recker documentation.
 ${colors.green('search <query>')} Alias for ? (hybrid fuzzy+semantic search).
@@ -2375,6 +2485,7 @@ ${colors.bold('Network:')}
 › post /post name="Neo" active:=true role:Admin
 › load /heavy-endpoint users=100 mode=stress
 › chat openai gpt-5.1
+› spider https://example.com --depth=2 --limit=50
 `);
    }
}
package/dist/scrape/index.d.ts
CHANGED
@@ -1,4 +1,6 @@
 export { ScrapeDocument } from './document.js';
 export { ScrapeElement } from './element.js';
+export { Spider, spider } from './spider.js';
+export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
 export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
 export type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedFormField, ExtractedTable, ExtractedScript, ExtractedStyle, ExtractionSchema, ExtractionSchemaField, ScrapeOptions, LinkExtractionOptions, ImageExtractionOptions, } from './types.js';
package/dist/scrape/index.js
CHANGED
@@ -1,3 +1,4 @@
 export { ScrapeDocument } from './document.js';
 export { ScrapeElement } from './element.js';
+export { Spider, spider } from './spider.js';
 export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
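For orientation, the new crawler exports sit next to the existing scrape helpers in this entry file. A minimal import sketch follows; the `recker/scrape` subpath specifier is an assumption about the package's export map, which this diff does not show:

    // Hypothetical consumer-side import; adjust the specifier to the package's actual exports.
    import { Spider, spider, ScrapeDocument } from 'recker/scrape';
    import type { SpiderOptions, SpiderResult } from 'recker/scrape';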
package/dist/scrape/spider.d.ts
ADDED
@@ -0,0 +1,59 @@
+import type { ExtractedLink } from './types.js';
+export interface SpiderOptions {
+    maxDepth?: number;
+    maxPages?: number;
+    sameDomain?: boolean;
+    concurrency?: number;
+    timeout?: number;
+    delay?: number;
+    exclude?: RegExp[];
+    include?: RegExp[];
+    userAgent?: string;
+    respectRobotsTxt?: boolean;
+    onPage?: (result: SpiderPageResult) => void;
+    onProgress?: (progress: SpiderProgress) => void;
+}
+export interface SpiderPageResult {
+    url: string;
+    status: number;
+    title: string;
+    depth: number;
+    links: ExtractedLink[];
+    duration: number;
+    error?: string;
+}
+export interface SpiderProgress {
+    crawled: number;
+    queued: number;
+    total: number;
+    currentUrl: string;
+    depth: number;
+}
+export interface SpiderResult {
+    startUrl: string;
+    pages: SpiderPageResult[];
+    visited: Set<string>;
+    duration: number;
+    errors: Array<{
+        url: string;
+        error: string;
+    }>;
+}
+export declare class Spider {
+    private options;
+    private client;
+    private visited;
+    private queue;
+    private results;
+    private errors;
+    private baseHost;
+    private running;
+    private aborted;
+    constructor(options?: SpiderOptions);
+    crawl(startUrl: string): Promise<SpiderResult>;
+    private crawlPage;
+    abort(): void;
+    isRunning(): boolean;
+    getProgress(): SpiderProgress;
+}
+export declare function spider(url: string, options?: SpiderOptions): Promise<SpiderResult>;
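Taken together, these declarations describe the crawler's public surface. The sketch below is a hedged usage example based only on the types above; the relative import path refers to this dist file and may differ from the package's published entry point:

    import { Spider, type SpiderResult } from './spider.js';

    const crawler = new Spider({
        maxDepth: 2,
        maxPages: 50,
        concurrency: 5,
        sameDomain: true,
        exclude: [/\/admin\//],                 // illustrative pattern
        onProgress: (p) => console.log(`crawled ${p.crawled}, queued ${p.queued}`),
    });

    const result: SpiderResult = await crawler.crawl('https://example.com');
    console.log(result.pages.length, 'pages in', result.duration, 'ms');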
package/dist/scrape/spider.js
ADDED
@@ -0,0 +1,209 @@
+import { createClient } from '../core/client.js';
+import { ScrapeDocument } from './document.js';
+function normalizeUrl(urlStr) {
+    try {
+        const url = new URL(urlStr);
+        url.hash = '';
+        url.searchParams.sort();
+        if (url.pathname !== '/' && url.pathname.endsWith('/')) {
+            url.pathname = url.pathname.slice(0, -1);
+        }
+        return url.toString();
+    }
+    catch {
+        return urlStr;
+    }
+}
+function shouldCrawl(url, baseHost, options) {
+    try {
+        const parsed = new URL(url);
+        if (!['http:', 'https:'].includes(parsed.protocol)) {
+            return false;
+        }
+        if (options.sameDomain !== false && parsed.hostname !== baseHost) {
+            return false;
+        }
+        const skipExtensions = [
+            '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
+            '.pdf', '.zip', '.tar', '.gz', '.rar',
+            '.mp3', '.mp4', '.avi', '.mov', '.webm',
+            '.css', '.js', '.json', '.xml', '.rss',
+            '.woff', '.woff2', '.ttf', '.eot',
+        ];
+        const pathname = parsed.pathname.toLowerCase();
+        if (skipExtensions.some(ext => pathname.endsWith(ext))) {
+            return false;
+        }
+        if (options.exclude?.some(pattern => pattern.test(url))) {
+            return false;
+        }
+        if (options.include?.length) {
+            if (!options.include.some(pattern => pattern.test(url))) {
+                return false;
+            }
+        }
+        return true;
+    }
+    catch {
+        return false;
+    }
+}
+function sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+}
+export class Spider {
+    options;
+    client;
+    visited = new Set();
+    queue = [];
+    results = [];
+    errors = [];
+    baseHost = '';
+    running = false;
+    aborted = false;
+    constructor(options = {}) {
+        this.options = {
+            maxDepth: options.maxDepth ?? 3,
+            maxPages: options.maxPages ?? 100,
+            sameDomain: options.sameDomain ?? true,
+            concurrency: options.concurrency ?? 5,
+            timeout: options.timeout ?? 10000,
+            delay: options.delay ?? 100,
+            userAgent: options.userAgent ?? 'Recker Spider/1.0',
+            respectRobotsTxt: options.respectRobotsTxt ?? true,
+            exclude: options.exclude,
+            include: options.include,
+            onPage: options.onPage,
+            onProgress: options.onProgress,
+        };
+        this.client = createClient({
+            baseUrl: 'http://localhost',
+            timeout: this.options.timeout,
+            headers: {
+                'User-Agent': this.options.userAgent,
+            },
+        });
+    }
+    async crawl(startUrl) {
+        const startTime = performance.now();
+        const normalizedStart = normalizeUrl(startUrl);
+        this.baseHost = new URL(normalizedStart).hostname;
+        this.visited.clear();
+        this.queue = [{ url: normalizedStart, depth: 0 }];
+        this.results = [];
+        this.errors = [];
+        this.running = true;
+        this.aborted = false;
+        while (this.queue.length > 0 && !this.aborted) {
+            if (this.results.length >= this.options.maxPages) {
+                break;
+            }
+            const batch = [];
+            while (batch.length < this.options.concurrency && this.queue.length > 0) {
+                const item = this.queue.shift();
+                const normalized = normalizeUrl(item.url);
+                if (this.visited.has(normalized)) {
+                    continue;
+                }
+                if (item.depth > this.options.maxDepth) {
+                    continue;
+                }
+                this.visited.add(normalized);
+                batch.push({ ...item, url: normalized });
+            }
+            if (batch.length === 0) {
+                continue;
+            }
+            await Promise.all(batch.map(item => this.crawlPage(item)));
+            if (this.options.delay > 0 && this.queue.length > 0) {
+                await sleep(this.options.delay);
+            }
+        }
+        this.running = false;
+        return {
+            startUrl: normalizedStart,
+            pages: this.results,
+            visited: this.visited,
+            duration: Math.round(performance.now() - startTime),
+            errors: this.errors,
+        };
+    }
+    async crawlPage(item) {
+        const startTime = performance.now();
+        this.options.onProgress?.({
+            crawled: this.results.length,
+            queued: this.queue.length,
+            total: this.visited.size,
+            currentUrl: item.url,
+            depth: item.depth,
+        });
+        try {
+            const response = await this.client.get(item.url);
+            const status = response.status;
+            const contentType = response.headers.get('content-type') || '';
+            if (!contentType.includes('text/html')) {
+                return;
+            }
+            const html = await response.text();
+            const doc = await ScrapeDocument.create(html, { baseUrl: item.url });
+            const title = doc.selectFirst('title').text() || '';
+            const links = doc.links({ absolute: true });
+            const result = {
+                url: item.url,
+                status,
+                title,
+                depth: item.depth,
+                links,
+                duration: Math.round(performance.now() - startTime),
+            };
+            this.results.push(result);
+            this.options.onPage?.(result);
+            for (const link of links) {
+                if (!link.href)
+                    continue;
+                const normalized = normalizeUrl(link.href);
+                if (this.visited.has(normalized))
+                    continue;
+                if (!shouldCrawl(normalized, this.baseHost, this.options))
+                    continue;
+                this.queue.push({
+                    url: normalized,
+                    depth: item.depth + 1,
+                });
+            }
+        }
+        catch (error) {
+            const errorResult = {
+                url: item.url,
+                status: 0,
+                title: '',
+                depth: item.depth,
+                links: [],
+                duration: Math.round(performance.now() - startTime),
+                error: error.message,
+            };
+            this.results.push(errorResult);
+            this.errors.push({ url: item.url, error: error.message });
+            this.options.onPage?.(errorResult);
+        }
+    }
+    abort() {
+        this.aborted = true;
+    }
+    isRunning() {
+        return this.running;
+    }
+    getProgress() {
+        return {
+            crawled: this.results.length,
+            queued: this.queue.length,
+            total: this.visited.size,
+            currentUrl: '',
+            depth: 0,
+        };
+    }
+}
+export async function spider(url, options) {
+    const s = new Spider(options);
+    return s.crawl(url);
+}
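The crawl loop above is breadth-first in batches: up to `concurrency` URLs are dequeued, fetched in parallel with `Promise.all`, an optional `delay` separates batches, and non-HTML responses, already-visited URLs, and over-depth URLs are skipped. A short hedged sketch of the `spider()` convenience helper with a per-page callback (the URL and patterns are illustrative):

    import { spider } from './spider.js';

    const result = await spider('https://example.com', {
        maxDepth: 2,
        include: [/\/docs\//],           // only follow documentation URLs (illustrative)
        onPage: (page) => {
            if (page.error) console.warn('failed:', page.url, page.error);
            else console.log(page.status, page.url, `${page.links.length} links`);
        },
    });
    console.log(`visited ${result.visited.size} unique URLs`);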
package/dist/seo/analyzer.js
CHANGED
@@ -72,6 +72,16 @@ export class SeoAnalyzer {
    buildRuleContext(data) {
        const { meta, og, twitter, jsonLd, headings, content, linkAnalysis, imageAnalysis, links } = data;
        const htmlLang = this.$('html').attr('lang');
+        const hreflangTags = [];
+        this.$('link[rel="alternate"][hreflang]').each((_, el) => {
+            const $el = this.$(el);
+            const lang = $el.attr('hreflang');
+            const href = $el.attr('href');
+            if (lang && href) {
+                hreflangTags.push({ lang, href });
+            }
+        });
+        const ogLocale = this.$('meta[property="og:locale"]').attr('content');
        const genericTexts = SEO_THRESHOLDS.links.genericTexts;
        const genericTextLinks = links.filter((l) => {
            const text = l.text?.toLowerCase().trim();
@@ -196,6 +206,8 @@ export class SeoAnalyzer {
            titleMatchesH1: meta.title && h1Text ? meta.title.toLowerCase().trim() === h1Text.toLowerCase().trim() : undefined,
            ...this.analyzeUrlQuality(),
            ...this.analyzeJsRendering(content),
+            hreflangTags: hreflangTags.length > 0 ? hreflangTags : undefined,
+            ogLocale,
        };
    }
    analyzeUrlQuality() {
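For reference, the analyzer change above collects `<link rel="alternate" hreflang="...">` pairs and the `og:locale` meta value into the rule context that the new i18n rules consume. A hedged sketch of the resulting shape, with illustrative values:

    // Given markup such as:
    //   <link rel="alternate" hreflang="en" href="https://example.com/en/">
    //   <link rel="alternate" hreflang="de" href="https://example.com/de/">
    //   <meta property="og:locale" content="en_US">
    // buildRuleContext() would expose roughly:
    const hreflangTags = [
        { lang: 'en', href: 'https://example.com/en/' },
        { lang: 'de', href: 'https://example.com/de/' },
    ];
    const ogLocale = 'en_US';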