recker 1.0.28-next.dea5ff5 → 1.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/tui/shell.d.ts +1 -0
- package/dist/cli/tui/shell.js +339 -5
- package/dist/scrape/index.d.ts +2 -0
- package/dist/scrape/index.js +1 -0
- package/dist/scrape/spider.d.ts +61 -0
- package/dist/scrape/spider.js +250 -0
- package/dist/seo/analyzer.js +15 -0
- package/dist/seo/index.d.ts +3 -1
- package/dist/seo/index.js +1 -0
- package/dist/seo/seo-spider.d.ts +47 -0
- package/dist/seo/seo-spider.js +362 -0
- package/dist/seo/types.d.ts +24 -0
- package/package.json +1 -1
package/dist/cli/tui/shell.d.ts
CHANGED
package/dist/cli/tui/shell.js
CHANGED
|
@@ -10,11 +10,12 @@ import { inspectTLS } from '../../utils/tls-inspector.js';
|
|
|
10
10
|
import { getSecurityRecords } from '../../utils/dns-toolkit.js';
|
|
11
11
|
import { rdap } from '../../utils/rdap.js';
|
|
12
12
|
import { ScrapeDocument } from '../../scrape/document.js';
|
|
13
|
+
import { Spider } from '../../scrape/spider.js';
|
|
13
14
|
import colors from '../../utils/colors.js';
|
|
14
15
|
import { getShellSearch } from './shell-search.js';
|
|
15
16
|
import { openSearchPanel } from './search-panel.js';
|
|
16
17
|
import { ScrollBuffer, parseScrollKey, parseMouseScroll, disableMouseReporting } from './scroll-buffer.js';
|
|
17
|
-
import { analyzeSeo } from '../../seo/index.js';
|
|
18
|
+
import { analyzeSeo, SeoSpider } from '../../seo/index.js';
|
|
18
19
|
let highlight;
|
|
19
20
|
async function initDependencies() {
|
|
20
21
|
if (!highlight) {
|
|
@@ -94,7 +95,7 @@ export class RekShell {
|
|
|
94
95
|
'get', 'post', 'put', 'delete', 'patch', 'head', 'options',
|
|
95
96
|
'ws', 'udp', 'load', 'chat', 'ai',
|
|
96
97
|
'whois', 'tls', 'ssl', 'security', 'ip', 'dns', 'dns:propagate', 'dns:email', 'rdap', 'ping',
|
|
97
|
-
'scrap', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
|
|
98
|
+
'scrap', 'spider', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
|
|
98
99
|
'?', 'search', 'suggest', 'example',
|
|
99
100
|
'help', 'clear', 'exit', 'set', 'url', 'vars', 'env'
|
|
100
101
|
];
|
|
@@ -368,6 +369,9 @@ export class RekShell {
|
|
|
368
369
|
case 'scrap':
|
|
369
370
|
await this.runScrap(parts[1]);
|
|
370
371
|
return;
|
|
372
|
+
case 'spider':
|
|
373
|
+
await this.runSpider(parts.slice(1));
|
|
374
|
+
return;
|
|
371
375
|
case '$':
|
|
372
376
|
await this.runSelect(parts.slice(1).join(' '));
|
|
373
377
|
return;
|
|
@@ -972,11 +976,20 @@ ${colors.bold('Details:')}`);
|
|
|
972
976
|
const html = await res.text();
|
|
973
977
|
const duration = Math.round(performance.now() - startTime);
|
|
974
978
|
const report = await analyzeSeo(html, { baseUrl: url });
|
|
979
|
+
const t = res.timings;
|
|
980
|
+
report.timing = {
|
|
981
|
+
ttfb: t?.firstByte ? Math.round(t.firstByte) : undefined,
|
|
982
|
+
total: t?.total ? Math.round(t.total) : duration,
|
|
983
|
+
dns: t?.dns ? Math.round(t.dns) : undefined,
|
|
984
|
+
tcp: t?.tcp ? Math.round(t.tcp) : undefined,
|
|
985
|
+
tls: t?.tls ? Math.round(t.tls) : undefined,
|
|
986
|
+
download: t?.content ? Math.round(t.content) : undefined,
|
|
987
|
+
};
|
|
975
988
|
if (jsonOutput) {
|
|
976
989
|
const jsonResult = {
|
|
977
990
|
url,
|
|
978
991
|
analyzedAt: new Date().toISOString(),
|
|
979
|
-
|
|
992
|
+
timing: report.timing,
|
|
980
993
|
score: report.score,
|
|
981
994
|
grade: report.grade,
|
|
982
995
|
title: report.title,
|
|
@@ -985,8 +998,9 @@ ${colors.bold('Details:')}`);
|
|
|
985
998
|
headings: report.headings,
|
|
986
999
|
links: report.links,
|
|
987
1000
|
images: report.images,
|
|
988
|
-
openGraph: report.
|
|
989
|
-
twitterCard: report.
|
|
1001
|
+
openGraph: report.openGraph,
|
|
1002
|
+
twitterCard: report.twitterCard,
|
|
1003
|
+
social: report.social,
|
|
990
1004
|
jsonLd: report.jsonLd,
|
|
991
1005
|
technical: report.technical,
|
|
992
1006
|
checks: report.checks,
|
|
@@ -1024,6 +1038,50 @@ Grade: ${gradeColor(colors.bold(report.grade))} (${report.score}/100)
|
|
|
1024
1038
|
: report.metaDescription.text;
|
|
1025
1039
|
console.log(colors.bold('Description:') + ` ${desc} ` + colors.gray(`(${report.metaDescription.length} chars)`));
|
|
1026
1040
|
}
|
|
1041
|
+
if (report.openGraph && Object.values(report.openGraph).some(v => v)) {
|
|
1042
|
+
console.log('');
|
|
1043
|
+
console.log(colors.bold(colors.cyan('OpenGraph:')));
|
|
1044
|
+
if (report.openGraph.title) {
|
|
1045
|
+
const ogTitle = report.openGraph.title.length > 60
|
|
1046
|
+
? report.openGraph.title.slice(0, 57) + '...'
|
|
1047
|
+
: report.openGraph.title;
|
|
1048
|
+
console.log(` ${colors.gray('og:title:')} ${ogTitle}`);
|
|
1049
|
+
}
|
|
1050
|
+
if (report.openGraph.description) {
|
|
1051
|
+
const ogDesc = report.openGraph.description.length > 60
|
|
1052
|
+
? report.openGraph.description.slice(0, 57) + '...'
|
|
1053
|
+
: report.openGraph.description;
|
|
1054
|
+
console.log(` ${colors.gray('og:description:')} ${ogDesc}`);
|
|
1055
|
+
}
|
|
1056
|
+
if (report.openGraph.image) {
|
|
1057
|
+
const ogImg = report.openGraph.image.length > 50
|
|
1058
|
+
? '...' + report.openGraph.image.slice(-47)
|
|
1059
|
+
: report.openGraph.image;
|
|
1060
|
+
console.log(` ${colors.gray('og:image:')} ${colors.blue(ogImg)}`);
|
|
1061
|
+
}
|
|
1062
|
+
if (report.openGraph.type) {
|
|
1063
|
+
console.log(` ${colors.gray('og:type:')} ${report.openGraph.type}`);
|
|
1064
|
+
}
|
|
1065
|
+
}
|
|
1066
|
+
if (report.timing) {
|
|
1067
|
+
const t = report.timing;
|
|
1068
|
+
console.log('');
|
|
1069
|
+
console.log(colors.bold('Timing:'));
|
|
1070
|
+
const timings = [];
|
|
1071
|
+
if (t.dns !== undefined)
|
|
1072
|
+
timings.push(`DNS ${t.dns}ms`);
|
|
1073
|
+
if (t.tcp !== undefined)
|
|
1074
|
+
timings.push(`TCP ${t.tcp}ms`);
|
|
1075
|
+
if (t.tls !== undefined)
|
|
1076
|
+
timings.push(`TLS ${t.tls}ms`);
|
|
1077
|
+
if (t.ttfb !== undefined)
|
|
1078
|
+
timings.push(`TTFB ${t.ttfb}ms`);
|
|
1079
|
+
if (t.download !== undefined)
|
|
1080
|
+
timings.push(`Download ${t.download}ms`);
|
|
1081
|
+
if (t.total !== undefined)
|
|
1082
|
+
timings.push(`Total ${t.total}ms`);
|
|
1083
|
+
console.log(` ${timings.join(' → ')}`);
|
|
1084
|
+
}
|
|
1027
1085
|
if (report.content) {
|
|
1028
1086
|
console.log(colors.bold('Content:') + ` ${report.content.wordCount} words, ${report.content.paragraphCount} paragraphs, ~${report.content.readingTimeMinutes} min read`);
|
|
1029
1087
|
}
|
|
@@ -1434,6 +1492,274 @@ ${colors.bold('Network:')}
|
|
|
1434
1492
|
}
|
|
1435
1493
|
console.log('');
|
|
1436
1494
|
}
|
|
1495
|
+
async runSpider(args) {
|
|
1496
|
+
let url = '';
|
|
1497
|
+
let maxDepth = 3;
|
|
1498
|
+
let maxPages = 100;
|
|
1499
|
+
let concurrency = 5;
|
|
1500
|
+
let seoEnabled = false;
|
|
1501
|
+
let outputFile = '';
|
|
1502
|
+
for (let i = 0; i < args.length; i++) {
|
|
1503
|
+
const arg = args[i];
|
|
1504
|
+
if (arg.startsWith('depth=')) {
|
|
1505
|
+
maxDepth = parseInt(arg.split('=')[1]) || 4;
|
|
1506
|
+
}
|
|
1507
|
+
else if (arg.startsWith('limit=')) {
|
|
1508
|
+
maxPages = parseInt(arg.split('=')[1]) || 100;
|
|
1509
|
+
}
|
|
1510
|
+
else if (arg.startsWith('concurrency=')) {
|
|
1511
|
+
concurrency = parseInt(arg.split('=')[1]) || 5;
|
|
1512
|
+
}
|
|
1513
|
+
else if (arg === 'seo') {
|
|
1514
|
+
seoEnabled = true;
|
|
1515
|
+
}
|
|
1516
|
+
else if (arg.startsWith('output=')) {
|
|
1517
|
+
outputFile = arg.split('=')[1] || '';
|
|
1518
|
+
}
|
|
1519
|
+
else if (!arg.includes('=')) {
|
|
1520
|
+
url = arg;
|
|
1521
|
+
}
|
|
1522
|
+
}
|
|
1523
|
+
if (!url) {
|
|
1524
|
+
if (!this.baseUrl) {
|
|
1525
|
+
console.log(colors.yellow('Usage: spider <url> [options]'));
|
|
1526
|
+
console.log(colors.gray(' Options:'));
|
|
1527
|
+
console.log(colors.gray(' depth=4 Max crawl depth'));
|
|
1528
|
+
console.log(colors.gray(' limit=100 Max pages to crawl'));
|
|
1529
|
+
console.log(colors.gray(' concurrency=5 Concurrent requests'));
|
|
1530
|
+
console.log(colors.gray(' seo Enable SEO analysis'));
|
|
1531
|
+
console.log(colors.gray(' output=file.json Save JSON report'));
|
|
1532
|
+
console.log(colors.gray(' Examples:'));
|
|
1533
|
+
console.log(colors.gray(' spider example.com'));
|
|
1534
|
+
console.log(colors.gray(' spider example.com depth=2 limit=50'));
|
|
1535
|
+
console.log(colors.gray(' spider example.com seo output=seo-report.json'));
|
|
1536
|
+
return;
|
|
1537
|
+
}
|
|
1538
|
+
url = this.baseUrl;
|
|
1539
|
+
}
|
|
1540
|
+
else if (!url.startsWith('http')) {
|
|
1541
|
+
url = `https://${url}`;
|
|
1542
|
+
}
|
|
1543
|
+
console.log(colors.cyan(`\nSpider starting: ${url}`));
|
|
1544
|
+
const modeLabel = seoEnabled ? colors.magenta(' + SEO') : '';
|
|
1545
|
+
console.log(colors.gray(` Depth: ${maxDepth} | Limit: ${maxPages} | Concurrency: ${concurrency}${modeLabel}`));
|
|
1546
|
+
if (outputFile) {
|
|
1547
|
+
console.log(colors.gray(` Output: ${outputFile}`));
|
|
1548
|
+
}
|
|
1549
|
+
console.log('');
|
|
1550
|
+
if (seoEnabled) {
|
|
1551
|
+
const seoSpider = new SeoSpider({
|
|
1552
|
+
maxDepth,
|
|
1553
|
+
maxPages,
|
|
1554
|
+
concurrency,
|
|
1555
|
+
sameDomain: true,
|
|
1556
|
+
delay: 100,
|
|
1557
|
+
seo: true,
|
|
1558
|
+
output: outputFile || undefined,
|
|
1559
|
+
onProgress: (progress) => {
|
|
1560
|
+
process.stdout.write(`\r${colors.gray(' Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth} `);
|
|
1561
|
+
},
|
|
1562
|
+
});
|
|
1563
|
+
try {
|
|
1564
|
+
const result = await seoSpider.crawl(url);
|
|
1565
|
+
process.stdout.write('\r' + ' '.repeat(80) + '\r');
|
|
1566
|
+
console.log(colors.green(`\n✔ SEO Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
|
|
1567
|
+
console.log(` ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
|
|
1568
|
+
console.log(` ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
|
|
1569
|
+
console.log(` ${colors.cyan('Avg SEO Score')}: ${result.summary.avgScore}/100`);
|
|
1570
|
+
const responseTimes = result.pages.filter(p => p.duration > 0).map(p => p.duration);
|
|
1571
|
+
const avgResponseTime = responseTimes.length > 0
|
|
1572
|
+
? Math.round(responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length)
|
|
1573
|
+
: 0;
|
|
1574
|
+
const minResponseTime = responseTimes.length > 0 ? Math.min(...responseTimes) : 0;
|
|
1575
|
+
const maxResponseTime = responseTimes.length > 0 ? Math.max(...responseTimes) : 0;
|
|
1576
|
+
const reqPerSec = result.duration > 0 ? (result.pages.length / (result.duration / 1000)).toFixed(1) : '0';
|
|
1577
|
+
const statusCounts = new Map();
|
|
1578
|
+
for (const page of result.pages) {
|
|
1579
|
+
const status = page.status || 0;
|
|
1580
|
+
statusCounts.set(status, (statusCounts.get(status) || 0) + 1);
|
|
1581
|
+
}
|
|
1582
|
+
let totalInternalLinks = 0;
|
|
1583
|
+
let totalExternalLinks = 0;
|
|
1584
|
+
let totalImages = 0;
|
|
1585
|
+
let imagesWithoutAlt = 0;
|
|
1586
|
+
let pagesWithoutTitle = 0;
|
|
1587
|
+
let pagesWithoutDescription = 0;
|
|
1588
|
+
for (const page of result.pages) {
|
|
1589
|
+
if (page.seoReport) {
|
|
1590
|
+
totalInternalLinks += page.seoReport.links?.internal || 0;
|
|
1591
|
+
totalExternalLinks += page.seoReport.links?.external || 0;
|
|
1592
|
+
totalImages += page.seoReport.images?.total || 0;
|
|
1593
|
+
imagesWithoutAlt += page.seoReport.images?.withoutAlt || 0;
|
|
1594
|
+
if (!page.seoReport.title?.text)
|
|
1595
|
+
pagesWithoutTitle++;
|
|
1596
|
+
if (!page.seoReport.metaDescription?.text)
|
|
1597
|
+
pagesWithoutDescription++;
|
|
1598
|
+
}
|
|
1599
|
+
}
|
|
1600
|
+
console.log(colors.bold('\n Performance:'));
|
|
1601
|
+
console.log(` ${colors.gray('Avg Response:')} ${avgResponseTime}ms`);
|
|
1602
|
+
console.log(` ${colors.gray('Min/Max:')} ${minResponseTime}ms / ${maxResponseTime}ms`);
|
|
1603
|
+
console.log(` ${colors.gray('Throughput:')} ${reqPerSec} req/s`);
|
|
1604
|
+
console.log(colors.bold('\n HTTP Status:'));
|
|
1605
|
+
const sortedStatuses = Array.from(statusCounts.entries()).sort((a, b) => b[1] - a[1]);
|
|
1606
|
+
for (const [status, count] of sortedStatuses.slice(0, 5)) {
|
|
1607
|
+
const statusLabel = status === 0 ? 'Error' : status.toString();
|
|
1608
|
+
const statusColor = status >= 400 || status === 0 ? colors.red :
|
|
1609
|
+
status >= 300 ? colors.yellow : colors.green;
|
|
1610
|
+
const pct = ((count / result.pages.length) * 100).toFixed(0);
|
|
1611
|
+
console.log(` ${statusColor(statusLabel.padEnd(5))} ${count.toString().padStart(3)} (${pct}%)`);
|
|
1612
|
+
}
|
|
1613
|
+
console.log(colors.bold('\n Content:'));
|
|
1614
|
+
console.log(` ${colors.gray('Internal links:')} ${totalInternalLinks.toLocaleString()}`);
|
|
1615
|
+
console.log(` ${colors.gray('External links:')} ${totalExternalLinks.toLocaleString()}`);
|
|
1616
|
+
console.log(` ${colors.gray('Images:')} ${totalImages.toLocaleString()} (${imagesWithoutAlt} missing alt)`);
|
|
1617
|
+
console.log(` ${colors.gray('Missing title:')} ${pagesWithoutTitle}`);
|
|
1618
|
+
console.log(` ${colors.gray('Missing desc:')} ${pagesWithoutDescription}`);
|
|
1619
|
+
console.log(colors.bold('\n SEO Summary:'));
|
|
1620
|
+
const { summary } = result;
|
|
1621
|
+
console.log(` ${colors.red('✗')} Pages with errors: ${summary.pagesWithErrors}`);
|
|
1622
|
+
console.log(` ${colors.yellow('⚠')} Pages with warnings: ${summary.pagesWithWarnings}`);
|
|
1623
|
+
console.log(` ${colors.magenta('⚐')} Duplicate titles: ${summary.duplicateTitles}`);
|
|
1624
|
+
console.log(` ${colors.magenta('⚐')} Duplicate descriptions:${summary.duplicateDescriptions}`);
|
|
1625
|
+
console.log(` ${colors.magenta('⚐')} Duplicate H1s: ${summary.duplicateH1s}`);
|
|
1626
|
+
console.log(` ${colors.gray('○')} Orphan pages: ${summary.orphanPages}`);
|
|
1627
|
+
if (result.siteWideIssues.length > 0) {
|
|
1628
|
+
console.log(colors.bold('\n Site-Wide Issues:'));
|
|
1629
|
+
for (const issue of result.siteWideIssues.slice(0, 10)) {
|
|
1630
|
+
const icon = issue.severity === 'error' ? colors.red('✗') :
|
|
1631
|
+
issue.severity === 'warning' ? colors.yellow('⚠') : colors.gray('○');
|
|
1632
|
+
console.log(` ${icon} ${issue.message}`);
|
|
1633
|
+
if (issue.value) {
|
|
1634
|
+
const truncatedValue = issue.value.length > 50 ? issue.value.slice(0, 47) + '...' : issue.value;
|
|
1635
|
+
console.log(` ${colors.gray(`"${truncatedValue}"`)}`);
|
|
1636
|
+
}
|
|
1637
|
+
const uniquePaths = [...new Set(issue.affectedUrls.map(u => new URL(u).pathname))];
|
|
1638
|
+
if (uniquePaths.length <= 3) {
|
|
1639
|
+
for (const path of uniquePaths) {
|
|
1640
|
+
console.log(` ${colors.gray('→')} ${path}`);
|
|
1641
|
+
}
|
|
1642
|
+
}
|
|
1643
|
+
else {
|
|
1644
|
+
console.log(` ${colors.gray(`→ ${uniquePaths.length} pages affected`)}`);
|
|
1645
|
+
}
|
|
1646
|
+
}
|
|
1647
|
+
if (result.siteWideIssues.length > 10) {
|
|
1648
|
+
console.log(colors.gray(` ... and ${result.siteWideIssues.length - 10} more issues`));
|
|
1649
|
+
}
|
|
1650
|
+
}
|
|
1651
|
+
const pagesWithScores = result.pages
|
|
1652
|
+
.filter(p => p.seoReport)
|
|
1653
|
+
.sort((a, b) => (a.seoReport?.score || 0) - (b.seoReport?.score || 0));
|
|
1654
|
+
const seenPaths = new Set();
|
|
1655
|
+
const uniquePages = pagesWithScores.filter(page => {
|
|
1656
|
+
const path = new URL(page.url).pathname;
|
|
1657
|
+
if (seenPaths.has(path))
|
|
1658
|
+
return false;
|
|
1659
|
+
seenPaths.add(path);
|
|
1660
|
+
return true;
|
|
1661
|
+
});
|
|
1662
|
+
if (uniquePages.length > 0) {
|
|
1663
|
+
console.log(colors.bold('\n Pages by SEO Score:'));
|
|
1664
|
+
const worstPages = uniquePages.slice(0, 5);
|
|
1665
|
+
for (const page of worstPages) {
|
|
1666
|
+
const score = page.seoReport?.score || 0;
|
|
1667
|
+
const grade = page.seoReport?.grade || '?';
|
|
1668
|
+
const path = new URL(page.url).pathname;
|
|
1669
|
+
const scoreColor = score >= 80 ? colors.green : score >= 60 ? colors.yellow : colors.red;
|
|
1670
|
+
console.log(` ${scoreColor(`${score.toString().padStart(3)}`)} ${colors.gray(`[${grade}]`)} ${path.slice(0, 50)}`);
|
|
1671
|
+
}
|
|
1672
|
+
if (uniquePages.length > 5) {
|
|
1673
|
+
console.log(colors.gray(` ... and ${uniquePages.length - 5} more pages`));
|
|
1674
|
+
}
|
|
1675
|
+
}
|
|
1676
|
+
if (outputFile) {
|
|
1677
|
+
console.log(colors.green(`\n Report saved to: ${outputFile}`));
|
|
1678
|
+
}
|
|
1679
|
+
this.lastResponse = result;
|
|
1680
|
+
console.log(colors.gray('\n Result stored in lastResponse.'));
|
|
1681
|
+
}
|
|
1682
|
+
catch (error) {
|
|
1683
|
+
console.error(colors.red(`SEO Spider failed: ${error.message}`));
|
|
1684
|
+
}
|
|
1685
|
+
}
|
|
1686
|
+
else {
|
|
1687
|
+
const spider = new Spider({
|
|
1688
|
+
maxDepth,
|
|
1689
|
+
maxPages,
|
|
1690
|
+
concurrency,
|
|
1691
|
+
sameDomain: true,
|
|
1692
|
+
delay: 100,
|
|
1693
|
+
onProgress: (progress) => {
|
|
1694
|
+
process.stdout.write(`\r${colors.gray(' Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth} `);
|
|
1695
|
+
},
|
|
1696
|
+
});
|
|
1697
|
+
try {
|
|
1698
|
+
const result = await spider.crawl(url);
|
|
1699
|
+
process.stdout.write('\r' + ' '.repeat(80) + '\r');
|
|
1700
|
+
console.log(colors.green(`\n✔ Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
|
|
1701
|
+
console.log(` ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
|
|
1702
|
+
console.log(` ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
|
|
1703
|
+
console.log(` ${colors.cyan('Errors')}: ${result.errors.length}`);
|
|
1704
|
+
const byDepth = new Map();
|
|
1705
|
+
for (const page of result.pages) {
|
|
1706
|
+
byDepth.set(page.depth, (byDepth.get(page.depth) || 0) + 1);
|
|
1707
|
+
}
|
|
1708
|
+
console.log(colors.bold('\n Pages by depth:'));
|
|
1709
|
+
for (const [depth, count] of Array.from(byDepth.entries()).sort((a, b) => a[0] - b[0])) {
|
|
1710
|
+
const bar = '█'.repeat(Math.min(count, 40));
|
|
1711
|
+
console.log(` ${colors.gray(`d${depth}:`)} ${bar} ${count}`);
|
|
1712
|
+
}
|
|
1713
|
+
const topPages = [...result.pages]
|
|
1714
|
+
.filter(p => !p.error)
|
|
1715
|
+
.sort((a, b) => b.links.length - a.links.length)
|
|
1716
|
+
.slice(0, 10);
|
|
1717
|
+
if (topPages.length > 0) {
|
|
1718
|
+
console.log(colors.bold('\n Top pages by outgoing links:'));
|
|
1719
|
+
for (const page of topPages) {
|
|
1720
|
+
const title = page.title.slice(0, 40) || new URL(page.url).pathname;
|
|
1721
|
+
console.log(` ${colors.cyan(page.links.length.toString().padStart(3))} ${title}`);
|
|
1722
|
+
}
|
|
1723
|
+
}
|
|
1724
|
+
const formatError = (error) => {
|
|
1725
|
+
const statusMatch = error.match(/status code (\d{3})/i);
|
|
1726
|
+
if (statusMatch) {
|
|
1727
|
+
return `HTTP ${statusMatch[1]}`;
|
|
1728
|
+
}
|
|
1729
|
+
return error.length > 50 ? error.slice(0, 47) + '...' : error;
|
|
1730
|
+
};
|
|
1731
|
+
if (result.errors.length > 0 && result.errors.length <= 10) {
|
|
1732
|
+
console.log(colors.bold('\n Errors:'));
|
|
1733
|
+
for (const err of result.errors) {
|
|
1734
|
+
const path = new URL(err.url).pathname;
|
|
1735
|
+
console.log(` ${colors.red('✗')} ${path.padEnd(25)} ${colors.gray('→')} ${formatError(err.error)}`);
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1738
|
+
else if (result.errors.length > 10) {
|
|
1739
|
+
console.log(colors.yellow(`\n ${result.errors.length} errors (showing first 10):`));
|
|
1740
|
+
for (const err of result.errors.slice(0, 10)) {
|
|
1741
|
+
const path = new URL(err.url).pathname;
|
|
1742
|
+
console.log(` ${colors.red('✗')} ${path.padEnd(25)} ${colors.gray('→')} ${formatError(err.error)}`);
|
|
1743
|
+
}
|
|
1744
|
+
}
|
|
1745
|
+
if (outputFile) {
|
|
1746
|
+
const reportData = {
|
|
1747
|
+
...result,
|
|
1748
|
+
visited: Array.from(result.visited),
|
|
1749
|
+
generatedAt: new Date().toISOString(),
|
|
1750
|
+
};
|
|
1751
|
+
await fs.writeFile(outputFile, JSON.stringify(reportData, null, 2), 'utf-8');
|
|
1752
|
+
console.log(colors.green(`\n Report saved to: ${outputFile}`));
|
|
1753
|
+
}
|
|
1754
|
+
this.lastResponse = result;
|
|
1755
|
+
console.log(colors.gray('\n Result stored in lastResponse. Use $links to explore.'));
|
|
1756
|
+
}
|
|
1757
|
+
catch (error) {
|
|
1758
|
+
console.error(colors.red(`Spider failed: ${error.message}`));
|
|
1759
|
+
}
|
|
1760
|
+
}
|
|
1761
|
+
console.log('');
|
|
1762
|
+
}
|
|
1437
1763
|
async runSelect(selector) {
|
|
1438
1764
|
if (!this.currentDoc) {
|
|
1439
1765
|
console.log(colors.yellow('No document loaded. Use "scrap <url>" first.'));
|
|
@@ -2358,6 +2684,13 @@ ${colors.bold('Network:')}
|
|
|
2358
2684
|
${colors.green('$beautify:save [f]')} Save beautified code to file.
|
|
2359
2685
|
${colors.green('$table <selector>')} Extract table as data.
|
|
2360
2686
|
|
|
2687
|
+
${colors.bold('Web Crawler:')}
|
|
2688
|
+
${colors.green('spider <url>')} Crawl website following internal links.
|
|
2689
|
+
${colors.gray('Options:')}
|
|
2690
|
+
${colors.white('--depth=4')} ${colors.gray('Maximum depth to crawl')}
|
|
2691
|
+
${colors.white('--limit=100')} ${colors.gray('Maximum pages to crawl')}
|
|
2692
|
+
${colors.white('--concurrency=5')} ${colors.gray('Parallel requests')}
|
|
2693
|
+
|
|
2361
2694
|
${colors.bold('Documentation:')}
|
|
2362
2695
|
${colors.green('? <query>')} Search Recker documentation.
|
|
2363
2696
|
${colors.green('search <query>')} Alias for ? (hybrid fuzzy+semantic search).
|
|
@@ -2375,6 +2708,7 @@ ${colors.bold('Network:')}
|
|
|
2375
2708
|
› post /post name="Neo" active:=true role:Admin
|
|
2376
2709
|
› load /heavy-endpoint users=100 mode=stress
|
|
2377
2710
|
› chat openai gpt-5.1
|
|
2711
|
+
› spider example.com depth=2 limit=50
|
|
2378
2712
|
`);
|
|
2379
2713
|
}
|
|
2380
2714
|
}
|
package/dist/scrape/index.d.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
export { ScrapeDocument } from './document.js';
|
|
2
2
|
export { ScrapeElement } from './element.js';
|
|
3
|
+
export { Spider, spider } from './spider.js';
|
|
4
|
+
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
3
5
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
4
6
|
export type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedFormField, ExtractedTable, ExtractedScript, ExtractedStyle, ExtractionSchema, ExtractionSchemaField, ScrapeOptions, LinkExtractionOptions, ImageExtractionOptions, } from './types.js';
|
package/dist/scrape/index.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
export { ScrapeDocument } from './document.js';
|
|
2
2
|
export { ScrapeElement } from './element.js';
|
|
3
|
+
export { Spider, spider } from './spider.js';
|
|
3
4
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import type { ExtractedLink } from './types.js';
|
|
2
|
+
/**
 * Configuration accepted by the Spider constructor. Every field is optional;
 * the defaults noted below are the ones applied in spider.js.
 */
export interface SpiderOptions {
    /** Links deeper than this (start page = depth 0) are not scheduled. Default 4. */
    maxDepth?: number;
    /** Upper bound on pages scheduled per crawl. Default 100. */
    maxPages?: number;
    /** Unless set to false, links whose hostname differs from the start URL's are skipped. Default true. */
    sameDomain?: boolean;
    /** Concurrency passed to the internal request pool. Default 5. */
    concurrency?: number;
    /** Timeout passed to the HTTP client. Default 10000 (presumably milliseconds — confirm against the client). */
    timeout?: number;
    /** When > 0, used as the request pool interval with one request per interval. Default 100. */
    delay?: number;
    /** URLs matching any of these patterns are not crawled. */
    exclude?: RegExp[];
    /** When non-empty, only URLs matching at least one of these patterns are crawled. */
    include?: RegExp[];
    /** Value sent as the User-Agent header. Default 'Recker Spider/1.0'. */
    userAgent?: string;
    /** Default true. NOTE(review): no robots.txt handling is visible in the reviewed part of spider.js — confirm it is honoured. */
    respectRobotsTxt?: boolean;
    /** Called with the page result after an HTML page has been fetched and parsed. */
    onPage?: (result: SpiderPageResult) => void;
    /** Called at the start of each page crawl with a progress snapshot. */
    onProgress?: (progress: SpiderProgress) => void;
}
|
|
16
|
+
/** Outcome of crawling a single URL. */
export interface SpiderPageResult {
    /** Normalized URL that was requested. */
    url: string;
    /** HTTP status of the response; 0 when the request failed with an error. */
    status: number;
    /** Text of the page's <title> element, or '' when absent or on error. */
    title: string;
    /** Link distance from the start URL (start page = 0). */
    depth: number;
    /** Links extracted from the page as absolute URLs; empty on error. */
    links: ExtractedLink[];
    /** Rounded elapsed time spent on this page, in milliseconds. */
    duration: number;
    /** Error message, set only when the page could not be crawled. */
    error?: string;
}
|
|
25
|
+
/** Progress snapshot handed to SpiderOptions.onProgress. */
export interface SpiderProgress {
    /** Number of page results recorded so far. */
    crawled: number;
    /** Number of discovered links waiting in the queue. */
    queued: number;
    /** Number of URLs marked as visited so far. */
    total: number;
    /** URL of the page about to be fetched. */
    currentUrl: string;
    /** Depth of the page about to be fetched. */
    depth: number;
}
|
|
32
|
+
/** Aggregate result returned by Spider.crawl(). */
export interface SpiderResult {
    /** Start URL after normalization. */
    startUrl: string;
    /** One entry per crawled page, including pages that failed. */
    pages: SpiderPageResult[];
    /** Normalized URLs that were scheduled during the crawl. */
    visited: Set<string>;
    /** Rounded total crawl time in milliseconds. */
    duration: number;
    /** URL and error message for every page that failed. */
    errors: Array<{
        url: string;
        error: string;
    }>;
}
|
|
42
|
+
/**
 * Crawler that follows links starting from a single URL.
 * crawl() resets the instance's internal state before it starts.
 */
export declare class Spider {
    private options;
    private client;
    private pool;
    private visited;
    private queue;
    private results;
    private errors;
    private baseHost;
    private running;
    private aborted;
    private pendingCount;
    /** @param options Crawl configuration; omitted fields fall back to defaults. */
    constructor(options?: SpiderOptions);
    /** Crawls from startUrl and resolves with the collected pages, visited URLs and errors. */
    crawl(startUrl: string): Promise<SpiderResult>;
    private crawlPage;
    /** NOTE(review): implementation not visible here — presumably sets the flag that stops the crawl loop. */
    abort(): void;
    /** NOTE(review): implementation not visible here — presumably true while crawl() is in progress. */
    isRunning(): boolean;
    /** NOTE(review): implementation not visible here — presumably returns a snapshot of the current counters. */
    getProgress(): SpiderProgress;
}
|
|
61
|
+
/** NOTE(review): implementation not visible here — presumably a shorthand for `new Spider(options).crawl(url)`; confirm. */
export declare function spider(url: string, options?: SpiderOptions): Promise<SpiderResult>;
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import { createClient } from '../core/client.js';
|
|
2
|
+
import { ScrapeDocument } from './document.js';
|
|
3
|
+
import { RequestPool } from '../utils/request-pool.js';
|
|
4
|
+
// Query parameters that carry tracking or cache-busting data and are dropped
// during URL normalization. Matching is done on the lower-cased name.
const TRACKING_PARAMS = new Set([
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'gclid', 'gclsrc', 'dclid',
    'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
    'msclkid',
    'twclid',
    'ref', 'referer', 'referrer', 'source',
    '_ga', '_gl', '_hsenc', '_hsmi',
    'mc_cid', 'mc_eid',
    'yclid', 'ymclid',
    'igshid',
    '_t', 't', 'timestamp', 'ts', 'nocache', 'cache',
]);

/**
 * Produce a canonical form of a URL for de-duplication: the fragment is
 * removed, tracking parameters are dropped, the remaining query parameters
 * are sorted, and a trailing slash is trimmed from any path other than "/".
 *
 * @param {string} urlStr - URL to normalize.
 * @returns {string} The normalized URL, or `urlStr` unchanged when it cannot be parsed.
 */
function normalizeUrl(urlStr) {
    let parsed;
    try {
        parsed = new URL(urlStr);
    }
    catch {
        return urlStr;
    }
    parsed.hash = '';
    // Snapshot the names first so deletion does not disturb iteration.
    const trackingKeys = [...parsed.searchParams.keys()]
        .filter((name) => TRACKING_PARAMS.has(name.toLowerCase()));
    for (const name of trackingKeys) {
        parsed.searchParams.delete(name);
    }
    parsed.searchParams.sort();
    const { pathname } = parsed;
    if (pathname !== '/' && pathname.endsWith('/')) {
        parsed.pathname = pathname.slice(0, -1);
    }
    return parsed.toString();
}
|
|
38
|
+
// File extensions that never point at an HTML page. Defined once instead of
// being rebuilt on every shouldCrawl() call.
const NON_PAGE_EXTENSIONS = [
    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
    '.pdf', '.zip', '.tar', '.gz', '.rar',
    '.mp3', '.mp4', '.avi', '.mov', '.webm',
    '.css', '.js', '.json', '.xml', '.rss',
    '.woff', '.woff2', '.ttf', '.eot',
];

// True when `url` matches at least one pattern. lastIndex is reset before
// each test because global/sticky regexes are stateful across .test() calls;
// without the reset the same URL alternates between matching and not.
function matchesAnyPattern(patterns, url) {
    return patterns.some((pattern) => {
        pattern.lastIndex = 0;
        return pattern.test(url);
    });
}

/**
 * Decide whether a discovered link should be queued for crawling.
 *
 * A URL is rejected when it is not http(s), when it is on a different host
 * (unless `options.sameDomain` is false), when its path ends in a known
 * asset extension, when it matches an `exclude` pattern, or when `include`
 * patterns are given and none of them match.
 *
 * @param {string} url - Absolute URL of the candidate link.
 * @param {string} baseHost - Hostname of the crawl's start URL.
 * @param {{sameDomain?: boolean, exclude?: RegExp[], include?: RegExp[]}} options - Crawl filters.
 * @returns {boolean} True when the URL should be crawled; false otherwise,
 *   including when the URL cannot be parsed.
 */
function shouldCrawl(url, baseHost, options) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        return false;
    }
    if (options.sameDomain !== false && parsed.hostname !== baseHost) {
        return false;
    }
    const pathname = parsed.pathname.toLowerCase();
    if (NON_PAGE_EXTENSIONS.some((ext) => pathname.endsWith(ext))) {
        return false;
    }
    if (options.exclude?.length && matchesAnyPattern(options.exclude, url)) {
        return false;
    }
    if (options.include?.length && !matchesAnyPattern(options.include, url)) {
        return false;
    }
    return true;
}
|
|
72
|
+
/**
 * Pause for a given amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
75
|
+
export class Spider {
|
|
76
|
+
options;
|
|
77
|
+
client;
|
|
78
|
+
pool;
|
|
79
|
+
visited = new Set();
|
|
80
|
+
queue = [];
|
|
81
|
+
results = [];
|
|
82
|
+
errors = [];
|
|
83
|
+
baseHost = '';
|
|
84
|
+
running = false;
|
|
85
|
+
aborted = false;
|
|
86
|
+
pendingCount = 0;
|
|
87
|
+
constructor(options = {}) {
|
|
88
|
+
this.options = {
|
|
89
|
+
maxDepth: options.maxDepth ?? 4,
|
|
90
|
+
maxPages: options.maxPages ?? 100,
|
|
91
|
+
sameDomain: options.sameDomain ?? true,
|
|
92
|
+
concurrency: options.concurrency ?? 5,
|
|
93
|
+
timeout: options.timeout ?? 10000,
|
|
94
|
+
delay: options.delay ?? 100,
|
|
95
|
+
userAgent: options.userAgent ?? 'Recker Spider/1.0',
|
|
96
|
+
respectRobotsTxt: options.respectRobotsTxt ?? true,
|
|
97
|
+
exclude: options.exclude,
|
|
98
|
+
include: options.include,
|
|
99
|
+
onPage: options.onPage,
|
|
100
|
+
onProgress: options.onProgress,
|
|
101
|
+
};
|
|
102
|
+
this.client = createClient({
|
|
103
|
+
baseUrl: 'http://localhost',
|
|
104
|
+
timeout: this.options.timeout,
|
|
105
|
+
headers: {
|
|
106
|
+
'User-Agent': this.options.userAgent,
|
|
107
|
+
},
|
|
108
|
+
});
|
|
109
|
+
this.pool = new RequestPool({
|
|
110
|
+
concurrency: this.options.concurrency,
|
|
111
|
+
...(this.options.delay > 0 ? {
|
|
112
|
+
requestsPerInterval: 1,
|
|
113
|
+
interval: this.options.delay,
|
|
114
|
+
} : {}),
|
|
115
|
+
});
|
|
116
|
+
}
|
|
117
|
+
async crawl(startUrl) {
|
|
118
|
+
const startTime = performance.now();
|
|
119
|
+
const normalizedStart = normalizeUrl(startUrl);
|
|
120
|
+
this.baseHost = new URL(normalizedStart).hostname;
|
|
121
|
+
this.visited.clear();
|
|
122
|
+
this.queue = [];
|
|
123
|
+
this.results = [];
|
|
124
|
+
this.errors = [];
|
|
125
|
+
this.running = true;
|
|
126
|
+
this.aborted = false;
|
|
127
|
+
this.pendingCount = 0;
|
|
128
|
+
const pending = new Map();
|
|
129
|
+
const scheduleUrl = (item) => {
|
|
130
|
+
const normalized = normalizeUrl(item.url);
|
|
131
|
+
if (this.visited.has(normalized))
|
|
132
|
+
return;
|
|
133
|
+
if (pending.has(normalized))
|
|
134
|
+
return;
|
|
135
|
+
if (item.depth > this.options.maxDepth)
|
|
136
|
+
return;
|
|
137
|
+
if (this.results.length + pending.size >= this.options.maxPages)
|
|
138
|
+
return;
|
|
139
|
+
this.visited.add(normalized);
|
|
140
|
+
this.pendingCount++;
|
|
141
|
+
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
142
|
+
.finally(() => {
|
|
143
|
+
pending.delete(normalized);
|
|
144
|
+
this.pendingCount--;
|
|
145
|
+
});
|
|
146
|
+
pending.set(normalized, promise);
|
|
147
|
+
};
|
|
148
|
+
scheduleUrl({ url: normalizedStart, depth: 0 });
|
|
149
|
+
while ((pending.size > 0 || this.queue.length > 0) && !this.aborted) {
|
|
150
|
+
while (this.queue.length > 0 && !this.aborted) {
|
|
151
|
+
const item = this.queue.shift();
|
|
152
|
+
if (this.results.length + pending.size >= this.options.maxPages)
|
|
153
|
+
break;
|
|
154
|
+
scheduleUrl(item);
|
|
155
|
+
}
|
|
156
|
+
if (pending.size > 0) {
|
|
157
|
+
await Promise.race(pending.values());
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
if (pending.size > 0) {
|
|
161
|
+
await Promise.all(pending.values());
|
|
162
|
+
}
|
|
163
|
+
this.running = false;
|
|
164
|
+
return {
|
|
165
|
+
startUrl: normalizedStart,
|
|
166
|
+
pages: this.results,
|
|
167
|
+
visited: this.visited,
|
|
168
|
+
duration: Math.round(performance.now() - startTime),
|
|
169
|
+
errors: this.errors,
|
|
170
|
+
};
|
|
171
|
+
}
|
|
172
|
+
/**
 * Fetches one queued page, records its result and enqueues the links it
 * contains.
 *
 * Reports progress before the request is made. Responses whose
 * content-type does not include `text/html` are dropped without producing
 * a result. Any failure inside the fetch/parse/enqueue sequence is turned
 * into an error result (status 0) and also appended to `this.errors`.
 *
 * @param {{ url: string, depth: number }} item - Page to crawl and its link depth.
 * @returns {Promise<void>}
 */
async crawlPage(item) {
    const began = performance.now();
    const elapsed = () => Math.round(performance.now() - began);
    const progress = {
        crawled: this.results.length,
        queued: this.queue.length,
        total: this.visited.size,
        currentUrl: item.url,
        depth: item.depth,
    };
    this.options.onProgress?.(progress);
    try {
        const response = await this.client.get(item.url);
        const status = response.status;
        const contentType = response.headers.get('content-type') || '';
        const isHtml = contentType.includes('text/html');
        if (!isHtml) {
            return;
        }
        const markup = await response.text();
        const doc = await ScrapeDocument.create(markup, { baseUrl: item.url });
        const title = doc.selectFirst('title').text() || '';
        const links = doc.links({ absolute: true });
        const pageResult = {
            url: item.url,
            status,
            title,
            depth: item.depth,
            links,
            duration: elapsed(),
        };
        this.results.push(pageResult);
        this.options.onPage?.(pageResult);
        // Queue every link that has not been scheduled yet and passes the crawl filter.
        const nextDepth = item.depth + 1;
        for (const link of links) {
            if (link.href) {
                const target = normalizeUrl(link.href);
                const isNew = !this.visited.has(target);
                if (isNew && shouldCrawl(target, this.baseHost, this.options)) {
                    this.queue.push({ url: target, depth: nextDepth });
                }
            }
        }
    }
    catch (error) {
        const failure = {
            url: item.url,
            status: 0,
            title: '',
            depth: item.depth,
            links: [],
            duration: elapsed(),
            error: error.message,
        };
        this.results.push(failure);
        this.errors.push({ url: item.url, error: error.message });
        this.options.onPage?.(failure);
    }
}
|
|
231
|
+
/**
 * Flags the crawl as aborted. The scheduling loops in `crawl()` test this
 * flag before scheduling more work; it does not cancel requests already
 * in flight.
 */
abort() {
    this.aborted = true;
}
|
|
234
|
+
/**
 * Tells whether a crawl is in progress.
 *
 * @returns {boolean} The `running` flag, which `crawl()` raises on entry
 *   and lowers just before it returns its result.
 */
isRunning() {
    return this.running;
}
|
|
237
|
+
/**
 * Builds a snapshot of the crawl counters.
 *
 * `currentUrl` and `depth` are always the placeholders `''` and `0` here,
 * because this snapshot is not tied to a particular page.
 *
 * @returns {{ crawled: number, queued: number, total: number, currentUrl: string, depth: number }}
 */
getProgress() {
    const { results, queue, visited } = this;
    const snapshot = {
        crawled: results.length,
        queued: queue.length,
        total: visited.size,
        currentUrl: '',
        depth: 0,
    };
    return snapshot;
}
|
|
246
|
+
}
|
|
247
|
+
/**
 * Convenience wrapper: creates a one-off {@link Spider} and crawls `url`.
 *
 * @param {string} url - Start URL of the crawl.
 * @param {object} [options] - Passed unchanged to the `Spider` constructor.
 * @returns {Promise<object>} Whatever `Spider#crawl` resolves with.
 */
export async function spider(url, options) {
    return new Spider(options).crawl(url);
}
|
package/dist/seo/analyzer.js
CHANGED
|
@@ -57,6 +57,21 @@ export class SeoAnalyzer {
|
|
|
57
57
|
checks,
|
|
58
58
|
title: meta.title ? { text: meta.title, length: meta.title.length } : undefined,
|
|
59
59
|
metaDescription: meta.description ? { text: meta.description, length: meta.description.length } : undefined,
|
|
60
|
+
openGraph: Object.keys(og).length > 0 ? {
|
|
61
|
+
title: og.title,
|
|
62
|
+
description: og.description,
|
|
63
|
+
image: Array.isArray(og.image) ? og.image[0] : og.image,
|
|
64
|
+
url: og.url,
|
|
65
|
+
type: og.type,
|
|
66
|
+
siteName: og.siteName,
|
|
67
|
+
} : undefined,
|
|
68
|
+
twitterCard: Object.keys(twitter).length > 0 ? {
|
|
69
|
+
card: twitter.card,
|
|
70
|
+
title: twitter.title,
|
|
71
|
+
description: twitter.description,
|
|
72
|
+
image: Array.isArray(twitter.image) ? twitter.image[0] : twitter.image,
|
|
73
|
+
site: twitter.site,
|
|
74
|
+
} : undefined,
|
|
60
75
|
headings: headings,
|
|
61
76
|
content,
|
|
62
77
|
links: linkAnalysis,
|
package/dist/seo/index.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
export { SeoAnalyzer, analyzeSeo } from './analyzer.js';
|
|
2
|
+
export { SeoSpider, seoSpider } from './seo-spider.js';
|
|
3
|
+
export type { SeoSpiderOptions, SeoPageResult, SiteWideIssue, SeoSpiderResult, } from './seo-spider.js';
|
|
2
4
|
export { SeoRulesEngine, createRulesEngine, SEO_THRESHOLDS, ALL_SEO_RULES, } from './rules/index.js';
|
|
3
|
-
export type { SeoReport, SeoCheckResult, SeoStatus, HeadingAnalysis, HeadingInfo, ContentMetrics, LinkAnalysis, ImageAnalysis, SocialMetaAnalysis, TechnicalSeo, SeoAnalyzerOptions, } from './types.js';
|
|
5
|
+
export type { SeoReport, SeoCheckResult, SeoStatus, SeoTiming, HeadingAnalysis, HeadingInfo, ContentMetrics, LinkAnalysis, ImageAnalysis, SocialMetaAnalysis, TechnicalSeo, SeoAnalyzerOptions, } from './types.js';
|
|
4
6
|
export type { SeoRule, RuleContext, RuleResult, RuleEvidence, RuleCategory, RuleSeverity, RulesEngineOptions, } from './rules/index.js';
|
|
5
7
|
export type { SeoAnalyzerFullOptions } from './analyzer.js';
|
package/dist/seo/index.js
CHANGED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { SpiderOptions, SpiderResult, SpiderPageResult } from '../scrape/spider.js';
|
|
2
|
+
import type { SeoReport } from './types.js';
|
|
3
|
+
/** Options accepted by `SeoSpider`, on top of the plain spider options. */
export interface SeoSpiderOptions extends SpiderOptions {
    /**
     * Enables per-page SEO analysis. When falsy, `crawl()` returns the raw
     * crawl result with no site-wide issues and a zeroed summary.
     */
    seo?: boolean;

    /** File path the SEO result is written to as JSON when analysis runs. */
    output?: string;

    /** Called with each page for which an SEO report was produced. */
    onSeoAnalysis?: (result: SeoPageResult) => void;
}
|
|
8
|
+
/** A crawled page, optionally carrying its SEO report. */
export interface SeoPageResult extends SpiderPageResult {
    /**
     * Left undefined when the page had an error, answered with a status of
     * 400 or above, or could not be analyzed.
     */
    seoReport?: SeoReport;
}
|
|
11
|
+
/** A problem that concerns a group of pages rather than a single one. */
export interface SiteWideIssue {
    /** Kind of issue. */
    type:
        | 'duplicate-title'
        | 'duplicate-description'
        | 'duplicate-h1'
        | 'missing-canonical'
        | 'orphan-page';

    /** How serious the issue is. */
    severity: 'error' | 'warning' | 'info';

    /** Human-readable summary. */
    message: string;

    /** URLs of the pages involved. */
    affectedUrls: string[];

    /** For the duplicate-* issues: the text the affected pages share. */
    value?: string;
}
|
|
18
|
+
/** Crawl result whose pages carry SEO reports, plus site-level findings. */
export interface SeoSpiderResult extends Omit<SpiderResult, 'pages'> {
    /** Every crawled page, with its SEO report where one was produced. */
    pages: SeoPageResult[];

    /** Issues spanning several pages. */
    siteWideIssues: SiteWideIssue[];

    /** Aggregated figures over `pages` and `siteWideIssues`. */
    summary: {
        /** Number of entries in `pages`. */
        totalPages: number;
        /** Pages whose report contains at least one failed check. */
        pagesWithErrors: number;
        /** Pages whose report contains at least one warning check. */
        pagesWithWarnings: number;
        /** Rounded mean score over pages that have a report; 0 when none do. */
        avgScore: number;
        /** Number of duplicate-title issues (one per shared title). */
        duplicateTitles: number;
        /** Number of duplicate-description issues (one per shared description). */
        duplicateDescriptions: number;
        /** Number of duplicate-h1 issues (one per shared heading). */
        duplicateH1s: number;
        /** Number of URLs reported by orphan-page issues. */
        orphanPages: number;
    };
}
|
|
32
|
+
/** Spider that can run an SEO analysis over the pages it crawls. */
export declare class SeoSpider {
    private spider;
    private options;
    private seoResults;

    /** @param options Crawl and SEO settings; defaults to an empty object. */
    constructor(options?: SeoSpiderOptions);

    /** Crawls from `startUrl` and resolves with the combined result. */
    crawl(startUrl: string): Promise<SeoSpiderResult>;

    private analyzePages;
    private createReportFromPageData;
    private detectSiteWideIssues;
    private calculateSummary;
    private scoreToGrade;
    private saveReport;

    /** Forwards the abort request to the underlying spider. */
    abort(): void;

    /** Reports whether the underlying spider is running. */
    isRunning(): boolean;
}
|
|
47
|
+
/** Creates a `SeoSpider` with `options` and crawls `url` with it. */
export declare function seoSpider(
    url: string,
    options?: SeoSpiderOptions,
): Promise<SeoSpiderResult>;
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
import { Spider } from '../scrape/spider.js';
|
|
2
|
+
import { analyzeSeo } from './analyzer.js';
|
|
3
|
+
import { createClient } from '../core/client.js';
|
|
4
|
+
import * as fs from 'fs/promises';
|
|
5
|
+
/**
 * Crawls a site with {@link Spider} and, when `options.seo` is set, fetches
 * every successfully crawled page again to run an SEO analysis on it, then
 * derives site-wide issues and a summary from the per-page reports.
 */
export class SeoSpider {
    spider;
    options;
    seoResults = new Map();
    // Raised by abort() so the analysis phase stops issuing requests too.
    aborted = false;

    /**
     * @param {object} [options] - Spider options plus `seo`, `output` and
     *   `onSeoAnalysis`; passed unchanged to the underlying `Spider`.
     */
    constructor(options = {}) {
        this.options = options;
        this.spider = new Spider(options);
    }

    /**
     * Reduces a URL to the form used when matching link targets against
     * crawled page URLs: the fragment is dropped and a single trailing slash
     * is removed. Applied to both sides of the comparison, so it can only
     * make two URLs match that would otherwise differ in those details.
     *
     * @param {string} url
     * @returns {string}
     */
    static canonicalLinkKey(url) {
        const text = String(url);
        const hashIndex = text.indexOf('#');
        const withoutFragment = hashIndex === -1 ? text : text.slice(0, hashIndex);
        return withoutFragment.length > 1 && withoutFragment.endsWith('/')
            ? withoutFragment.slice(0, -1)
            : withoutFragment;
    }

    /**
     * Crawls from `startUrl` and, if enabled, analyzes the crawled pages.
     *
     * @param {string} startUrl
     * @returns {Promise<object>} The spider result extended with
     *   `siteWideIssues` and `summary`; `pages` carry `seoReport` when SEO
     *   analysis ran.
     */
    async crawl(startUrl) {
        // Start every crawl from a clean slate, like Spider#crawl does.
        this.aborted = false;
        this.seoResults.clear();
        const result = await this.spider.crawl(startUrl);
        if (!this.options.seo) {
            return {
                ...result,
                pages: result.pages,
                siteWideIssues: [],
                summary: {
                    totalPages: result.pages.length,
                    pagesWithErrors: 0,
                    pagesWithWarnings: 0,
                    avgScore: 0,
                    duplicateTitles: 0,
                    duplicateDescriptions: 0,
                    duplicateH1s: 0,
                    orphanPages: 0,
                },
            };
        }
        const seoPages = await this.analyzePages(result.pages);
        const siteWideIssues = this.detectSiteWideIssues(seoPages);
        const summary = this.calculateSummary(seoPages, siteWideIssues);
        const seoResult = {
            ...result,
            pages: seoPages,
            siteWideIssues,
            summary,
        };
        if (this.options.output) {
            await this.saveReport(seoResult);
        }
        return seoResult;
    }

    /**
     * Fetches and analyzes each page sequentially.
     *
     * Pages that errored, returned status >= 400, failed to analyze, or are
     * reached after `abort()` are returned with `seoReport: undefined`.
     * Each input page yields exactly one output entry.
     *
     * @param {object[]} pages - Page results from the spider.
     * @returns {Promise<object[]>}
     */
    async analyzePages(pages) {
        const withoutReport = (page) => ({ ...page, seoReport: undefined });
        if (this.aborted) {
            return pages.map(withoutReport);
        }
        const results = [];
        const client = createClient({
            timeout: this.options.timeout || 10000,
            headers: {
                'User-Agent': this.options.userAgent || 'Recker Spider/1.0',
            },
        });
        for (const page of pages) {
            if (this.aborted || page.error || page.status >= 400) {
                results.push(withoutReport(page));
                continue;
            }
            let seoReport;
            try {
                const response = await client.get(page.url);
                const html = await response.text();
                seoReport = await analyzeSeo(html, { baseUrl: page.url });
            }
            catch {
                // Best effort: a page that cannot be analyzed is kept without a report.
                results.push(withoutReport(page));
                continue;
            }
            const seoPage = { ...page, seoReport };
            results.push(seoPage);
            this.seoResults.set(page.url, seoReport);
            try {
                this.options.onSeoAnalysis?.(seoPage);
            }
            catch {
                // A failing listener must neither abort the analysis nor cause the
                // page to be recorded a second time.
            }
        }
        return results;
    }

    /**
     * Builds a minimal report from crawl data alone (title and links); all
     * other report sections are filled with empty/zero values.
     *
     * @param {object} page - Spider page result.
     * @returns {object} Report-shaped object.
     */
    createReportFromPageData(page) {
        const checks = [];
        if (page.title) {
            const titleLength = page.title.length;
            if (titleLength < 30) {
                checks.push({
                    name: 'Title Length',
                    status: 'warn',
                    message: `Title is too short (${titleLength} chars)`,
                    value: titleLength,
                    recommendation: 'Title should be 50-60 characters',
                });
            }
            else if (titleLength > 60) {
                checks.push({
                    name: 'Title Length',
                    status: 'warn',
                    message: `Title is too long (${titleLength} chars)`,
                    value: titleLength,
                    recommendation: 'Title should be 50-60 characters',
                });
            }
            else {
                checks.push({
                    name: 'Title Length',
                    status: 'pass',
                    message: `Good title length (${titleLength} chars)`,
                    value: titleLength,
                });
            }
        }
        else {
            checks.push({
                name: 'Title',
                status: 'fail',
                message: 'Page has no title',
                recommendation: 'Add a descriptive <title> tag',
            });
        }
        const internalLinks = page.links.filter(l => l.type === 'internal').length;
        const externalLinks = page.links.filter(l => l.type === 'external').length;
        if (internalLinks === 0) {
            checks.push({
                name: 'Internal Links',
                status: 'warn',
                message: 'No internal links found',
                recommendation: 'Add internal links to improve site structure',
            });
        }
        else {
            checks.push({
                name: 'Internal Links',
                status: 'pass',
                message: `${internalLinks} internal links found`,
                value: internalLinks,
            });
        }
        // pass = 100, warn = 50, anything else = 0; the score is the mean.
        const scoreSum = checks.reduce((sum, c) => {
            if (c.status === 'pass')
                return sum + 100;
            if (c.status === 'warn')
                return sum + 50;
            return sum;
        }, 0);
        const score = checks.length > 0 ? Math.round(scoreSum / checks.length) : 0;
        return {
            url: page.url,
            timestamp: new Date(),
            grade: this.scoreToGrade(score),
            score,
            checks,
            title: page.title ? { text: page.title, length: page.title.length } : undefined,
            headings: {
                structure: [],
                h1Count: 0,
                hasProperHierarchy: false,
                issues: [],
            },
            content: {
                wordCount: 0,
                characterCount: 0,
                sentenceCount: 0,
                paragraphCount: 0,
                readingTimeMinutes: 0,
                avgWordsPerSentence: 0,
                avgParagraphLength: 0,
                listCount: 0,
                strongTagCount: 0,
                emTagCount: 0,
            },
            links: {
                total: page.links.length,
                internal: internalLinks,
                external: externalLinks,
                nofollow: 0,
                broken: 0,
                withoutText: page.links.filter(l => !l.text?.trim()).length,
                sponsoredLinks: 0,
                ugcLinks: 0,
            },
            images: {
                total: 0,
                withAlt: 0,
                withoutAlt: 0,
                lazy: 0,
                missingDimensions: 0,
                modernFormats: 0,
                altTextLengths: [],
                imageFilenames: [],
                imagesWithAsyncDecoding: 0,
            },
            social: {
                openGraph: {
                    present: false,
                    hasTitle: false,
                    hasDescription: false,
                    hasImage: false,
                    hasUrl: false,
                    issues: [],
                },
                twitterCard: {
                    present: false,
                    hasCard: false,
                    hasTitle: false,
                    hasDescription: false,
                    hasImage: false,
                    issues: [],
                },
            },
            technical: {
                hasCanonical: false,
                hasRobotsMeta: false,
                hasViewport: false,
                hasCharset: false,
                hasLang: false,
            },
            jsonLd: {
                count: 0,
                types: [],
            },
        };
    }

    /**
     * Finds issues that involve several pages: titles, meta descriptions and
     * first H1 headings shared by more than one page, and pages below the
     * start depth that no crawled page links to.
     *
     * @param {object[]} pages - Pages as returned by `analyzePages`.
     * @returns {object[]} Issues ordered: titles, descriptions, H1s, orphans.
     */
    detectSiteWideIssues(pages) {
        const issues = [];
        const titleGroups = new Map();
        const descriptionGroups = new Map();
        const h1Groups = new Map();
        const addToGroup = (groups, key, url) => {
            if (!key)
                return;
            const urls = groups.get(key);
            if (urls) {
                urls.push(url);
            }
            else {
                groups.set(key, [url]);
            }
        };
        for (const page of pages) {
            const report = page.seoReport;
            if (!report)
                continue;
            addToGroup(titleGroups, report.title?.text?.trim(), page.url);
            addToGroup(descriptionGroups, report.metaDescription?.text?.trim(), page.url);
            addToGroup(h1Groups, report.headings?.structure?.find(h => h.level === 1)?.text?.trim(), page.url);
        }
        const reportDuplicates = (groups, type, severity, label) => {
            for (const [value, urls] of groups) {
                if (urls.length > 1) {
                    issues.push({
                        type,
                        severity,
                        message: `${urls.length} pages share the same ${label}`,
                        affectedUrls: urls,
                        value,
                    });
                }
            }
        };
        reportDuplicates(titleGroups, 'duplicate-title', 'error', 'title');
        reportDuplicates(descriptionGroups, 'duplicate-description', 'warning', 'meta description');
        reportDuplicates(h1Groups, 'duplicate-h1', 'warning', 'H1 heading');
        // Link hrefs are raw while page URLs come from the crawler, so both
        // sides are compared through canonicalLinkKey; otherwise a link such as
        // "/about#team" would leave "/about" looking orphaned.
        const linkedKeys = new Set();
        for (const page of pages) {
            for (const link of page.links) {
                if (link.type === 'internal' && link.href) {
                    linkedKeys.add(SeoSpider.canonicalLinkKey(link.href));
                }
            }
        }
        const orphanPages = pages
            .filter(p => p.depth > 0 && !linkedKeys.has(SeoSpider.canonicalLinkKey(p.url)))
            .map(p => p.url);
        if (orphanPages.length > 0) {
            issues.push({
                type: 'orphan-page',
                severity: 'warning',
                message: `${orphanPages.length} page(s) have no internal links pointing to them`,
                affectedUrls: orphanPages,
            });
        }
        return issues;
    }

    /**
     * Aggregates page reports and site-wide issues into summary figures.
     *
     * @param {object[]} pages
     * @param {object[]} issues - Output of `detectSiteWideIssues`.
     * @returns {object} Counts and the rounded mean score (0 without reports).
     */
    calculateSummary(pages, issues) {
        const pagesWithSeo = pages.filter(p => p.seoReport);
        const scores = pagesWithSeo.map(p => p.seoReport.score);
        const avgScore = scores.length > 0
            ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length)
            : 0;
        const hasCheckWithStatus = (status) => (p) => p.seoReport.checks.some(c => c.status === status);
        const countIssues = (type) => issues.filter(i => i.type === type).length;
        // Orphans are reported as one issue listing all URLs, so count the URLs.
        const orphanPages = issues
            .filter(i => i.type === 'orphan-page')
            .reduce((sum, i) => sum + i.affectedUrls.length, 0);
        return {
            totalPages: pages.length,
            pagesWithErrors: pagesWithSeo.filter(hasCheckWithStatus('fail')).length,
            pagesWithWarnings: pagesWithSeo.filter(hasCheckWithStatus('warn')).length,
            avgScore,
            duplicateTitles: countIssues('duplicate-title'),
            duplicateDescriptions: countIssues('duplicate-description'),
            duplicateH1s: countIssues('duplicate-h1'),
            orphanPages,
        };
    }

    /**
     * Maps a numeric score to a letter grade.
     *
     * @param {number} score
     * @returns {'A'|'B'|'C'|'D'|'F'} A from 90, B from 80, C from 70, D from 60, else F.
     */
    scoreToGrade(score) {
        if (score >= 90)
            return 'A';
        if (score >= 80)
            return 'B';
        if (score >= 70)
            return 'C';
        if (score >= 60)
            return 'D';
        return 'F';
    }

    /**
     * Writes the result as pretty-printed JSON to `options.output`; does
     * nothing when no output path is configured.
     *
     * @param {object} result - Result whose `visited` collection is iterable.
     * @returns {Promise<void>}
     */
    async saveReport(result) {
        if (!this.options.output)
            return;
        const reportData = {
            ...result,
            // Converted to an array so the visited URLs survive JSON serialization.
            visited: Array.from(result.visited),
            generatedAt: new Date().toISOString(),
        };
        await fs.writeFile(this.options.output, JSON.stringify(reportData, null, 2), 'utf-8');
    }

    /** Stops the crawl and any SEO analysis still to be done. */
    abort() {
        this.aborted = true;
        this.spider.abort();
    }

    /**
     * @returns {boolean} Whether the underlying spider is running.
     */
    isRunning() {
        return this.spider.isRunning();
    }
}
|
|
359
|
+
/**
 * Convenience wrapper: creates a one-off {@link SeoSpider} and crawls `url`.
 *
 * @param {string} url - Start URL of the crawl.
 * @param {object} [options] - Passed unchanged to the `SeoSpider` constructor.
 * @returns {Promise<object>} Whatever `SeoSpider#crawl` resolves with.
 */
export async function seoSpider(url, options) {
    return new SeoSpider(options).crawl(url);
}
|
package/dist/seo/types.d.ts
CHANGED
|
@@ -86,11 +86,20 @@ export interface TechnicalSeo {
|
|
|
86
86
|
hasLang: boolean;
|
|
87
87
|
langValue?: string;
|
|
88
88
|
}
|
|
89
|
+
/**
 * Optional timing figures attached to an SEO report.
 * NOTE(review): units are not visible here — presumably milliseconds; confirm
 * against the code that fills these in.
 */
export interface SeoTiming {
    /** Time to first byte. */
    ttfb?: number;

    /** Total time. */
    total?: number;

    /** DNS phase. */
    dns?: number;

    /** TCP phase. */
    tcp?: number;

    /** TLS phase. */
    tls?: number;

    /** Download phase. */
    download?: number;
}
|
|
89
97
|
export interface SeoReport {
|
|
90
98
|
url: string;
|
|
91
99
|
timestamp: Date;
|
|
92
100
|
grade: string;
|
|
93
101
|
score: number;
|
|
102
|
+
timing?: SeoTiming;
|
|
94
103
|
checks: SeoCheckResult[];
|
|
95
104
|
title?: {
|
|
96
105
|
text: string;
|
|
@@ -100,6 +109,21 @@ export interface SeoReport {
|
|
|
100
109
|
text: string;
|
|
101
110
|
length: number;
|
|
102
111
|
};
|
|
112
|
+
openGraph?: {
|
|
113
|
+
title?: string;
|
|
114
|
+
description?: string;
|
|
115
|
+
image?: string;
|
|
116
|
+
url?: string;
|
|
117
|
+
type?: string;
|
|
118
|
+
siteName?: string;
|
|
119
|
+
};
|
|
120
|
+
twitterCard?: {
|
|
121
|
+
card?: string;
|
|
122
|
+
title?: string;
|
|
123
|
+
description?: string;
|
|
124
|
+
image?: string;
|
|
125
|
+
site?: string;
|
|
126
|
+
};
|
|
103
127
|
headings: HeadingAnalysis;
|
|
104
128
|
content: ContentMetrics;
|
|
105
129
|
links: LinkAnalysis;
|