recker 1.0.28-next.32fe8ef → 1.0.28-next.857660a
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/tui/shell.d.ts +1 -0
- package/dist/cli/tui/shell.js +322 -5
- package/dist/scrape/index.d.ts +2 -0
- package/dist/scrape/index.js +1 -0
- package/dist/scrape/spider.d.ts +61 -0
- package/dist/scrape/spider.js +250 -0
- package/dist/seo/analyzer.js +27 -0
- package/dist/seo/index.d.ts +3 -1
- package/dist/seo/index.js +1 -0
- package/dist/seo/rules/accessibility.js +620 -54
- package/dist/seo/rules/best-practices.d.ts +2 -0
- package/dist/seo/rules/best-practices.js +188 -0
- package/dist/seo/rules/crawl.d.ts +2 -0
- package/dist/seo/rules/crawl.js +307 -0
- package/dist/seo/rules/cwv.d.ts +2 -0
- package/dist/seo/rules/cwv.js +337 -0
- package/dist/seo/rules/ecommerce.d.ts +2 -0
- package/dist/seo/rules/ecommerce.js +252 -0
- package/dist/seo/rules/i18n.d.ts +2 -0
- package/dist/seo/rules/i18n.js +222 -0
- package/dist/seo/rules/index.d.ts +32 -0
- package/dist/seo/rules/index.js +71 -0
- package/dist/seo/rules/internal-linking.d.ts +2 -0
- package/dist/seo/rules/internal-linking.js +375 -0
- package/dist/seo/rules/local.d.ts +2 -0
- package/dist/seo/rules/local.js +265 -0
- package/dist/seo/rules/pwa.d.ts +2 -0
- package/dist/seo/rules/pwa.js +302 -0
- package/dist/seo/rules/readability.d.ts +2 -0
- package/dist/seo/rules/readability.js +255 -0
- package/dist/seo/rules/security.js +406 -28
- package/dist/seo/rules/social.d.ts +2 -0
- package/dist/seo/rules/social.js +373 -0
- package/dist/seo/rules/types.d.ts +155 -0
- package/dist/seo/seo-spider.d.ts +47 -0
- package/dist/seo/seo-spider.js +362 -0
- package/dist/seo/types.d.ts +24 -0
- package/package.json +1 -1
package/dist/cli/tui/shell.d.ts
CHANGED
package/dist/cli/tui/shell.js
CHANGED
|
@@ -10,11 +10,12 @@ import { inspectTLS } from '../../utils/tls-inspector.js';
|
|
|
10
10
|
import { getSecurityRecords } from '../../utils/dns-toolkit.js';
|
|
11
11
|
import { rdap } from '../../utils/rdap.js';
|
|
12
12
|
import { ScrapeDocument } from '../../scrape/document.js';
|
|
13
|
+
import { Spider } from '../../scrape/spider.js';
|
|
13
14
|
import colors from '../../utils/colors.js';
|
|
14
15
|
import { getShellSearch } from './shell-search.js';
|
|
15
16
|
import { openSearchPanel } from './search-panel.js';
|
|
16
17
|
import { ScrollBuffer, parseScrollKey, parseMouseScroll, disableMouseReporting } from './scroll-buffer.js';
|
|
17
|
-
import { analyzeSeo } from '../../seo/index.js';
|
|
18
|
+
import { analyzeSeo, SeoSpider } from '../../seo/index.js';
|
|
18
19
|
let highlight;
|
|
19
20
|
async function initDependencies() {
|
|
20
21
|
if (!highlight) {
|
|
@@ -94,7 +95,7 @@ export class RekShell {
|
|
|
94
95
|
'get', 'post', 'put', 'delete', 'patch', 'head', 'options',
|
|
95
96
|
'ws', 'udp', 'load', 'chat', 'ai',
|
|
96
97
|
'whois', 'tls', 'ssl', 'security', 'ip', 'dns', 'dns:propagate', 'dns:email', 'rdap', 'ping',
|
|
97
|
-
'scrap', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
|
|
98
|
+
'scrap', 'spider', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
|
|
98
99
|
'?', 'search', 'suggest', 'example',
|
|
99
100
|
'help', 'clear', 'exit', 'set', 'url', 'vars', 'env'
|
|
100
101
|
];
|
|
@@ -368,6 +369,9 @@ export class RekShell {
|
|
|
368
369
|
case 'scrap':
|
|
369
370
|
await this.runScrap(parts[1]);
|
|
370
371
|
return;
|
|
372
|
+
case 'spider':
|
|
373
|
+
await this.runSpider(parts.slice(1));
|
|
374
|
+
return;
|
|
371
375
|
case '$':
|
|
372
376
|
await this.runSelect(parts.slice(1).join(' '));
|
|
373
377
|
return;
|
|
@@ -968,15 +972,21 @@ ${colors.bold('Details:')}`);
|
|
|
968
972
|
}
|
|
969
973
|
const startTime = performance.now();
|
|
970
974
|
try {
|
|
975
|
+
const ttfbStart = performance.now();
|
|
971
976
|
const res = await this.client.get(url);
|
|
977
|
+
const ttfb = Math.round(performance.now() - ttfbStart);
|
|
972
978
|
const html = await res.text();
|
|
973
979
|
const duration = Math.round(performance.now() - startTime);
|
|
974
980
|
const report = await analyzeSeo(html, { baseUrl: url });
|
|
981
|
+
report.timing = {
|
|
982
|
+
ttfb,
|
|
983
|
+
total: duration,
|
|
984
|
+
};
|
|
975
985
|
if (jsonOutput) {
|
|
976
986
|
const jsonResult = {
|
|
977
987
|
url,
|
|
978
988
|
analyzedAt: new Date().toISOString(),
|
|
979
|
-
|
|
989
|
+
timing: report.timing,
|
|
980
990
|
score: report.score,
|
|
981
991
|
grade: report.grade,
|
|
982
992
|
title: report.title,
|
|
@@ -985,8 +995,9 @@ ${colors.bold('Details:')}`);
|
|
|
985
995
|
headings: report.headings,
|
|
986
996
|
links: report.links,
|
|
987
997
|
images: report.images,
|
|
988
|
-
openGraph: report.
|
|
989
|
-
twitterCard: report.
|
|
998
|
+
openGraph: report.openGraph,
|
|
999
|
+
twitterCard: report.twitterCard,
|
|
1000
|
+
social: report.social,
|
|
990
1001
|
jsonLd: report.jsonLd,
|
|
991
1002
|
technical: report.technical,
|
|
992
1003
|
checks: report.checks,
|
|
@@ -1024,6 +1035,36 @@ Grade: ${gradeColor(colors.bold(report.grade))} (${report.score}/100)
|
|
|
1024
1035
|
: report.metaDescription.text;
|
|
1025
1036
|
console.log(colors.bold('Description:') + ` ${desc} ` + colors.gray(`(${report.metaDescription.length} chars)`));
|
|
1026
1037
|
}
|
|
1038
|
+
if (report.openGraph && Object.values(report.openGraph).some(v => v)) {
|
|
1039
|
+
console.log('');
|
|
1040
|
+
console.log(colors.bold(colors.cyan('OpenGraph:')));
|
|
1041
|
+
if (report.openGraph.title) {
|
|
1042
|
+
const ogTitle = report.openGraph.title.length > 60
|
|
1043
|
+
? report.openGraph.title.slice(0, 57) + '...'
|
|
1044
|
+
: report.openGraph.title;
|
|
1045
|
+
console.log(` ${colors.gray('og:title:')} ${ogTitle}`);
|
|
1046
|
+
}
|
|
1047
|
+
if (report.openGraph.description) {
|
|
1048
|
+
const ogDesc = report.openGraph.description.length > 60
|
|
1049
|
+
? report.openGraph.description.slice(0, 57) + '...'
|
|
1050
|
+
: report.openGraph.description;
|
|
1051
|
+
console.log(` ${colors.gray('og:description:')} ${ogDesc}`);
|
|
1052
|
+
}
|
|
1053
|
+
if (report.openGraph.image) {
|
|
1054
|
+
const ogImg = report.openGraph.image.length > 50
|
|
1055
|
+
? '...' + report.openGraph.image.slice(-47)
|
|
1056
|
+
: report.openGraph.image;
|
|
1057
|
+
console.log(` ${colors.gray('og:image:')} ${colors.blue(ogImg)}`);
|
|
1058
|
+
}
|
|
1059
|
+
if (report.openGraph.type) {
|
|
1060
|
+
console.log(` ${colors.gray('og:type:')} ${report.openGraph.type}`);
|
|
1061
|
+
}
|
|
1062
|
+
}
|
|
1063
|
+
if (report.timing?.ttfb !== undefined) {
|
|
1064
|
+
console.log('');
|
|
1065
|
+
console.log(colors.bold('Timing:') + ` TTFB ${report.timing.ttfb}ms` +
|
|
1066
|
+
(report.timing.total ? `, Total ${report.timing.total}ms` : ''));
|
|
1067
|
+
}
|
|
1027
1068
|
if (report.content) {
|
|
1028
1069
|
console.log(colors.bold('Content:') + ` ${report.content.wordCount} words, ${report.content.paragraphCount} paragraphs, ~${report.content.readingTimeMinutes} min read`);
|
|
1029
1070
|
}
|
|
@@ -1434,6 +1475,274 @@ ${colors.bold('Network:')}
|
|
|
1434
1475
|
}
|
|
1435
1476
|
console.log('');
|
|
1436
1477
|
}
|
|
1478
|
+
async runSpider(args) {
|
|
1479
|
+
let url = '';
|
|
1480
|
+
let maxDepth = 3;
|
|
1481
|
+
let maxPages = 100;
|
|
1482
|
+
let concurrency = 5;
|
|
1483
|
+
let seoEnabled = false;
|
|
1484
|
+
let outputFile = '';
|
|
1485
|
+
for (let i = 0; i < args.length; i++) {
|
|
1486
|
+
const arg = args[i];
|
|
1487
|
+
if (arg.startsWith('depth=')) {
|
|
1488
|
+
maxDepth = parseInt(arg.split('=')[1]) || 4;
|
|
1489
|
+
}
|
|
1490
|
+
else if (arg.startsWith('limit=')) {
|
|
1491
|
+
maxPages = parseInt(arg.split('=')[1]) || 100;
|
|
1492
|
+
}
|
|
1493
|
+
else if (arg.startsWith('concurrency=')) {
|
|
1494
|
+
concurrency = parseInt(arg.split('=')[1]) || 5;
|
|
1495
|
+
}
|
|
1496
|
+
else if (arg === 'seo') {
|
|
1497
|
+
seoEnabled = true;
|
|
1498
|
+
}
|
|
1499
|
+
else if (arg.startsWith('output=')) {
|
|
1500
|
+
outputFile = arg.split('=')[1] || '';
|
|
1501
|
+
}
|
|
1502
|
+
else if (!arg.includes('=')) {
|
|
1503
|
+
url = arg;
|
|
1504
|
+
}
|
|
1505
|
+
}
|
|
1506
|
+
if (!url) {
|
|
1507
|
+
if (!this.baseUrl) {
|
|
1508
|
+
console.log(colors.yellow('Usage: spider <url> [options]'));
|
|
1509
|
+
console.log(colors.gray(' Options:'));
|
|
1510
|
+
console.log(colors.gray(' depth=4 Max crawl depth'));
|
|
1511
|
+
console.log(colors.gray(' limit=100 Max pages to crawl'));
|
|
1512
|
+
console.log(colors.gray(' concurrency=5 Concurrent requests'));
|
|
1513
|
+
console.log(colors.gray(' seo Enable SEO analysis'));
|
|
1514
|
+
console.log(colors.gray(' output=file.json Save JSON report'));
|
|
1515
|
+
console.log(colors.gray(' Examples:'));
|
|
1516
|
+
console.log(colors.gray(' spider example.com'));
|
|
1517
|
+
console.log(colors.gray(' spider example.com depth=2 limit=50'));
|
|
1518
|
+
console.log(colors.gray(' spider example.com seo output=seo-report.json'));
|
|
1519
|
+
return;
|
|
1520
|
+
}
|
|
1521
|
+
url = this.baseUrl;
|
|
1522
|
+
}
|
|
1523
|
+
else if (!url.startsWith('http')) {
|
|
1524
|
+
url = `https://${url}`;
|
|
1525
|
+
}
|
|
1526
|
+
console.log(colors.cyan(`\nSpider starting: ${url}`));
|
|
1527
|
+
const modeLabel = seoEnabled ? colors.magenta(' + SEO') : '';
|
|
1528
|
+
console.log(colors.gray(` Depth: ${maxDepth} | Limit: ${maxPages} | Concurrency: ${concurrency}${modeLabel}`));
|
|
1529
|
+
if (outputFile) {
|
|
1530
|
+
console.log(colors.gray(` Output: ${outputFile}`));
|
|
1531
|
+
}
|
|
1532
|
+
console.log('');
|
|
1533
|
+
if (seoEnabled) {
|
|
1534
|
+
const seoSpider = new SeoSpider({
|
|
1535
|
+
maxDepth,
|
|
1536
|
+
maxPages,
|
|
1537
|
+
concurrency,
|
|
1538
|
+
sameDomain: true,
|
|
1539
|
+
delay: 100,
|
|
1540
|
+
seo: true,
|
|
1541
|
+
output: outputFile || undefined,
|
|
1542
|
+
onProgress: (progress) => {
|
|
1543
|
+
process.stdout.write(`\r${colors.gray(' Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth} `);
|
|
1544
|
+
},
|
|
1545
|
+
});
|
|
1546
|
+
try {
|
|
1547
|
+
const result = await seoSpider.crawl(url);
|
|
1548
|
+
process.stdout.write('\r' + ' '.repeat(80) + '\r');
|
|
1549
|
+
console.log(colors.green(`\n✔ SEO Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
|
|
1550
|
+
console.log(` ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
|
|
1551
|
+
console.log(` ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
|
|
1552
|
+
console.log(` ${colors.cyan('Avg SEO Score')}: ${result.summary.avgScore}/100`);
|
|
1553
|
+
const responseTimes = result.pages.filter(p => p.duration > 0).map(p => p.duration);
|
|
1554
|
+
const avgResponseTime = responseTimes.length > 0
|
|
1555
|
+
? Math.round(responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length)
|
|
1556
|
+
: 0;
|
|
1557
|
+
const minResponseTime = responseTimes.length > 0 ? Math.min(...responseTimes) : 0;
|
|
1558
|
+
const maxResponseTime = responseTimes.length > 0 ? Math.max(...responseTimes) : 0;
|
|
1559
|
+
const reqPerSec = result.duration > 0 ? (result.pages.length / (result.duration / 1000)).toFixed(1) : '0';
|
|
1560
|
+
const statusCounts = new Map();
|
|
1561
|
+
for (const page of result.pages) {
|
|
1562
|
+
const status = page.status || 0;
|
|
1563
|
+
statusCounts.set(status, (statusCounts.get(status) || 0) + 1);
|
|
1564
|
+
}
|
|
1565
|
+
let totalInternalLinks = 0;
|
|
1566
|
+
let totalExternalLinks = 0;
|
|
1567
|
+
let totalImages = 0;
|
|
1568
|
+
let imagesWithoutAlt = 0;
|
|
1569
|
+
let pagesWithoutTitle = 0;
|
|
1570
|
+
let pagesWithoutDescription = 0;
|
|
1571
|
+
for (const page of result.pages) {
|
|
1572
|
+
if (page.seoReport) {
|
|
1573
|
+
totalInternalLinks += page.seoReport.links?.internal || 0;
|
|
1574
|
+
totalExternalLinks += page.seoReport.links?.external || 0;
|
|
1575
|
+
totalImages += page.seoReport.images?.total || 0;
|
|
1576
|
+
imagesWithoutAlt += page.seoReport.images?.withoutAlt || 0;
|
|
1577
|
+
if (!page.seoReport.title?.text)
|
|
1578
|
+
pagesWithoutTitle++;
|
|
1579
|
+
if (!page.seoReport.metaDescription?.text)
|
|
1580
|
+
pagesWithoutDescription++;
|
|
1581
|
+
}
|
|
1582
|
+
}
|
|
1583
|
+
console.log(colors.bold('\n Performance:'));
|
|
1584
|
+
console.log(` ${colors.gray('Avg Response:')} ${avgResponseTime}ms`);
|
|
1585
|
+
console.log(` ${colors.gray('Min/Max:')} ${minResponseTime}ms / ${maxResponseTime}ms`);
|
|
1586
|
+
console.log(` ${colors.gray('Throughput:')} ${reqPerSec} req/s`);
|
|
1587
|
+
console.log(colors.bold('\n HTTP Status:'));
|
|
1588
|
+
const sortedStatuses = Array.from(statusCounts.entries()).sort((a, b) => b[1] - a[1]);
|
|
1589
|
+
for (const [status, count] of sortedStatuses.slice(0, 5)) {
|
|
1590
|
+
const statusLabel = status === 0 ? 'Error' : status.toString();
|
|
1591
|
+
const statusColor = status >= 400 || status === 0 ? colors.red :
|
|
1592
|
+
status >= 300 ? colors.yellow : colors.green;
|
|
1593
|
+
const pct = ((count / result.pages.length) * 100).toFixed(0);
|
|
1594
|
+
console.log(` ${statusColor(statusLabel.padEnd(5))} ${count.toString().padStart(3)} (${pct}%)`);
|
|
1595
|
+
}
|
|
1596
|
+
console.log(colors.bold('\n Content:'));
|
|
1597
|
+
console.log(` ${colors.gray('Internal links:')} ${totalInternalLinks.toLocaleString()}`);
|
|
1598
|
+
console.log(` ${colors.gray('External links:')} ${totalExternalLinks.toLocaleString()}`);
|
|
1599
|
+
console.log(` ${colors.gray('Images:')} ${totalImages.toLocaleString()} (${imagesWithoutAlt} missing alt)`);
|
|
1600
|
+
console.log(` ${colors.gray('Missing title:')} ${pagesWithoutTitle}`);
|
|
1601
|
+
console.log(` ${colors.gray('Missing desc:')} ${pagesWithoutDescription}`);
|
|
1602
|
+
console.log(colors.bold('\n SEO Summary:'));
|
|
1603
|
+
const { summary } = result;
|
|
1604
|
+
console.log(` ${colors.red('✗')} Pages with errors: ${summary.pagesWithErrors}`);
|
|
1605
|
+
console.log(` ${colors.yellow('⚠')} Pages with warnings: ${summary.pagesWithWarnings}`);
|
|
1606
|
+
console.log(` ${colors.magenta('⚐')} Duplicate titles: ${summary.duplicateTitles}`);
|
|
1607
|
+
console.log(` ${colors.magenta('⚐')} Duplicate descriptions:${summary.duplicateDescriptions}`);
|
|
1608
|
+
console.log(` ${colors.magenta('⚐')} Duplicate H1s: ${summary.duplicateH1s}`);
|
|
1609
|
+
console.log(` ${colors.gray('○')} Orphan pages: ${summary.orphanPages}`);
|
|
1610
|
+
if (result.siteWideIssues.length > 0) {
|
|
1611
|
+
console.log(colors.bold('\n Site-Wide Issues:'));
|
|
1612
|
+
for (const issue of result.siteWideIssues.slice(0, 10)) {
|
|
1613
|
+
const icon = issue.severity === 'error' ? colors.red('✗') :
|
|
1614
|
+
issue.severity === 'warning' ? colors.yellow('⚠') : colors.gray('○');
|
|
1615
|
+
console.log(` ${icon} ${issue.message}`);
|
|
1616
|
+
if (issue.value) {
|
|
1617
|
+
const truncatedValue = issue.value.length > 50 ? issue.value.slice(0, 47) + '...' : issue.value;
|
|
1618
|
+
console.log(` ${colors.gray(`"${truncatedValue}"`)}`);
|
|
1619
|
+
}
|
|
1620
|
+
const uniquePaths = [...new Set(issue.affectedUrls.map(u => new URL(u).pathname))];
|
|
1621
|
+
if (uniquePaths.length <= 3) {
|
|
1622
|
+
for (const path of uniquePaths) {
|
|
1623
|
+
console.log(` ${colors.gray('→')} ${path}`);
|
|
1624
|
+
}
|
|
1625
|
+
}
|
|
1626
|
+
else {
|
|
1627
|
+
console.log(` ${colors.gray(`→ ${uniquePaths.length} pages affected`)}`);
|
|
1628
|
+
}
|
|
1629
|
+
}
|
|
1630
|
+
if (result.siteWideIssues.length > 10) {
|
|
1631
|
+
console.log(colors.gray(` ... and ${result.siteWideIssues.length - 10} more issues`));
|
|
1632
|
+
}
|
|
1633
|
+
}
|
|
1634
|
+
const pagesWithScores = result.pages
|
|
1635
|
+
.filter(p => p.seoReport)
|
|
1636
|
+
.sort((a, b) => (a.seoReport?.score || 0) - (b.seoReport?.score || 0));
|
|
1637
|
+
const seenPaths = new Set();
|
|
1638
|
+
const uniquePages = pagesWithScores.filter(page => {
|
|
1639
|
+
const path = new URL(page.url).pathname;
|
|
1640
|
+
if (seenPaths.has(path))
|
|
1641
|
+
return false;
|
|
1642
|
+
seenPaths.add(path);
|
|
1643
|
+
return true;
|
|
1644
|
+
});
|
|
1645
|
+
if (uniquePages.length > 0) {
|
|
1646
|
+
console.log(colors.bold('\n Pages by SEO Score:'));
|
|
1647
|
+
const worstPages = uniquePages.slice(0, 5);
|
|
1648
|
+
for (const page of worstPages) {
|
|
1649
|
+
const score = page.seoReport?.score || 0;
|
|
1650
|
+
const grade = page.seoReport?.grade || '?';
|
|
1651
|
+
const path = new URL(page.url).pathname;
|
|
1652
|
+
const scoreColor = score >= 80 ? colors.green : score >= 60 ? colors.yellow : colors.red;
|
|
1653
|
+
console.log(` ${scoreColor(`${score.toString().padStart(3)}`)} ${colors.gray(`[${grade}]`)} ${path.slice(0, 50)}`);
|
|
1654
|
+
}
|
|
1655
|
+
if (uniquePages.length > 5) {
|
|
1656
|
+
console.log(colors.gray(` ... and ${uniquePages.length - 5} more pages`));
|
|
1657
|
+
}
|
|
1658
|
+
}
|
|
1659
|
+
if (outputFile) {
|
|
1660
|
+
console.log(colors.green(`\n Report saved to: ${outputFile}`));
|
|
1661
|
+
}
|
|
1662
|
+
this.lastResponse = result;
|
|
1663
|
+
console.log(colors.gray('\n Result stored in lastResponse.'));
|
|
1664
|
+
}
|
|
1665
|
+
catch (error) {
|
|
1666
|
+
console.error(colors.red(`SEO Spider failed: ${error.message}`));
|
|
1667
|
+
}
|
|
1668
|
+
}
|
|
1669
|
+
else {
|
|
1670
|
+
const spider = new Spider({
|
|
1671
|
+
maxDepth,
|
|
1672
|
+
maxPages,
|
|
1673
|
+
concurrency,
|
|
1674
|
+
sameDomain: true,
|
|
1675
|
+
delay: 100,
|
|
1676
|
+
onProgress: (progress) => {
|
|
1677
|
+
process.stdout.write(`\r${colors.gray(' Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth} `);
|
|
1678
|
+
},
|
|
1679
|
+
});
|
|
1680
|
+
try {
|
|
1681
|
+
const result = await spider.crawl(url);
|
|
1682
|
+
process.stdout.write('\r' + ' '.repeat(80) + '\r');
|
|
1683
|
+
console.log(colors.green(`\n✔ Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
|
|
1684
|
+
console.log(` ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
|
|
1685
|
+
console.log(` ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
|
|
1686
|
+
console.log(` ${colors.cyan('Errors')}: ${result.errors.length}`);
|
|
1687
|
+
const byDepth = new Map();
|
|
1688
|
+
for (const page of result.pages) {
|
|
1689
|
+
byDepth.set(page.depth, (byDepth.get(page.depth) || 0) + 1);
|
|
1690
|
+
}
|
|
1691
|
+
console.log(colors.bold('\n Pages by depth:'));
|
|
1692
|
+
for (const [depth, count] of Array.from(byDepth.entries()).sort((a, b) => a[0] - b[0])) {
|
|
1693
|
+
const bar = '█'.repeat(Math.min(count, 40));
|
|
1694
|
+
console.log(` ${colors.gray(`d${depth}:`)} ${bar} ${count}`);
|
|
1695
|
+
}
|
|
1696
|
+
const topPages = [...result.pages]
|
|
1697
|
+
.filter(p => !p.error)
|
|
1698
|
+
.sort((a, b) => b.links.length - a.links.length)
|
|
1699
|
+
.slice(0, 10);
|
|
1700
|
+
if (topPages.length > 0) {
|
|
1701
|
+
console.log(colors.bold('\n Top pages by outgoing links:'));
|
|
1702
|
+
for (const page of topPages) {
|
|
1703
|
+
const title = page.title.slice(0, 40) || new URL(page.url).pathname;
|
|
1704
|
+
console.log(` ${colors.cyan(page.links.length.toString().padStart(3))} ${title}`);
|
|
1705
|
+
}
|
|
1706
|
+
}
|
|
1707
|
+
const formatError = (error) => {
|
|
1708
|
+
const statusMatch = error.match(/status code (\d{3})/i);
|
|
1709
|
+
if (statusMatch) {
|
|
1710
|
+
return `HTTP ${statusMatch[1]}`;
|
|
1711
|
+
}
|
|
1712
|
+
return error.length > 50 ? error.slice(0, 47) + '...' : error;
|
|
1713
|
+
};
|
|
1714
|
+
if (result.errors.length > 0 && result.errors.length <= 10) {
|
|
1715
|
+
console.log(colors.bold('\n Errors:'));
|
|
1716
|
+
for (const err of result.errors) {
|
|
1717
|
+
const path = new URL(err.url).pathname;
|
|
1718
|
+
console.log(` ${colors.red('✗')} ${path.padEnd(25)} ${colors.gray('→')} ${formatError(err.error)}`);
|
|
1719
|
+
}
|
|
1720
|
+
}
|
|
1721
|
+
else if (result.errors.length > 10) {
|
|
1722
|
+
console.log(colors.yellow(`\n ${result.errors.length} errors (showing first 10):`));
|
|
1723
|
+
for (const err of result.errors.slice(0, 10)) {
|
|
1724
|
+
const path = new URL(err.url).pathname;
|
|
1725
|
+
console.log(` ${colors.red('✗')} ${path.padEnd(25)} ${colors.gray('→')} ${formatError(err.error)}`);
|
|
1726
|
+
}
|
|
1727
|
+
}
|
|
1728
|
+
if (outputFile) {
|
|
1729
|
+
const reportData = {
|
|
1730
|
+
...result,
|
|
1731
|
+
visited: Array.from(result.visited),
|
|
1732
|
+
generatedAt: new Date().toISOString(),
|
|
1733
|
+
};
|
|
1734
|
+
await fs.writeFile(outputFile, JSON.stringify(reportData, null, 2), 'utf-8');
|
|
1735
|
+
console.log(colors.green(`\n Report saved to: ${outputFile}`));
|
|
1736
|
+
}
|
|
1737
|
+
this.lastResponse = result;
|
|
1738
|
+
console.log(colors.gray('\n Result stored in lastResponse. Use $links to explore.'));
|
|
1739
|
+
}
|
|
1740
|
+
catch (error) {
|
|
1741
|
+
console.error(colors.red(`Spider failed: ${error.message}`));
|
|
1742
|
+
}
|
|
1743
|
+
}
|
|
1744
|
+
console.log('');
|
|
1745
|
+
}
|
|
1437
1746
|
async runSelect(selector) {
|
|
1438
1747
|
if (!this.currentDoc) {
|
|
1439
1748
|
console.log(colors.yellow('No document loaded. Use "scrap <url>" first.'));
|
|
@@ -2358,6 +2667,13 @@ ${colors.bold('Network:')}
|
|
|
2358
2667
|
${colors.green('$beautify:save [f]')} Save beautified code to file.
|
|
2359
2668
|
${colors.green('$table <selector>')} Extract table as data.
|
|
2360
2669
|
|
|
2670
|
+
${colors.bold('Web Crawler:')}
|
|
2671
|
+
${colors.green('spider <url>')} Crawl website following internal links.
|
|
2672
|
+
${colors.gray('Options:')}
|
|
2673
|
+
${colors.white('--depth=4')} ${colors.gray('Maximum depth to crawl')}
|
|
2674
|
+
${colors.white('--limit=100')} ${colors.gray('Maximum pages to crawl')}
|
|
2675
|
+
${colors.white('--concurrency=5')} ${colors.gray('Parallel requests')}
|
|
2676
|
+
|
|
2361
2677
|
${colors.bold('Documentation:')}
|
|
2362
2678
|
${colors.green('? <query>')} Search Recker documentation.
|
|
2363
2679
|
${colors.green('search <query>')} Alias for ? (hybrid fuzzy+semantic search).
|
|
@@ -2375,6 +2691,7 @@ ${colors.bold('Network:')}
|
|
|
2375
2691
|
› post /post name="Neo" active:=true role:Admin
|
|
2376
2692
|
› load /heavy-endpoint users=100 mode=stress
|
|
2377
2693
|
› chat openai gpt-5.1
|
|
2694
|
+
› spider example.com depth=2 limit=50
|
|
2378
2695
|
`);
|
|
2379
2696
|
}
|
|
2380
2697
|
}
|
package/dist/scrape/index.d.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
export { ScrapeDocument } from './document.js';
|
|
2
2
|
export { ScrapeElement } from './element.js';
|
|
3
|
+
export { Spider, spider } from './spider.js';
|
|
4
|
+
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
3
5
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
4
6
|
export type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedFormField, ExtractedTable, ExtractedScript, ExtractedStyle, ExtractionSchema, ExtractionSchemaField, ScrapeOptions, LinkExtractionOptions, ImageExtractionOptions, } from './types.js';
|
package/dist/scrape/index.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
export { ScrapeDocument } from './document.js';
|
|
2
2
|
export { ScrapeElement } from './element.js';
|
|
3
|
+
export { Spider, spider } from './spider.js';
|
|
3
4
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import type { ExtractedLink } from './types.js';
|
|
2
|
+
/**
 * Options accepted by the `Spider` constructor and the `spider()` helper.
 * Every field is optional; the defaults live in the implementation, which is
 * not part of this declaration file.
 */
export interface SpiderOptions {
    // --- crawl limits ---
    /** Maximum crawl depth. */
    maxDepth?: number;
    /** Maximum number of pages to crawl. */
    maxPages?: number;
    /** Number of concurrent requests. */
    concurrency?: number;

    // --- request behaviour ---
    /** Per-request timeout. NOTE(review): unit is presumably milliseconds - confirm. */
    timeout?: number;
    /** Pause between requests. NOTE(review): unit is presumably milliseconds - confirm. */
    delay?: number;
    /** NOTE(review): presumably the User-Agent header sent with each request - confirm. */
    userAgent?: string;
    /** NOTE(review): presumably makes the crawler honour robots.txt - confirm. */
    respectRobotsTxt?: boolean;

    // --- URL filtering ---
    /** NOTE(review): presumably restricts the crawl to the start URL's host - confirm. */
    sameDomain?: boolean;
    /** NOTE(review): presumably URLs matching any of these patterns are skipped - confirm. */
    exclude?: RegExp[];
    /** NOTE(review): presumably only URLs matching one of these patterns are followed - confirm. */
    include?: RegExp[];

    // --- callbacks ---
    /** Callback that receives a page result. */
    onPage?: (result: SpiderPageResult) => void;
    /** Callback that receives a progress snapshot. */
    onProgress?: (progress: SpiderProgress) => void;
}
|
|
16
|
+
/** Outcome of fetching a single page during a crawl. */
export interface SpiderPageResult {
    /** Address of the page. */
    url: string;
    /** Depth at which the page was found. */
    depth: number;
    /** HTTP status code. */
    status: number;
    /** Page title. */
    title: string;
    /** Links extracted from the page. */
    links: ExtractedLink[];
    /** Time taken for the page. NOTE(review): presumably milliseconds - confirm. */
    duration: number;
    /** Error description; absent when the page was processed without error. */
    error?: string;
}
|
|
25
|
+
/** Snapshot of crawl progress, as passed to `SpiderOptions.onProgress`. */
export interface SpiderProgress {
    /** URL being processed. NOTE(review): exact moment it is reported is not visible here. */
    currentUrl: string;
    /** Crawl depth associated with this update. */
    depth: number;
    /** Number of pages crawled so far. */
    crawled: number;
    /** Number of URLs waiting in the queue. */
    queued: number;
    /** NOTE(review): presumably crawled + queued (or the page limit) - confirm in spider.js. */
    total: number;
}
|
|
32
|
+
/** Aggregate result of a finished crawl. */
export interface SpiderResult {
    /** URL the crawl started from. */
    startUrl: string;
    /** Total crawl time. NOTE(review): presumably milliseconds - confirm. */
    duration: number;
    /** One entry per crawled page. */
    pages: SpiderPageResult[];
    /** Set of visited URLs. Note that a Set does not survive JSON.stringify. */
    visited: Set<string>;
    /** Failures, each with the URL involved and an error message. */
    errors: { url: string; error: string }[];
}
|
|
42
|
+
/**
 * Web crawler. Only the declaration is visible here; behaviour notes below
 * are limited to what the signatures state.
 */
export declare class Spider {
    // Internal state (private, untyped in the emitted declaration).
    private options;
    private client;
    private pool;
    private visited;
    private queue;
    private results;
    private errors;
    private baseHost;
    private running;
    private aborted;
    private pendingCount;

    /** @param options Crawl configuration; may be omitted. */
    constructor(options?: SpiderOptions);

    /**
     * Crawl starting at `startUrl`.
     * @returns A promise for the aggregate crawl result.
     */
    crawl(startUrl: string): Promise<SpiderResult>;

    private crawlPage;

    /** NOTE(review): presumably requests that an in-progress crawl stop - confirm in spider.js. */
    abort(): void;

    /** NOTE(review): presumably true while `crawl()` is in progress - confirm in spider.js. */
    isRunning(): boolean;

    /** Returns a progress snapshot. */
    getProgress(): SpiderProgress;
}
|
|
61
|
+
/**
 * Function form of the crawler: takes a start URL and optional
 * {@link SpiderOptions}, resolves to a {@link SpiderResult}.
 * NOTE(review): presumably a thin wrapper around `new Spider(options).crawl(url)` -
 * the implementation is not visible in this declaration; confirm in spider.js.
 */
export declare function spider(url: string, options?: SpiderOptions): Promise<SpiderResult>;
|