recker 1.0.28 → 1.0.29-next.7cc1d8b

This diff shows the changes between publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
Files changed (39)
  1. package/README.md +28 -1
  2. package/dist/cli/tui/shell.d.ts +1 -0
  3. package/dist/cli/tui/shell.js +339 -5
  4. package/dist/scrape/index.d.ts +2 -0
  5. package/dist/scrape/index.js +1 -0
  6. package/dist/scrape/spider.d.ts +61 -0
  7. package/dist/scrape/spider.js +250 -0
  8. package/dist/seo/analyzer.js +27 -0
  9. package/dist/seo/index.d.ts +3 -1
  10. package/dist/seo/index.js +1 -0
  11. package/dist/seo/rules/accessibility.js +620 -54
  12. package/dist/seo/rules/best-practices.d.ts +2 -0
  13. package/dist/seo/rules/best-practices.js +188 -0
  14. package/dist/seo/rules/crawl.d.ts +2 -0
  15. package/dist/seo/rules/crawl.js +307 -0
  16. package/dist/seo/rules/cwv.d.ts +2 -0
  17. package/dist/seo/rules/cwv.js +337 -0
  18. package/dist/seo/rules/ecommerce.d.ts +2 -0
  19. package/dist/seo/rules/ecommerce.js +252 -0
  20. package/dist/seo/rules/i18n.d.ts +2 -0
  21. package/dist/seo/rules/i18n.js +222 -0
  22. package/dist/seo/rules/index.d.ts +32 -0
  23. package/dist/seo/rules/index.js +71 -0
  24. package/dist/seo/rules/internal-linking.d.ts +2 -0
  25. package/dist/seo/rules/internal-linking.js +375 -0
  26. package/dist/seo/rules/local.d.ts +2 -0
  27. package/dist/seo/rules/local.js +265 -0
  28. package/dist/seo/rules/pwa.d.ts +2 -0
  29. package/dist/seo/rules/pwa.js +302 -0
  30. package/dist/seo/rules/readability.d.ts +2 -0
  31. package/dist/seo/rules/readability.js +255 -0
  32. package/dist/seo/rules/security.js +406 -28
  33. package/dist/seo/rules/social.d.ts +2 -0
  34. package/dist/seo/rules/social.js +373 -0
  35. package/dist/seo/rules/types.d.ts +155 -0
  36. package/dist/seo/seo-spider.d.ts +47 -0
  37. package/dist/seo/seo-spider.js +362 -0
  38. package/dist/seo/types.d.ts +24 -0
  39. package/package.json +1 -1
package/README.md CHANGED
@@ -109,6 +109,8 @@ See [Mini Client documentation](./docs/http/18-mini-client.md) for more.
  | **Type-Safe** | Full TypeScript with Zod schema validation. |
  | **Observable** | DNS/TCP/TLS/TTFB timing breakdown per request. |
  | **Resilient** | Retry, circuit breaker, rate limiting, deduplication. |
+ | **SEO Analysis** | 250+ rules across 21 categories. Site-wide crawling with duplicate detection. |
+ | **Spider Crawler** | Web crawler with URL deduplication, depth control, and concurrency. |
  | **GeoIP (Offline)** | MaxMind GeoLite2 database with bogon detection. |
  | **RDAP Support** | Modern WHOIS with IANA Bootstrap and TLD detection. |

@@ -133,11 +135,31 @@ console.log(response.timings);
  // { dns: 12, tcp: 8, tls: 45, firstByte: 23, total: 156 }
  ```

- ### Scraping
+ ### Scraping & Spider

  ```typescript
+ // Scrape single page
  const doc = await client.scrape('https://example.com');
  const titles = doc.selectAll('h1').map(el => el.text());
+
+ // Crawl entire site
+ import { spider } from 'recker/scrape';
+ const result = await spider('https://example.com', { maxPages: 50 });
+ console.log(`Crawled ${result.pages.length} pages`);
+ ```
+
+ ### SEO Analysis
+
+ ```typescript
+ import { analyzeSeo, seoSpider } from 'recker/seo';
+
+ // Single page analysis - 250+ checks across 21 categories
+ const report = await analyzeSeo(html, { baseUrl: 'https://example.com' });
+ console.log(`Score: ${report.score}/100 (${report.grade})`);
+
+ // Site-wide analysis - detect duplicates and orphan pages
+ const siteReport = await seoSpider('https://example.com', { seo: true });
+ console.log(`Duplicate titles: ${siteReport.summary.duplicateTitles}`);
  ```

  ### Circuit Breaker

@@ -174,6 +196,9 @@ rek -o data.json api.com/export
  # Interactive shell
  rek shell

+ # SEO analysis
+ rek seo https://example.com
+
  # Mock servers for testing
  rek serve http # HTTP on :3000
  rek serve ws # WebSocket on :8080

@@ -187,6 +212,8 @@ See [CLI Documentation](./docs/cli/01-overview.md) for more.
  - **[Quick Start](./docs/http/01-quickstart.md)** - Get running in 2 minutes
  - **[Mini Client](./docs/http/18-mini-client.md)** - Maximum performance mode
  - **[CLI Guide](./docs/cli/01-overview.md)** - Terminal client documentation
+ - **[SEO Analysis](./docs/http/19-seo.md)** - 250+ rules, site-wide crawling
+ - **[Web Scraping](./docs/http/14-scraping.md)** - HTML parsing and Spider crawler
  - **[API Reference](./docs/reference/01-api.md)** - Complete API documentation
  - **[Configuration](./docs/http/05-configuration.md)** - Client options
  - **[Plugins](./docs/http/10-plugins.md)** - Extend functionality
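
The shell changes below consume several of these report fields directly. As a minimal sketch of the same flow in library code, assuming only the fields the shell.js diff actually reads (`score`, `grade`, `openGraph`) and with the `fetch` wiring purely illustrative:

```typescript
import { analyzeSeo } from 'recker/seo';

// Illustrative page fetch; any HTML string works as input.
const html = await (await fetch('https://example.com')).text();
const report = await analyzeSeo(html, { baseUrl: 'https://example.com' });

// Fields the updated TUI prints (see the shell.js hunks below).
console.log(`${report.score}/100 (${report.grade})`);
if (report.openGraph?.title) console.log(`og:title: ${report.openGraph.title}`);
```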
package/dist/cli/tui/shell.d.ts CHANGED
@@ -51,6 +51,7 @@ export declare class RekShell {
  private runRDAP;
  private runPing;
  private runScrap;
+ private runSpider;
  private runSelect;
  private runSelectText;
  private runSelectAttr;
package/dist/cli/tui/shell.js CHANGED
@@ -10,11 +10,12 @@ import { inspectTLS } from '../../utils/tls-inspector.js';
  import { getSecurityRecords } from '../../utils/dns-toolkit.js';
  import { rdap } from '../../utils/rdap.js';
  import { ScrapeDocument } from '../../scrape/document.js';
+ import { Spider } from '../../scrape/spider.js';
  import colors from '../../utils/colors.js';
  import { getShellSearch } from './shell-search.js';
  import { openSearchPanel } from './search-panel.js';
  import { ScrollBuffer, parseScrollKey, parseMouseScroll, disableMouseReporting } from './scroll-buffer.js';
- import { analyzeSeo } from '../../seo/index.js';
+ import { analyzeSeo, SeoSpider } from '../../seo/index.js';
  let highlight;
  async function initDependencies() {
  if (!highlight) {
@@ -94,7 +95,7 @@ export class RekShell {
  'get', 'post', 'put', 'delete', 'patch', 'head', 'options',
  'ws', 'udp', 'load', 'chat', 'ai',
  'whois', 'tls', 'ssl', 'security', 'ip', 'dns', 'dns:propagate', 'dns:email', 'rdap', 'ping',
- 'scrap', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
+ 'scrap', 'spider', '$', '$text', '$attr', '$html', '$links', '$images', '$scripts', '$css', '$sourcemaps', '$unmap', '$unmap:view', '$unmap:save', '$beautify', '$beautify:save', '$table',
  '?', 'search', 'suggest', 'example',
  'help', 'clear', 'exit', 'set', 'url', 'vars', 'env'
  ];
@@ -368,6 +369,9 @@ export class RekShell {
  case 'scrap':
  await this.runScrap(parts[1]);
  return;
+ case 'spider':
+ await this.runSpider(parts.slice(1));
+ return;
  case '$':
  await this.runSelect(parts.slice(1).join(' '));
  return;
@@ -972,11 +976,20 @@ ${colors.bold('Details:')}`);
  const html = await res.text();
  const duration = Math.round(performance.now() - startTime);
  const report = await analyzeSeo(html, { baseUrl: url });
+ const t = res.timings;
+ report.timing = {
+ ttfb: t?.firstByte ? Math.round(t.firstByte) : undefined,
+ total: t?.total ? Math.round(t.total) : duration,
+ dns: t?.dns ? Math.round(t.dns) : undefined,
+ tcp: t?.tcp ? Math.round(t.tcp) : undefined,
+ tls: t?.tls ? Math.round(t.tls) : undefined,
+ download: t?.content ? Math.round(t.content) : undefined,
+ };
  if (jsonOutput) {
  const jsonResult = {
  url,
  analyzedAt: new Date().toISOString(),
- durationMs: duration,
+ timing: report.timing,
  score: report.score,
  grade: report.grade,
  title: report.title,
@@ -985,8 +998,9 @@ ${colors.bold('Details:')}`);
  headings: report.headings,
  links: report.links,
  images: report.images,
- openGraph: report.social.openGraph,
- twitterCard: report.social.twitterCard,
+ openGraph: report.openGraph,
+ twitterCard: report.twitterCard,
+ social: report.social,
  jsonLd: report.jsonLd,
  technical: report.technical,
  checks: report.checks,
@@ -1024,6 +1038,50 @@ Grade: ${gradeColor(colors.bold(report.grade))} (${report.score}/100)
  : report.metaDescription.text;
  console.log(colors.bold('Description:') + ` ${desc} ` + colors.gray(`(${report.metaDescription.length} chars)`));
  }
+ if (report.openGraph && Object.values(report.openGraph).some(v => v)) {
+ console.log('');
+ console.log(colors.bold(colors.cyan('OpenGraph:')));
+ if (report.openGraph.title) {
+ const ogTitle = report.openGraph.title.length > 60
+ ? report.openGraph.title.slice(0, 57) + '...'
+ : report.openGraph.title;
+ console.log(` ${colors.gray('og:title:')} ${ogTitle}`);
+ }
+ if (report.openGraph.description) {
+ const ogDesc = report.openGraph.description.length > 60
+ ? report.openGraph.description.slice(0, 57) + '...'
+ : report.openGraph.description;
+ console.log(` ${colors.gray('og:description:')} ${ogDesc}`);
+ }
+ if (report.openGraph.image) {
+ const ogImg = report.openGraph.image.length > 50
+ ? '...' + report.openGraph.image.slice(-47)
+ : report.openGraph.image;
+ console.log(` ${colors.gray('og:image:')} ${colors.blue(ogImg)}`);
+ }
+ if (report.openGraph.type) {
+ console.log(` ${colors.gray('og:type:')} ${report.openGraph.type}`);
+ }
+ }
+ if (report.timing) {
+ const t = report.timing;
+ console.log('');
+ console.log(colors.bold('Timing:'));
+ const timings = [];
+ if (t.dns !== undefined)
+ timings.push(`DNS ${t.dns}ms`);
+ if (t.tcp !== undefined)
+ timings.push(`TCP ${t.tcp}ms`);
+ if (t.tls !== undefined)
+ timings.push(`TLS ${t.tls}ms`);
+ if (t.ttfb !== undefined)
+ timings.push(`TTFB ${t.ttfb}ms`);
+ if (t.download !== undefined)
+ timings.push(`Download ${t.download}ms`);
+ if (t.total !== undefined)
+ timings.push(`Total ${t.total}ms`);
+ console.log(` ${timings.join(' → ')}`);
+ }
  if (report.content) {
  console.log(colors.bold('Content:') + ` ${report.content.wordCount} words, ${report.content.paragraphCount} paragraphs, ~${report.content.readingTimeMinutes} min read`);
  }
@@ -1434,6 +1492,274 @@ ${colors.bold('Network:')}
  }
  console.log('');
  }
+ async runSpider(args) {
+ let url = '';
+ let maxDepth = 3;
+ let maxPages = 100;
+ let concurrency = 5;
+ let seoEnabled = false;
+ let outputFile = '';
+ for (let i = 0; i < args.length; i++) {
+ const arg = args[i];
+ if (arg.startsWith('depth=')) {
+ maxDepth = parseInt(arg.split('=')[1]) || 4;
+ }
+ else if (arg.startsWith('limit=')) {
+ maxPages = parseInt(arg.split('=')[1]) || 100;
+ }
+ else if (arg.startsWith('concurrency=')) {
+ concurrency = parseInt(arg.split('=')[1]) || 5;
+ }
+ else if (arg === 'seo') {
+ seoEnabled = true;
+ }
+ else if (arg.startsWith('output=')) {
+ outputFile = arg.split('=')[1] || '';
+ }
+ else if (!arg.includes('=')) {
+ url = arg;
+ }
+ }
+ if (!url) {
+ if (!this.baseUrl) {
+ console.log(colors.yellow('Usage: spider <url> [options]'));
+ console.log(colors.gray(' Options:'));
+ console.log(colors.gray(' depth=4 Max crawl depth'));
+ console.log(colors.gray(' limit=100 Max pages to crawl'));
+ console.log(colors.gray(' concurrency=5 Concurrent requests'));
+ console.log(colors.gray(' seo Enable SEO analysis'));
+ console.log(colors.gray(' output=file.json Save JSON report'));
+ console.log(colors.gray(' Examples:'));
+ console.log(colors.gray(' spider example.com'));
+ console.log(colors.gray(' spider example.com depth=2 limit=50'));
+ console.log(colors.gray(' spider example.com seo output=seo-report.json'));
+ return;
+ }
+ url = this.baseUrl;
+ }
+ else if (!url.startsWith('http')) {
+ url = `https://${url}`;
+ }
+ console.log(colors.cyan(`\nSpider starting: ${url}`));
+ const modeLabel = seoEnabled ? colors.magenta(' + SEO') : '';
+ console.log(colors.gray(` Depth: ${maxDepth} | Limit: ${maxPages} | Concurrency: ${concurrency}${modeLabel}`));
+ if (outputFile) {
+ console.log(colors.gray(` Output: ${outputFile}`));
+ }
+ console.log('');
+ if (seoEnabled) {
+ const seoSpider = new SeoSpider({
+ maxDepth,
+ maxPages,
+ concurrency,
+ sameDomain: true,
+ delay: 100,
+ seo: true,
+ output: outputFile || undefined,
+ onProgress: (progress) => {
+ process.stdout.write(`\r${colors.gray(' Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth} `);
+ },
+ });
+ try {
+ const result = await seoSpider.crawl(url);
+ process.stdout.write('\r' + ' '.repeat(80) + '\r');
+ console.log(colors.green(`\n✔ SEO Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
+ console.log(` ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
+ console.log(` ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
+ console.log(` ${colors.cyan('Avg SEO Score')}: ${result.summary.avgScore}/100`);
+ const responseTimes = result.pages.filter(p => p.duration > 0).map(p => p.duration);
+ const avgResponseTime = responseTimes.length > 0
+ ? Math.round(responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length)
+ : 0;
+ const minResponseTime = responseTimes.length > 0 ? Math.min(...responseTimes) : 0;
+ const maxResponseTime = responseTimes.length > 0 ? Math.max(...responseTimes) : 0;
+ const reqPerSec = result.duration > 0 ? (result.pages.length / (result.duration / 1000)).toFixed(1) : '0';
+ const statusCounts = new Map();
+ for (const page of result.pages) {
+ const status = page.status || 0;
+ statusCounts.set(status, (statusCounts.get(status) || 0) + 1);
+ }
+ let totalInternalLinks = 0;
+ let totalExternalLinks = 0;
+ let totalImages = 0;
+ let imagesWithoutAlt = 0;
+ let pagesWithoutTitle = 0;
+ let pagesWithoutDescription = 0;
+ for (const page of result.pages) {
+ if (page.seoReport) {
+ totalInternalLinks += page.seoReport.links?.internal || 0;
+ totalExternalLinks += page.seoReport.links?.external || 0;
+ totalImages += page.seoReport.images?.total || 0;
+ imagesWithoutAlt += page.seoReport.images?.withoutAlt || 0;
+ if (!page.seoReport.title?.text)
+ pagesWithoutTitle++;
+ if (!page.seoReport.metaDescription?.text)
+ pagesWithoutDescription++;
+ }
+ }
+ console.log(colors.bold('\n Performance:'));
+ console.log(` ${colors.gray('Avg Response:')} ${avgResponseTime}ms`);
+ console.log(` ${colors.gray('Min/Max:')} ${minResponseTime}ms / ${maxResponseTime}ms`);
+ console.log(` ${colors.gray('Throughput:')} ${reqPerSec} req/s`);
+ console.log(colors.bold('\n HTTP Status:'));
+ const sortedStatuses = Array.from(statusCounts.entries()).sort((a, b) => b[1] - a[1]);
+ for (const [status, count] of sortedStatuses.slice(0, 5)) {
+ const statusLabel = status === 0 ? 'Error' : status.toString();
+ const statusColor = status >= 400 || status === 0 ? colors.red :
+ status >= 300 ? colors.yellow : colors.green;
+ const pct = ((count / result.pages.length) * 100).toFixed(0);
+ console.log(` ${statusColor(statusLabel.padEnd(5))} ${count.toString().padStart(3)} (${pct}%)`);
+ }
+ console.log(colors.bold('\n Content:'));
+ console.log(` ${colors.gray('Internal links:')} ${totalInternalLinks.toLocaleString()}`);
+ console.log(` ${colors.gray('External links:')} ${totalExternalLinks.toLocaleString()}`);
+ console.log(` ${colors.gray('Images:')} ${totalImages.toLocaleString()} (${imagesWithoutAlt} missing alt)`);
+ console.log(` ${colors.gray('Missing title:')} ${pagesWithoutTitle}`);
+ console.log(` ${colors.gray('Missing desc:')} ${pagesWithoutDescription}`);
+ console.log(colors.bold('\n SEO Summary:'));
+ const { summary } = result;
+ console.log(` ${colors.red('✗')} Pages with errors: ${summary.pagesWithErrors}`);
+ console.log(` ${colors.yellow('⚠')} Pages with warnings: ${summary.pagesWithWarnings}`);
+ console.log(` ${colors.magenta('⚐')} Duplicate titles: ${summary.duplicateTitles}`);
+ console.log(` ${colors.magenta('⚐')} Duplicate descriptions:${summary.duplicateDescriptions}`);
+ console.log(` ${colors.magenta('⚐')} Duplicate H1s: ${summary.duplicateH1s}`);
+ console.log(` ${colors.gray('○')} Orphan pages: ${summary.orphanPages}`);
+ if (result.siteWideIssues.length > 0) {
+ console.log(colors.bold('\n Site-Wide Issues:'));
+ for (const issue of result.siteWideIssues.slice(0, 10)) {
+ const icon = issue.severity === 'error' ? colors.red('✗') :
+ issue.severity === 'warning' ? colors.yellow('⚠') : colors.gray('○');
+ console.log(` ${icon} ${issue.message}`);
+ if (issue.value) {
+ const truncatedValue = issue.value.length > 50 ? issue.value.slice(0, 47) + '...' : issue.value;
+ console.log(` ${colors.gray(`"${truncatedValue}"`)}`);
+ }
+ const uniquePaths = [...new Set(issue.affectedUrls.map(u => new URL(u).pathname))];
+ if (uniquePaths.length <= 3) {
+ for (const path of uniquePaths) {
+ console.log(` ${colors.gray('→')} ${path}`);
+ }
+ }
+ else {
+ console.log(` ${colors.gray(`→ ${uniquePaths.length} pages affected`)}`);
+ }
+ }
+ if (result.siteWideIssues.length > 10) {
+ console.log(colors.gray(` ... and ${result.siteWideIssues.length - 10} more issues`));
+ }
+ }
+ const pagesWithScores = result.pages
+ .filter(p => p.seoReport)
+ .sort((a, b) => (a.seoReport?.score || 0) - (b.seoReport?.score || 0));
+ const seenPaths = new Set();
+ const uniquePages = pagesWithScores.filter(page => {
+ const path = new URL(page.url).pathname;
+ if (seenPaths.has(path))
+ return false;
+ seenPaths.add(path);
+ return true;
+ });
+ if (uniquePages.length > 0) {
+ console.log(colors.bold('\n Pages by SEO Score:'));
+ const worstPages = uniquePages.slice(0, 5);
+ for (const page of worstPages) {
+ const score = page.seoReport?.score || 0;
+ const grade = page.seoReport?.grade || '?';
+ const path = new URL(page.url).pathname;
+ const scoreColor = score >= 80 ? colors.green : score >= 60 ? colors.yellow : colors.red;
+ console.log(` ${scoreColor(`${score.toString().padStart(3)}`)} ${colors.gray(`[${grade}]`)} ${path.slice(0, 50)}`);
+ }
+ if (uniquePages.length > 5) {
+ console.log(colors.gray(` ... and ${uniquePages.length - 5} more pages`));
+ }
+ }
+ if (outputFile) {
+ console.log(colors.green(`\n Report saved to: ${outputFile}`));
+ }
+ this.lastResponse = result;
+ console.log(colors.gray('\n Result stored in lastResponse.'));
+ }
+ catch (error) {
+ console.error(colors.red(`SEO Spider failed: ${error.message}`));
+ }
+ }
+ else {
+ const spider = new Spider({
+ maxDepth,
+ maxPages,
+ concurrency,
+ sameDomain: true,
+ delay: 100,
+ onProgress: (progress) => {
+ process.stdout.write(`\r${colors.gray(' Crawling:')} ${colors.cyan(progress.crawled.toString())} pages | ${colors.gray('Queue:')} ${progress.queued} | ${colors.gray('Depth:')} ${progress.depth} `);
+ },
+ });
+ try {
+ const result = await spider.crawl(url);
+ process.stdout.write('\r' + ' '.repeat(80) + '\r');
+ console.log(colors.green(`\n✔ Spider complete`) + colors.gray(` (${(result.duration / 1000).toFixed(1)}s)`));
+ console.log(` ${colors.cyan('Pages crawled')}: ${result.pages.length}`);
+ console.log(` ${colors.cyan('Unique URLs')}: ${result.visited.size}`);
+ console.log(` ${colors.cyan('Errors')}: ${result.errors.length}`);
+ const byDepth = new Map();
+ for (const page of result.pages) {
+ byDepth.set(page.depth, (byDepth.get(page.depth) || 0) + 1);
+ }
+ console.log(colors.bold('\n Pages by depth:'));
+ for (const [depth, count] of Array.from(byDepth.entries()).sort((a, b) => a[0] - b[0])) {
+ const bar = '█'.repeat(Math.min(count, 40));
+ console.log(` ${colors.gray(`d${depth}:`)} ${bar} ${count}`);
+ }
+ const topPages = [...result.pages]
+ .filter(p => !p.error)
+ .sort((a, b) => b.links.length - a.links.length)
+ .slice(0, 10);
+ if (topPages.length > 0) {
+ console.log(colors.bold('\n Top pages by outgoing links:'));
+ for (const page of topPages) {
+ const title = page.title.slice(0, 40) || new URL(page.url).pathname;
+ console.log(` ${colors.cyan(page.links.length.toString().padStart(3))} ${title}`);
+ }
+ }
+ const formatError = (error) => {
+ const statusMatch = error.match(/status code (\d{3})/i);
+ if (statusMatch) {
+ return `HTTP ${statusMatch[1]}`;
+ }
+ return error.length > 50 ? error.slice(0, 47) + '...' : error;
+ };
+ if (result.errors.length > 0 && result.errors.length <= 10) {
+ console.log(colors.bold('\n Errors:'));
+ for (const err of result.errors) {
+ const path = new URL(err.url).pathname;
+ console.log(` ${colors.red('✗')} ${path.padEnd(25)} ${colors.gray('→')} ${formatError(err.error)}`);
+ }
+ }
+ else if (result.errors.length > 10) {
+ console.log(colors.yellow(`\n ${result.errors.length} errors (showing first 10):`));
+ for (const err of result.errors.slice(0, 10)) {
+ const path = new URL(err.url).pathname;
+ console.log(` ${colors.red('✗')} ${path.padEnd(25)} ${colors.gray('→')} ${formatError(err.error)}`);
+ }
+ }
+ if (outputFile) {
+ const reportData = {
+ ...result,
+ visited: Array.from(result.visited),
+ generatedAt: new Date().toISOString(),
+ };
+ await fs.writeFile(outputFile, JSON.stringify(reportData, null, 2), 'utf-8');
+ console.log(colors.green(`\n Report saved to: ${outputFile}`));
+ }
+ this.lastResponse = result;
+ console.log(colors.gray('\n Result stored in lastResponse. Use $links to explore.'));
+ }
+ catch (error) {
+ console.error(colors.red(`Spider failed: ${error.message}`));
+ }
+ }
+ console.log('');
+ }
  async runSelect(selector) {
  if (!this.currentDoc) {
  console.log(colors.yellow('No document loaded. Use "scrap <url>" first.'));
@@ -2358,6 +2684,13 @@ ${colors.bold('Network:')}
  ${colors.green('$beautify:save [f]')} Save beautified code to file.
  ${colors.green('$table <selector>')} Extract table as data.

+ ${colors.bold('Web Crawler:')}
+ ${colors.green('spider <url>')} Crawl website following internal links.
+ ${colors.gray('Options:')}
+ ${colors.white('--depth=4')} ${colors.gray('Maximum depth to crawl')}
+ ${colors.white('--limit=100')} ${colors.gray('Maximum pages to crawl')}
+ ${colors.white('--concurrency=5')} ${colors.gray('Parallel requests')}
+
  ${colors.bold('Documentation:')}
  ${colors.green('? <query>')} Search Recker documentation.
  ${colors.green('search <query>')} Alias for ? (hybrid fuzzy+semantic search).
@@ -2375,6 +2708,7 @@ ${colors.bold('Network:')}
  › post /post name="Neo" active:=true role:Admin
  › load /heavy-endpoint users=100 mode=stress
  › chat openai gpt-5.1
+ › spider example.com depth=2 limit=50
  `);
  }
  }
package/dist/scrape/index.d.ts CHANGED
@@ -1,4 +1,6 @@
  export { ScrapeDocument } from './document.js';
  export { ScrapeElement } from './element.js';
+ export { Spider, spider } from './spider.js';
+ export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
  export type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedFormField, ExtractedTable, ExtractedScript, ExtractedStyle, ExtractionSchema, ExtractionSchemaField, ScrapeOptions, LinkExtractionOptions, ImageExtractionOptions, } from './types.js';
package/dist/scrape/index.js CHANGED
@@ -1,3 +1,4 @@
  export { ScrapeDocument } from './document.js';
  export { ScrapeElement } from './element.js';
+ export { Spider, spider } from './spider.js';
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
package/dist/scrape/spider.d.ts ADDED
@@ -0,0 +1,61 @@
+ import type { ExtractedLink } from './types.js';
+ export interface SpiderOptions {
+ maxDepth?: number;
+ maxPages?: number;
+ sameDomain?: boolean;
+ concurrency?: number;
+ timeout?: number;
+ delay?: number;
+ exclude?: RegExp[];
+ include?: RegExp[];
+ userAgent?: string;
+ respectRobotsTxt?: boolean;
+ onPage?: (result: SpiderPageResult) => void;
+ onProgress?: (progress: SpiderProgress) => void;
+ }
+ export interface SpiderPageResult {
+ url: string;
+ status: number;
+ title: string;
+ depth: number;
+ links: ExtractedLink[];
+ duration: number;
+ error?: string;
+ }
+ export interface SpiderProgress {
+ crawled: number;
+ queued: number;
+ total: number;
+ currentUrl: string;
+ depth: number;
+ }
+ export interface SpiderResult {
+ startUrl: string;
+ pages: SpiderPageResult[];
+ visited: Set<string>;
+ duration: number;
+ errors: Array<{
+ url: string;
+ error: string;
+ }>;
+ }
+ export declare class Spider {
+ private options;
+ private client;
+ private pool;
+ private visited;
+ private queue;
+ private results;
+ private errors;
+ private baseHost;
+ private running;
+ private aborted;
+ private pendingCount;
+ constructor(options?: SpiderOptions);
+ crawl(startUrl: string): Promise<SpiderResult>;
+ private crawlPage;
+ abort(): void;
+ isRunning(): boolean;
+ getProgress(): SpiderProgress;
+ }
+ export declare function spider(url: string, options?: SpiderOptions): Promise<SpiderResult>;
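
Taken together with the README snippet above, these declarations suggest the following usage. A minimal sketch only: option values are illustrative, and the `exclude` pattern is hypothetical, not a default.

```typescript
import { Spider, spider, type SpiderResult } from 'recker/scrape';

// One-shot helper, as shown in the README diff.
const quick: SpiderResult = await spider('https://example.com', { maxPages: 50 });
console.log(`Crawled ${quick.pages.length} pages in ${quick.duration}ms`);

// Class form, when you need progress reporting or cancellation.
const crawler = new Spider({
  maxDepth: 2,
  concurrency: 5,
  sameDomain: true,
  exclude: [/\/login/], // hypothetical filter; RegExp[] per SpiderOptions
  onProgress: (p) => process.stdout.write(`\r${p.crawled} crawled, ${p.queued} queued`),
});
const result = await crawler.crawl('https://example.com');
for (const err of result.errors) console.error(`${err.url}: ${err.error}`);
```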