@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@crawlith/core",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -15,7 +15,7 @@
15
15
  "better-sqlite3": "^12.6.2",
16
16
  "chalk": "^5.3.0",
17
17
  "cheerio": "^1.0.0-rc.12",
18
- "p-limit": "^5.0.0",
18
+ "p-limit": "^7.3.0",
19
19
  "robots-parser": "^3.0.1",
20
20
  "undici": "^6.13.0",
21
21
  "vite": "7.3.1"
@@ -27,7 +27,7 @@
27
27
  "vitest": "^4.0.18"
28
28
  },
29
29
  "scripts": {
30
- "build": "tsc",
30
+ "build": "tsc && node scripts/copy-assets.js",
31
31
  "test": "vitest run"
32
32
  }
33
33
  }
@@ -0,0 +1,37 @@
/**
 * Post-build step: copies the static HTML templates from `src/` into the
 * matching folders under `dist/`.
 *
 * The script is best-effort: a missing source template does not fail the
 * build, but it is reported on stderr so a broken package is noticed.
 */
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Package root, i.e. the parent of the folder this script lives in.
const packageRoot = path.join(__dirname, '..');

// Destination folders, relative to the package root. Both are always created,
// even when the templates that belong in them are missing.
const DEST_DIRS = Object.freeze(['dist/report', 'dist/analysis']);

// [source, destination] pairs, relative to the package root.
const ASSETS = Object.freeze([
  ['src/report/crawl.html', 'dist/report/crawl.html'],
  ['src/analysis/analysis_list.html', 'dist/analysis/analysis_list.html'],
  ['src/analysis/analysis_page.html', 'dist/analysis/analysis_page.html'],
]);

// Ensure dist directories exist. `recursive: true` makes mkdirSync a no-op
// for a directory that is already there, so no existence check is needed.
for (const relativeDir of DEST_DIRS) {
  fs.mkdirSync(path.join(packageRoot, relativeDir), { recursive: true });
}

// Copy each template, overwriting any stale copy left in dist/.
for (const [relativeSrc, relativeDest] of ASSETS) {
  const src = path.join(packageRoot, relativeSrc);
  const dest = path.join(packageRoot, relativeDest);

  if (!fs.existsSync(src)) {
    // Keep going, but make the gap visible instead of skipping silently.
    console.warn(`copy-assets: source asset not found, skipped: ${src}`);
    continue;
  }

  fs.copyFileSync(src, dest);
}
@@ -0,0 +1,35 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <title>Crawlith Analysis Report</title>
6
+ <style>
7
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; color: #333; }
8
+ h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
9
+ table { width: 100%; border-collapse: collapse; margin-top: 20px; }
10
+ th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; }
11
+ th { background-color: #f4f4f4; }
12
+ tr:nth-child(even) { background-color: #f9f9f9; }
13
+ tr:hover { background-color: #f1f1f1; }
14
+ </style>
15
+ </head>
16
+ <body>
17
+ <h1>Analysis</h1>
18
+ <p>Pages: {{PAGES_ANALYZED}}</p>
19
+ <p>Average SEO: {{AVG_SEO_SCORE}}</p>
20
+ <table border="1" cellspacing="0" cellpadding="6">
21
+ <thead>
22
+ <tr>
23
+ <th>URL</th>
24
+ <th>SEO Score</th>
25
+ <th>Thin Score</th>
26
+ <th>Title</th>
27
+ <th>Meta</th>
28
+ </tr>
29
+ </thead>
30
+ <tbody>
31
+ {{ROWS}}
32
+ </tbody>
33
+ </table>
34
+ </body>
35
+ </html>
@@ -0,0 +1,123 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Analysis for {{URL}}</title>
7
+ <style>
8
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
9
+ h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
10
+ h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
11
+ .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
12
+ .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
13
+ .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
14
+ .status-ok { color: green; font-weight: bold; }
15
+ .status-warning { color: orange; font-weight: bold; }
16
+ .status-critical { color: red; font-weight: bold; }
17
+ .status-missing { color: red; font-weight: bold; }
18
+ .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
19
+ .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
20
+ .data-table th { width: 150px; color: #666; }
21
+ code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
22
+ </style>
23
+ </head>
24
+ <body>
25
+ <h1>Page Analysis</h1>
26
+ <p><strong>URL:</strong> <a href="{{URL}}" target="_blank">{{URL}}</a></p>
27
+
28
+ <div class="score-card">
29
+ <div class="score-box">
30
+ <div class="score-val">{{SEO_SCORE}}</div>
31
+ <div>SEO Score</div>
32
+ </div>
33
+ <div class="score-box">
34
+ <div class="score-val">{{THIN_SCORE}}</div>
35
+ <div>Thin Content Score</div>
36
+ </div>
37
+ <div class="score-box">
38
+ <div class="score-val">{{HTTP_STATUS}}</div>
39
+ <div>HTTP Status</div>
40
+ </div>
41
+ </div>
42
+
43
+ <h2>Meta Tags</h2>
44
+ <table class="data-table">
45
+ <tr>
46
+ <th>Title</th>
47
+ <td>
48
+ <div>{{TITLE_VALUE}}</div>
49
+ <small>Length: {{TITLE_LENGTH}} | Status: <span class="status-{{TITLE_STATUS}}">{{TITLE_STATUS}}</span></small>
50
+ </td>
51
+ </tr>
52
+ <tr>
53
+ <th>Description</th>
54
+ <td>
55
+ <div>{{META_DESCRIPTION_VALUE}}</div>
56
+ <small>Length: {{META_DESCRIPTION_LENGTH}} | Status: <span class="status-{{META_DESCRIPTION_STATUS}}">{{META_DESCRIPTION_STATUS}}</span></small>
57
+ </td>
58
+ </tr>
59
+ <tr>
60
+ <th>Canonical</th>
61
+ <td>{{CANONICAL}}</td>
62
+ </tr>
63
+ <tr>
64
+ <th>Robots</th>
65
+ <td>
66
+ Index: {{ROBOTS_INDEX}},
67
+ Follow: {{ROBOTS_FOLLOW}}
68
+ </td>
69
+ </tr>
70
+ </table>
71
+
72
+ <h2>Content & Heading</h2>
73
+ <table class="data-table">
74
+ <tr>
75
+ <th>H1 Tag</th>
76
+ <td>
77
+ Status: <span class="status-{{H1_STATUS}}">{{H1_STATUS}}</span>
78
+ ({{H1_COUNT}} detected)
79
+ {{H1_MATCHES_TITLE}}
80
+ </td>
81
+ </tr>
82
+ <tr>
83
+ <th>Word Count</th>
84
+ <td>{{WORD_COUNT}} words</td>
85
+ </tr>
86
+ <tr>
87
+ <th>Unique Sentences</th>
88
+ <td>{{UNIQUE_SENTENCES}}</td>
89
+ </tr>
90
+ <tr>
91
+ <th>Text / HTML Ratio</th>
92
+ <td>{{TEXT_HTML_RATIO}}%</td>
93
+ </tr>
94
+ </table>
95
+
96
+ <h2>Links & Images</h2>
97
+ <table class="data-table">
98
+ <tr>
99
+ <th>Internal Links</th>
100
+ <td>{{INTERNAL_LINKS}}</td>
101
+ </tr>
102
+ <tr>
103
+ <th>External Links</th>
104
+ <td>{{EXTERNAL_LINKS}} ({{EXTERNAL_RATIO}}%)</td>
105
+ </tr>
106
+ <tr>
107
+ <th>Images</th>
108
+ <td>{{TOTAL_IMAGES}} total ({{MISSING_ALT}} missing alt text)</td>
109
+ </tr>
110
+ </table>
111
+
112
+ <h2>Structured Data</h2>
113
+ <table class="data-table">
114
+ <tr>
115
+ <th>Status</th>
116
+ <td>
117
+ {{STRUCTURED_DATA_STATUS}}
118
+ </td>
119
+ </tr>
120
+ {{STRUCTURED_DATA_TYPES_ROW}}
121
+ </table>
122
+ </body>
123
+ </html>