@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlith/core",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"better-sqlite3": "^12.6.2",
|
|
16
16
|
"chalk": "^5.3.0",
|
|
17
17
|
"cheerio": "^1.0.0-rc.12",
|
|
18
|
-
"p-limit": "^
|
|
18
|
+
"p-limit": "^7.3.0",
|
|
19
19
|
"robots-parser": "^3.0.1",
|
|
20
20
|
"undici": "^6.13.0",
|
|
21
21
|
"vite": "7.3.1"
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
"vitest": "^4.0.18"
|
|
28
28
|
},
|
|
29
29
|
"scripts": {
|
|
30
|
-
"build": "tsc",
|
|
30
|
+
"build": "tsc && node scripts/copy-assets.js",
|
|
31
31
|
"test": "vitest run"
|
|
32
32
|
}
|
|
33
33
|
}
|
|
package/scripts/copy-assets.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
|
|
5
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
6
|
+
const __dirname = path.dirname(__filename);
|
|
7
|
+
|
|
8
|
+
// Ensure dist directories exist
|
|
9
|
+
const reportDestDir = path.join(__dirname, '../dist/report');
|
|
10
|
+
if (!fs.existsSync(reportDestDir)) {
|
|
11
|
+
fs.mkdirSync(reportDestDir, { recursive: true });
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
const analysisDestDir = path.join(__dirname, '../dist/analysis');
|
|
15
|
+
if (!fs.existsSync(analysisDestDir)) {
|
|
16
|
+
fs.mkdirSync(analysisDestDir, { recursive: true });
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// Copy Report Assets
|
|
20
|
+
const crawlSrc = path.join(__dirname, '../src/report/crawl.html');
|
|
21
|
+
const crawlDest = path.join(reportDestDir, 'crawl.html');
|
|
22
|
+
if (fs.existsSync(crawlSrc)) {
|
|
23
|
+
fs.copyFileSync(crawlSrc, crawlDest);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// Copy Analysis Assets
|
|
27
|
+
const analysisListSrc = path.join(__dirname, '../src/analysis/analysis_list.html');
|
|
28
|
+
const analysisListDest = path.join(analysisDestDir, 'analysis_list.html');
|
|
29
|
+
if (fs.existsSync(analysisListSrc)) {
|
|
30
|
+
fs.copyFileSync(analysisListSrc, analysisListDest);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
const analysisPageSrc = path.join(__dirname, '../src/analysis/analysis_page.html');
|
|
34
|
+
const analysisPageDest = path.join(analysisDestDir, 'analysis_page.html');
|
|
35
|
+
if (fs.existsSync(analysisPageSrc)) {
|
|
36
|
+
fs.copyFileSync(analysisPageSrc, analysisPageDest);
|
|
37
|
+
}
|
|
package/src/analysis/analysis_list.html
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8" />
|
|
5
|
+
<title>Crawlith Analysis Report</title>
|
|
6
|
+
<style>
|
|
7
|
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; color: #333; }
|
|
8
|
+
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
9
|
+
table { width: 100%; border-collapse: collapse; margin-top: 20px; }
|
|
10
|
+
th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; }
|
|
11
|
+
th { background-color: #f4f4f4; }
|
|
12
|
+
tr:nth-child(even) { background-color: #f9f9f9; }
|
|
13
|
+
tr:hover { background-color: #f1f1f1; }
|
|
14
|
+
</style>
|
|
15
|
+
</head>
|
|
16
|
+
<body>
|
|
17
|
+
<h1>Analysis</h1>
|
|
18
|
+
<p>Pages: {{PAGES_ANALYZED}}</p>
|
|
19
|
+
<p>Average SEO: {{AVG_SEO_SCORE}}</p>
|
|
20
|
+
<table border="1" cellspacing="0" cellpadding="6">
|
|
21
|
+
<thead>
|
|
22
|
+
<tr>
|
|
23
|
+
<th>URL</th>
|
|
24
|
+
<th>SEO Score</th>
|
|
25
|
+
<th>Thin Score</th>
|
|
26
|
+
<th>Title</th>
|
|
27
|
+
<th>Meta</th>
|
|
28
|
+
</tr>
|
|
29
|
+
</thead>
|
|
30
|
+
<tbody>
|
|
31
|
+
{{ROWS}}
|
|
32
|
+
</tbody>
|
|
33
|
+
</table>
|
|
34
|
+
</body>
|
|
35
|
+
</html>
|
|
package/src/analysis/analysis_page.html
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>Analysis for {{URL}}</title>
|
|
7
|
+
<style>
|
|
8
|
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
|
|
9
|
+
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
10
|
+
h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
|
|
11
|
+
.score-card { display: flex; gap: 20px; margin-bottom: 30px; }
|
|
12
|
+
.score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
|
|
13
|
+
.score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
|
|
14
|
+
.status-ok { color: green; font-weight: bold; }
|
|
15
|
+
.status-warning { color: orange; font-weight: bold; }
|
|
16
|
+
.status-critical { color: red; font-weight: bold; }
|
|
17
|
+
.status-missing { color: red; font-weight: bold; }
|
|
18
|
+
.data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
|
|
19
|
+
.data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
|
|
20
|
+
.data-table th { width: 150px; color: #666; }
|
|
21
|
+
code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
|
|
22
|
+
</style>
|
|
23
|
+
</head>
|
|
24
|
+
<body>
|
|
25
|
+
<h1>Page Analysis</h1>
|
|
26
|
+
<p><strong>URL:</strong> <a href="{{URL}}" target="_blank">{{URL}}</a></p>
|
|
27
|
+
|
|
28
|
+
<div class="score-card">
|
|
29
|
+
<div class="score-box">
|
|
30
|
+
<div class="score-val">{{SEO_SCORE}}</div>
|
|
31
|
+
<div>SEO Score</div>
|
|
32
|
+
</div>
|
|
33
|
+
<div class="score-box">
|
|
34
|
+
<div class="score-val">{{THIN_SCORE}}</div>
|
|
35
|
+
<div>Thin Content Score</div>
|
|
36
|
+
</div>
|
|
37
|
+
<div class="score-box">
|
|
38
|
+
<div class="score-val">{{HTTP_STATUS}}</div>
|
|
39
|
+
<div>HTTP Status</div>
|
|
40
|
+
</div>
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
<h2>Meta Tags</h2>
|
|
44
|
+
<table class="data-table">
|
|
45
|
+
<tr>
|
|
46
|
+
<th>Title</th>
|
|
47
|
+
<td>
|
|
48
|
+
<div>{{TITLE_VALUE}}</div>
|
|
49
|
+
<small>Length: {{TITLE_LENGTH}} | Status: <span class="status-{{TITLE_STATUS}}">{{TITLE_STATUS}}</span></small>
|
|
50
|
+
</td>
|
|
51
|
+
</tr>
|
|
52
|
+
<tr>
|
|
53
|
+
<th>Description</th>
|
|
54
|
+
<td>
|
|
55
|
+
<div>{{META_DESCRIPTION_VALUE}}</div>
|
|
56
|
+
<small>Length: {{META_DESCRIPTION_LENGTH}} | Status: <span class="status-{{META_DESCRIPTION_STATUS}}">{{META_DESCRIPTION_STATUS}}</span></small>
|
|
57
|
+
</td>
|
|
58
|
+
</tr>
|
|
59
|
+
<tr>
|
|
60
|
+
<th>Canonical</th>
|
|
61
|
+
<td>{{CANONICAL}}</td>
|
|
62
|
+
</tr>
|
|
63
|
+
<tr>
|
|
64
|
+
<th>Robots</th>
|
|
65
|
+
<td>
|
|
66
|
+
Index: {{ROBOTS_INDEX}},
|
|
67
|
+
Follow: {{ROBOTS_FOLLOW}}
|
|
68
|
+
</td>
|
|
69
|
+
</tr>
|
|
70
|
+
</table>
|
|
71
|
+
|
|
72
|
+
<h2>Content & Heading</h2>
|
|
73
|
+
<table class="data-table">
|
|
74
|
+
<tr>
|
|
75
|
+
<th>H1 Tag</th>
|
|
76
|
+
<td>
|
|
77
|
+
Status: <span class="status-{{H1_STATUS}}">{{H1_STATUS}}</span>
|
|
78
|
+
({{H1_COUNT}} detected)
|
|
79
|
+
{{H1_MATCHES_TITLE}}
|
|
80
|
+
</td>
|
|
81
|
+
</tr>
|
|
82
|
+
<tr>
|
|
83
|
+
<th>Word Count</th>
|
|
84
|
+
<td>{{WORD_COUNT}} words</td>
|
|
85
|
+
</tr>
|
|
86
|
+
<tr>
|
|
87
|
+
<th>Unique Sentences</th>
|
|
88
|
+
<td>{{UNIQUE_SENTENCES}}</td>
|
|
89
|
+
</tr>
|
|
90
|
+
<tr>
|
|
91
|
+
<th>Text / HTML Ratio</th>
|
|
92
|
+
<td>{{TEXT_HTML_RATIO}}%</td>
|
|
93
|
+
</tr>
|
|
94
|
+
</table>
|
|
95
|
+
|
|
96
|
+
<h2>Links & Images</h2>
|
|
97
|
+
<table class="data-table">
|
|
98
|
+
<tr>
|
|
99
|
+
<th>Internal Links</th>
|
|
100
|
+
<td>{{INTERNAL_LINKS}}</td>
|
|
101
|
+
</tr>
|
|
102
|
+
<tr>
|
|
103
|
+
<th>External Links</th>
|
|
104
|
+
<td>{{EXTERNAL_LINKS}} ({{EXTERNAL_RATIO}}%)</td>
|
|
105
|
+
</tr>
|
|
106
|
+
<tr>
|
|
107
|
+
<th>Images</th>
|
|
108
|
+
<td>{{TOTAL_IMAGES}} total ({{MISSING_ALT}} missing alt text)</td>
|
|
109
|
+
</tr>
|
|
110
|
+
</table>
|
|
111
|
+
|
|
112
|
+
<h2>Structured Data</h2>
|
|
113
|
+
<table class="data-table">
|
|
114
|
+
<tr>
|
|
115
|
+
<th>Status</th>
|
|
116
|
+
<td>
|
|
117
|
+
{{STRUCTURED_DATA_STATUS}}
|
|
118
|
+
</td>
|
|
119
|
+
</tr>
|
|
120
|
+
{{STRUCTURED_DATA_TYPES_ROW}}
|
|
121
|
+
</table>
|
|
122
|
+
</body>
|
|
123
|
+
</html>
|