@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8" />
|
|
5
|
+
<title>Crawlith Analysis Report</title>
|
|
6
|
+
<style>
|
|
7
|
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; color: #333; }
|
|
8
|
+
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
9
|
+
table { width: 100%; border-collapse: collapse; margin-top: 20px; }
|
|
10
|
+
th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; }
|
|
11
|
+
th { background-color: #f4f4f4; }
|
|
12
|
+
tr:nth-child(even) { background-color: #f9f9f9; }
|
|
13
|
+
tr:hover { background-color: #f1f1f1; }
|
|
14
|
+
</style>
|
|
15
|
+
</head>
|
|
16
|
+
<body>
|
|
17
|
+
<h1>Analysis</h1>
|
|
18
|
+
<p>Pages: {{PAGES_ANALYZED}}</p>
|
|
19
|
+
<p>Average SEO: {{AVG_SEO_SCORE}}</p>
|
|
20
|
+
<table border="1" cellspacing="0" cellpadding="6">
|
|
21
|
+
<thead>
|
|
22
|
+
<tr>
|
|
23
|
+
<th>URL</th>
|
|
24
|
+
<th>SEO Score</th>
|
|
25
|
+
<th>Thin Score</th>
|
|
26
|
+
<th>Title</th>
|
|
27
|
+
<th>Meta</th>
|
|
28
|
+
</tr>
|
|
29
|
+
</thead>
|
|
30
|
+
<tbody>
|
|
31
|
+
{{ROWS}}
|
|
32
|
+
</tbody>
|
|
33
|
+
</table>
|
|
34
|
+
</body>
|
|
35
|
+
</html>
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>Analysis for {{URL}}</title>
|
|
7
|
+
<style>
|
|
8
|
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
|
|
9
|
+
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
10
|
+
h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
|
|
11
|
+
.score-card { display: flex; gap: 20px; margin-bottom: 30px; }
|
|
12
|
+
.score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
|
|
13
|
+
.score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
|
|
14
|
+
.status-ok { color: green; font-weight: bold; }
|
|
15
|
+
.status-warning { color: orange; font-weight: bold; }
|
|
16
|
+
.status-critical { color: red; font-weight: bold; }
|
|
17
|
+
.status-missing { color: red; font-weight: bold; }
|
|
18
|
+
.data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
|
|
19
|
+
.data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
|
|
20
|
+
.data-table th { width: 150px; color: #666; }
|
|
21
|
+
code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
|
|
22
|
+
</style>
|
|
23
|
+
</head>
|
|
24
|
+
<body>
|
|
25
|
+
<h1>Page Analysis</h1>
|
|
26
|
+
<p><strong>URL:</strong> <a href="{{URL}}" target="_blank">{{URL}}</a></p>
|
|
27
|
+
|
|
28
|
+
<div class="score-card">
|
|
29
|
+
<div class="score-box">
|
|
30
|
+
<div class="score-val">{{SEO_SCORE}}</div>
|
|
31
|
+
<div>SEO Score</div>
|
|
32
|
+
</div>
|
|
33
|
+
<div class="score-box">
|
|
34
|
+
<div class="score-val">{{THIN_SCORE}}</div>
|
|
35
|
+
<div>Thin Content Score</div>
|
|
36
|
+
</div>
|
|
37
|
+
<div class="score-box">
|
|
38
|
+
<div class="score-val">{{HTTP_STATUS}}</div>
|
|
39
|
+
<div>HTTP Status</div>
|
|
40
|
+
</div>
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
<h2>Meta Tags</h2>
|
|
44
|
+
<table class="data-table">
|
|
45
|
+
<tr>
|
|
46
|
+
<th>Title</th>
|
|
47
|
+
<td>
|
|
48
|
+
<div>{{TITLE_VALUE}}</div>
|
|
49
|
+
<small>Length: {{TITLE_LENGTH}} | Status: <span class="status-{{TITLE_STATUS}}">{{TITLE_STATUS}}</span></small>
|
|
50
|
+
</td>
|
|
51
|
+
</tr>
|
|
52
|
+
<tr>
|
|
53
|
+
<th>Description</th>
|
|
54
|
+
<td>
|
|
55
|
+
<div>{{META_DESCRIPTION_VALUE}}</div>
|
|
56
|
+
<small>Length: {{META_DESCRIPTION_LENGTH}} | Status: <span class="status-{{META_DESCRIPTION_STATUS}}">{{META_DESCRIPTION_STATUS}}</span></small>
|
|
57
|
+
</td>
|
|
58
|
+
</tr>
|
|
59
|
+
<tr>
|
|
60
|
+
<th>Canonical</th>
|
|
61
|
+
<td>{{CANONICAL}}</td>
|
|
62
|
+
</tr>
|
|
63
|
+
<tr>
|
|
64
|
+
<th>Robots</th>
|
|
65
|
+
<td>
|
|
66
|
+
Index: {{ROBOTS_INDEX}},
|
|
67
|
+
Follow: {{ROBOTS_FOLLOW}}
|
|
68
|
+
</td>
|
|
69
|
+
</tr>
|
|
70
|
+
</table>
|
|
71
|
+
|
|
72
|
+
<h2>Content & Heading</h2>
|
|
73
|
+
<table class="data-table">
|
|
74
|
+
<tr>
|
|
75
|
+
<th>H1 Tag</th>
|
|
76
|
+
<td>
|
|
77
|
+
Status: <span class="status-{{H1_STATUS}}">{{H1_STATUS}}</span>
|
|
78
|
+
({{H1_COUNT}} detected)
|
|
79
|
+
{{H1_MATCHES_TITLE}}
|
|
80
|
+
</td>
|
|
81
|
+
</tr>
|
|
82
|
+
<tr>
|
|
83
|
+
<th>Word Count</th>
|
|
84
|
+
<td>{{WORD_COUNT}} words</td>
|
|
85
|
+
</tr>
|
|
86
|
+
<tr>
|
|
87
|
+
<th>Unique Sentences</th>
|
|
88
|
+
<td>{{UNIQUE_SENTENCES}}</td>
|
|
89
|
+
</tr>
|
|
90
|
+
<tr>
|
|
91
|
+
<th>Text / HTML Ratio</th>
|
|
92
|
+
<td>{{TEXT_HTML_RATIO}}%</td>
|
|
93
|
+
</tr>
|
|
94
|
+
</table>
|
|
95
|
+
|
|
96
|
+
<h2>Links & Images</h2>
|
|
97
|
+
<table class="data-table">
|
|
98
|
+
<tr>
|
|
99
|
+
<th>Internal Links</th>
|
|
100
|
+
<td>{{INTERNAL_LINKS}}</td>
|
|
101
|
+
</tr>
|
|
102
|
+
<tr>
|
|
103
|
+
<th>External Links</th>
|
|
104
|
+
<td>{{EXTERNAL_LINKS}} ({{EXTERNAL_RATIO}}%)</td>
|
|
105
|
+
</tr>
|
|
106
|
+
<tr>
|
|
107
|
+
<th>Images</th>
|
|
108
|
+
<td>{{TOTAL_IMAGES}} total ({{MISSING_ALT}} missing alt text)</td>
|
|
109
|
+
</tr>
|
|
110
|
+
</table>
|
|
111
|
+
|
|
112
|
+
<h2>Structured Data</h2>
|
|
113
|
+
<table class="data-table">
|
|
114
|
+
<tr>
|
|
115
|
+
<th>Status</th>
|
|
116
|
+
<td>
|
|
117
|
+
{{STRUCTURED_DATA_STATUS}}
|
|
118
|
+
</td>
|
|
119
|
+
</tr>
|
|
120
|
+
{{STRUCTURED_DATA_TYPES_ROW}}
|
|
121
|
+
</table>
|
|
122
|
+
</body>
|
|
123
|
+
</html>
|
|
@@ -5,6 +5,7 @@ import { ImageAltAnalysis } from './images.js';
|
|
|
5
5
|
import { LinkRatioAnalysis } from './links.js';
|
|
6
6
|
import { StructuredDataResult } from './structuredData.js';
|
|
7
7
|
import { aggregateSiteScore } from './scoring.js';
|
|
8
|
+
import { EngineContext } from '../events.js';
|
|
8
9
|
export interface CrawlPage {
|
|
9
10
|
url: string;
|
|
10
11
|
status?: number;
|
|
@@ -13,11 +14,10 @@ export interface CrawlPage {
|
|
|
13
14
|
canonical?: string;
|
|
14
15
|
noindex?: boolean;
|
|
15
16
|
nofollow?: boolean;
|
|
17
|
+
crawlStatus?: string;
|
|
16
18
|
}
|
|
17
19
|
export interface AnalyzeOptions {
|
|
18
|
-
fromCrawl?: string;
|
|
19
20
|
live?: boolean;
|
|
20
|
-
html?: boolean;
|
|
21
21
|
seo?: boolean;
|
|
22
22
|
content?: boolean;
|
|
23
23
|
accessibility?: boolean;
|
|
@@ -28,6 +28,7 @@ export interface AnalyzeOptions {
|
|
|
28
28
|
debug?: boolean;
|
|
29
29
|
clusterThreshold?: number;
|
|
30
30
|
minClusterSize?: number;
|
|
31
|
+
allPages?: boolean;
|
|
31
32
|
}
|
|
32
33
|
export interface PageAnalysis {
|
|
33
34
|
url: string;
|
|
@@ -45,6 +46,7 @@ export interface PageAnalysis {
|
|
|
45
46
|
canonical?: string;
|
|
46
47
|
noindex?: boolean;
|
|
47
48
|
nofollow?: boolean;
|
|
49
|
+
crawlStatus?: string;
|
|
48
50
|
};
|
|
49
51
|
}
|
|
50
52
|
export interface AnalysisResult {
|
|
@@ -63,8 +65,20 @@ export interface AnalysisResult {
|
|
|
63
65
|
accessibility: boolean;
|
|
64
66
|
};
|
|
65
67
|
clusters?: ClusterInfo[];
|
|
68
|
+
snapshotId?: number;
|
|
69
|
+
crawledAt?: string;
|
|
66
70
|
}
|
|
67
|
-
|
|
71
|
+
/**
|
|
72
|
+
* Analyzes a site for SEO, content, and accessibility.
|
|
73
|
+
* Supports live crawling or loading from a database snapshot.
|
|
74
|
+
* Note: File-based data loading is not supported.
|
|
75
|
+
*
|
|
76
|
+
* @param url The root URL to analyze
|
|
77
|
+
* @param options Analysis options
|
|
78
|
+
* @param context Engine context for event emission
|
|
79
|
+
*/
|
|
80
|
+
export declare function analyzeSite(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<AnalysisResult>;
|
|
68
81
|
export declare function renderAnalysisHtml(result: AnalysisResult): string;
|
|
69
82
|
export declare function renderAnalysisMarkdown(result: AnalysisResult): string;
|
|
70
83
|
export declare function renderAnalysisCsv(result: AnalysisResult): string;
|
|
84
|
+
export declare function analyzePages(rootUrl: string, pages: Iterable<CrawlPage> | CrawlPage[], robots?: any): PageAnalysis[];
|