@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,131 @@
1
+ import { Graph, GraphNode } from '../graph/graph.js';
2
+
/** Optional settings accepted by computeHITS. */
export interface HITSOptions {
  /**
   * Number of update/normalise rounds to run.
   * computeHITS uses 20 when this is omitted (or 0).
   */
  iterations?: number;
}
6
+
/**
 * Runs the HITS (hubs and authorities) algorithm over the link graph and
 * writes `authorityScore`, `hubScore` and `linkRole` onto every eligible node.
 *
 * A node is eligible when its status is 200, it has no redirect chain and it
 * is not flagged noindex. Only edges that connect two distinct eligible nodes
 * take part. Nodes that are not eligible are left untouched, and nothing
 * happens at all when no node is eligible.
 *
 * @param graph - Graph whose nodes are scored in place.
 * @param options - Optional settings; `iterations` falls back to 20 when it
 *   is missing or 0.
 */
export function computeHITS(graph: Graph, options: HITSOptions = {}): void {
  const rounds = options.iterations || 20;

  // Step 1: keep only the nodes that may carry a score.
  const candidates = graph.getNodes().filter((n) => {
    const hasRedirects = Boolean(n.redirectChain) && n.redirectChain!.length !== 0;
    return n.status === 200 && !hasRedirects && !n.noindex;
  });

  if (candidates.length === 0) {
    return;
  }

  // Step 2: index by URL and start every node at hub = authority = 1.
  const byUrl = new Map<string, GraphNode>();
  candidates.forEach((n) => {
    byUrl.set(n.url, n);
    n.authorityScore = 1.0;
    n.hubScore = 1.0;
  });

  // Adjacency lists limited to edges between two different eligible nodes.
  const inbound = new Map<string, { source: string, weight: number }[]>();
  const outbound = new Map<string, { target: string, weight: number }[]>();

  for (const edge of graph.getEdges()) {
    const usable =
      edge.source !== edge.target &&
      byUrl.has(edge.source) &&
      byUrl.has(edge.target);
    if (!usable) {
      continue;
    }

    const intoTarget = inbound.get(edge.target);
    if (intoTarget) {
      intoTarget.push({ source: edge.source, weight: edge.weight });
    } else {
      inbound.set(edge.target, [{ source: edge.source, weight: edge.weight }]);
    }

    const outOfSource = outbound.get(edge.source);
    if (outOfSource) {
      outOfSource.push({ target: edge.target, weight: edge.weight });
    } else {
      outbound.set(edge.source, [{ target: edge.target, weight: edge.weight }]);
    }
  }

  // Step 3: alternate authority and hub updates, L2-normalising after each.
  for (let round = 0; round < rounds; round++) {
    // Authority = weighted sum of the hub scores of the pages linking in.
    let authSquares = 0;
    for (const n of candidates) {
      let sum = 0;
      for (const link of inbound.get(n.url) || []) {
        sum += (byUrl.get(link.source)!.hubScore || 0) * link.weight;
      }
      n.authorityScore = sum;
      authSquares += sum * sum;
    }

    const authNorm = Math.sqrt(authSquares);
    if (authNorm > 0) {
      for (const n of candidates) {
        n.authorityScore = (n.authorityScore || 0) / authNorm;
      }
    }

    // Hub = weighted sum of the (just normalised) authority scores linked to.
    let hubSquares = 0;
    for (const n of candidates) {
      let sum = 0;
      for (const link of outbound.get(n.url) || []) {
        sum += (byUrl.get(link.target)!.authorityScore || 0) * link.weight;
      }
      n.hubScore = sum;
      hubSquares += sum * sum;
    }

    const hubNorm = Math.sqrt(hubSquares);
    if (hubNorm > 0) {
      for (const n of candidates) {
        n.hubScore = (n.hubScore || 0) / hubNorm;
      }
    }
  }

  // Step 4: turn the final scores into a link role per node.
  classifyLinkRoles(candidates);
}
101
+
/**
 * Sets `linkRole` on every node from its hub and authority scores.
 *
 * A score is "high" when it is strictly greater than the median of that score
 * over `nodes` (the value at index floor(n / 2) of the ascending list) and
 * also greater than 0.0001. Roles:
 *   - 'power'      both scores high
 *   - 'authority'  only the authority score high
 *   - 'hub'        only the hub score high
 *   - 'balanced'   neither high, but both above 0.0001
 *   - 'peripheral' everything else
 *
 * @param nodes - Nodes to label in place; missing scores count as 0.
 */
function classifyLinkRoles(nodes: GraphNode[]): void {
  if (nodes.length === 0) return;

  const minScore = 0.0001;
  const ascending = (a: number, b: number): number => a - b;
  const middle = Math.floor(nodes.length / 2);

  // Median cut-offs (upper median for an even number of nodes).
  const authCutoff = nodes.map((n) => n.authorityScore || 0).sort(ascending)[middle];
  const hubCutoff = nodes.map((n) => n.hubScore || 0).sort(ascending)[middle];

  for (const node of nodes) {
    const auth = node.authorityScore || 0;
    const hub = node.hubScore || 0;

    const authPresent = auth > minScore;
    const hubPresent = hub > minScore;
    const highAuth = authPresent && auth > authCutoff;
    const highHub = hubPresent && hub > hubCutoff;

    if (highAuth) {
      node.linkRole = highHub ? 'power' : 'authority';
    } else if (highHub) {
      node.linkRole = 'hub';
    } else {
      node.linkRole = authPresent && hubPresent ? 'balanced' : 'peripheral';
    }
  }
}
@@ -0,0 +1,176 @@
/** Kinds of orphan that annotateOrphans can assign to a page. */
export type OrphanType =
  | 'hard'
  | 'near'
  | 'soft'
  | 'crawl-only';

/** Impact bucket that mapImpactLevel derives from a severity score. */
export type ImpactLevel =
  | 'low'
  | 'medium'
  | 'high'
  | 'critical';
3
+
/** A page as consumed by the orphan detection and severity scoring below. */
export interface SitegraphNode {
  /** Page URL; edges refer to pages by this value. */
  url: string;
  /** Depth of the page; 0 marks the homepage when no rootUrl option is set, and depth <= 2 raises severity. */
  depth: number;
  /** Inbound link count; summed across pages that share a canonical URL. */
  inLinks: number;
  /** Outbound link count (not read by the scoring in this file). */
  outLinks: number;
  /** HTTP status (not read by the scoring in this file). */
  status: number;
  /** When true, a page with no inbound links is typed 'crawl-only' instead of 'hard'. */
  discoveredViaSitemap?: boolean;
  /** When true, the page is never reported as an orphan. */
  robotsExcluded?: boolean;
  /** Canonical URL under which inbound links are pooled; defaults to `url`. */
  canonicalUrl?: string;
  /** When true, the page is treated as the homepage and never reported as an orphan. */
  isHomepage?: boolean;
  /** Word count; more than 800 raises severity, 1 to 299 lowers it. */
  wordCount?: number;
  /** Raises severity when true. */
  hasStructuredData?: boolean;
  /** Page type label, compared case-insensitively against the low-value and archive/pagination types. */
  pageType?: string;
  /** Lowers severity and marks the page as a low-value link source when true. */
  noindex?: boolean;
  /** Lowers severity when true. */
  duplicateContent?: boolean;
  /** Raises severity when true. */
  isProductOrCommercial?: boolean;
}
21
+
/** A link between two pages, identified by their URLs. */
export interface SitegraphEdge {
  /** URL of the page that contains the link. */
  source: string;
  /** URL of the page the link points to. */
  target: string;
}
26
+
/** Switches and thresholds that control annotateOrphans. */
export interface OrphanScoringOptions {
  /** When false, every page is returned with `orphan: false`. */
  enabled: boolean;
  /** When false, orphans are typed but get no `orphanSeverity` or `impactLevel`. */
  severityEnabled: boolean;
  /** When true, pages linked only from low-value pages may be typed 'soft'. */
  includeSoftOrphans: boolean;
  /** Pages with at least one and at most this many consolidated inbound links are typed 'near'. */
  minInbound: number;
  /** URL treated as the homepage; without it, depth 0 identifies the homepage. */
  rootUrl?: string;
}
34
+
/** A SitegraphNode plus the orphan verdict produced by annotateOrphans. */
export type AnnotatedNode = SitegraphNode & {
  /** True when the page was classified as any kind of orphan. */
  orphan: boolean;
  /** Set only when `orphan` is true. */
  orphanType?: OrphanType;
  /** 0 to 100 score; set only for orphans when severity scoring is enabled. */
  orphanSeverity?: number;
  /** Bucket for `orphanSeverity`; set together with it. */
  impactLevel?: ImpactLevel;
};
41
+
/**
 * URL patterns that mark a page as a low-value link source.
 * All patterns are case-insensitive and carry no global flag, so they can be
 * reused with `.test()` safely.
 */
const LOW_VALUE_PATTERNS = [
  // Numeric pagination parameter: ?page=2 or &p=3
  /[?&](page|p)=\d+/i,
  // Listing path segments: /page/, /tag/, /tags/, /category/, /categories/
  /\/(page|tag|tags|category|categories)\//i,
  // Search, filter or sort query parameters
  /[?&](q|query|search|filter|sort)=/i,
  // A /search path segment followed by '/', '?' or the end of the URL
  /\/search(\/|\?|$)/i
];
48
+
/**
 * Tells whether a page counts as a low-value link source.
 *
 * A page is low-value when its page type (case-insensitive) is one of
 * pagination, tag, category, filter, search or archive, when it is noindex,
 * or when its URL matches any of LOW_VALUE_PATTERNS.
 *
 * @param node - Page to inspect.
 * @returns true when the page is low-value.
 */
function isLowValuePage(node: SitegraphNode): boolean {
  const lowValueTypes = ['pagination', 'tag', 'category', 'filter', 'search', 'archive'];
  const normalizedType = (node.pageType || '').toLowerCase();

  const byType = lowValueTypes.includes(normalizedType);
  if (byType || node.noindex) {
    return true;
  }

  for (const pattern of LOW_VALUE_PATTERNS) {
    if (pattern.test(node.url)) {
      return true;
    }
  }
  return false;
}
59
+
/**
 * Rounds a score to the nearest integer and limits it to the range 0..100.
 *
 * @param score - Raw score.
 * @returns The rounded score, no lower than 0 and no higher than 100.
 */
function clampScore(score: number): number {
  const rounded = Math.round(score);
  const capped = Math.min(100, rounded);
  return Math.max(0, capped);
}
63
+
/**
 * Converts a severity score into an impact bucket.
 *
 * Scores up to 39 are 'low', up to 69 'medium', up to 89 'high', and anything
 * else is 'critical'.
 *
 * @param score - Severity score.
 * @returns The matching impact level.
 */
export function mapImpactLevel(score: number): ImpactLevel {
  // Inclusive upper bound of each bucket, lowest first.
  const bands: Array<[number, ImpactLevel]> = [
    [39, 'low'],
    [69, 'medium'],
    [89, 'high'],
  ];

  for (const [upperBound, level] of bands) {
    if (score <= upperBound) {
      return level;
    }
  }
  return 'critical';
}
70
+
/**
 * Scores how severe an orphan page is, on a 0 to 100 scale.
 *
 * The base score comes from the orphan type: hard 90, crawl-only 80, near 70
 * (60 when the page has more than one inbound link), soft 50. Each positive
 * signal adds 10, capped at 20 in total; any negative signal subtracts 20
 * (also capped at 20 in total). The result is rounded and clamped to 0..100.
 *
 * @param orphanType - Classification of the page.
 * @param node - Page being scored; `inLinks` is only read for 'near' orphans.
 * @returns Severity score between 0 and 100.
 */
export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number {
  const words = node.wordCount || 0;
  const kind = (node.pageType || '').toLowerCase();

  let base = 0;
  if (orphanType === 'hard') {
    base = 90;
  } else if (orphanType === 'crawl-only') {
    base = 80;
  } else if (orphanType === 'near') {
    base = node.inLinks <= 1 ? 70 : 60;
  } else if (orphanType === 'soft') {
    base = 50;
  }

  // Signals that make losing the page more costly.
  const boosts = [
    words > 800,
    Boolean(node.hasStructuredData),
    node.depth <= 2,
    Boolean(node.isProductOrCommercial),
  ];
  const bonus = Math.min(20, boosts.filter(Boolean).length * 10);

  // Signals that make the page less worth rescuing.
  const penalties = [
    words > 0 && words < 300,
    Boolean(node.noindex),
    Boolean(node.duplicateContent),
    kind === 'archive' || kind === 'pagination',
  ];
  const penalty = Math.min(20, penalties.filter(Boolean).length * 20);

  return clampScore(base + bonus - penalty);
}
108
+
/**
 * Adds up inbound link counts per canonical URL.
 *
 * Each page contributes its `inLinks` to its `canonicalUrl`, or to its own
 * `url` when it declares no canonical.
 *
 * @param nodes - Pages to aggregate.
 * @returns Map from canonical URL to the total inbound link count.
 */
function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, number> {
  return nodes.reduce((totals, { canonicalUrl, url, inLinks }) => {
    const key = canonicalUrl || url;
    const soFar = totals.get(key) || 0;
    return totals.set(key, soFar + inLinks);
  }, new Map<string, number>());
}
117
+
/**
 * Marks orphan pages and, when enabled, scores their severity.
 *
 * Inbound links are pooled per canonical URL first. A page is then typed:
 *   - 'hard'       no pooled inbound links and not found via sitemap
 *   - 'crawl-only' no pooled inbound links but found via sitemap
 *   - 'near'       at least one and at most `options.minInbound` pooled inbound links
 *   - 'soft'       (only with `includeSoftOrphans`) at least one known page links
 *                  to this URL and every such page is low-value
 * The homepage and robots-excluded pages are never orphans.
 *
 * @param nodes - Pages to annotate; the input objects are not modified.
 * @param edges - Links between pages; only read for the soft-orphan check.
 * @param options - Feature switches and thresholds.
 * @returns One annotated copy per input page, in input order.
 */
export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
  if (!options.enabled) {
    return nodes.map((node) => ({ ...node, orphan: false }));
  }

  const canonicalInbound = consolidateInboundByCanonical(nodes);

  // Index the known linking pages per target URL once, so the soft-orphan
  // check below is a map lookup instead of a scan of every edge per page.
  // Edges whose source is not one of `nodes` are ignored.
  const sourcesByTarget = new Map<string, SitegraphNode[]>();
  if (options.includeSoftOrphans) {
    const nodeByUrl = new Map<string, SitegraphNode>();
    for (const node of nodes) {
      nodeByUrl.set(node.url, node);
    }
    for (const edge of edges) {
      const source = nodeByUrl.get(edge.source);
      if (!source) continue;
      const known = sourcesByTarget.get(edge.target);
      if (known) {
        known.push(source);
      } else {
        sourcesByTarget.set(edge.target, [source]);
      }
    }
  }

  return nodes.map((node) => {
    const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0);
    if (isHomepage || node.robotsExcluded) {
      return { ...node, orphan: false };
    }

    const canonical = node.canonicalUrl || node.url;
    const inbound = canonicalInbound.get(canonical) || 0;

    let orphanType: OrphanType | undefined;

    if (inbound === 0) {
      orphanType = node.discoveredViaSitemap ? 'crawl-only' : 'hard';
    } else if (inbound <= options.minInbound) {
      orphanType = 'near';
    }

    if (!orphanType && options.includeSoftOrphans && inbound > 0) {
      const inboundSources = sourcesByTarget.get(node.url) || [];
      if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
        orphanType = 'soft';
      }
    }

    if (!orphanType) {
      return { ...node, orphan: false };
    }

    if (!options.severityEnabled) {
      return {
        ...node,
        orphan: true,
        orphanType
      };
    }

    // Score against the pooled inbound count rather than the page's own.
    const orphanSeverity = calculateOrphanSeverity(orphanType, { ...node, inLinks: inbound });

    return {
      ...node,
      orphan: true,
      orphanType,
      orphanSeverity,
      impactLevel: mapImpactLevel(orphanSeverity)
    };
  });
}
@@ -0,0 +1,18 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { fileURLToPath } from 'node:url';
3
+ import { dirname, join } from 'node:path';
4
+
// Directory of this module, reconstructed because ES modules have no __dirname.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Fallback reported when package.json is missing, unreadable or has no usable version.
let version = '0.0.1';

try {
  // package.json sits two directories above this module.
  const pkgPath = join(__dirname, '../../package.json');
  const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
  // Only a non-empty string may replace the fallback; otherwise a manifest
  // without a "version" field would export `undefined`.
  if (typeof pkg.version === 'string' && pkg.version.length > 0) {
    version = pkg.version;
  }
} catch {
  // Best effort: keep the fallback when the file cannot be read or parsed.
}

export { version };
@@ -0,0 +1,49 @@
1
+ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2
+
3
+ exports[`orphan detection and severity scoring > canonical consolidation, robots exclusion, and deterministic JSON output snapshot 1`] = `
4
+ "[
5
+ {
6
+ "url": "https://example.com/canonical",
7
+ "depth": 1,
8
+ "inLinks": 0,
9
+ "outLinks": 0,
10
+ "status": 200,
11
+ "orphan": true,
12
+ "orphanType": "near",
13
+ "orphanSeverity": 80,
14
+ "impactLevel": "high"
15
+ },
16
+ {
17
+ "url": "https://example.com/variant?a=1",
18
+ "depth": 1,
19
+ "inLinks": 1,
20
+ "outLinks": 0,
21
+ "status": 200,
22
+ "canonicalUrl": "https://example.com/canonical",
23
+ "orphan": true,
24
+ "orphanType": "near",
25
+ "orphanSeverity": 80,
26
+ "impactLevel": "high"
27
+ },
28
+ {
29
+ "url": "https://example.com/blocked",
30
+ "depth": 1,
31
+ "inLinks": 0,
32
+ "outLinks": 0,
33
+ "status": 200,
34
+ "robotsExcluded": true,
35
+ "orphan": false
36
+ },
37
+ {
38
+ "url": "https://example.com/redirect-target",
39
+ "depth": 1,
40
+ "inLinks": 1,
41
+ "outLinks": 0,
42
+ "status": 200,
43
+ "orphan": true,
44
+ "orphanType": "near",
45
+ "orphanSeverity": 80,
46
+ "impactLevel": "high"
47
+ }
48
+ ]"
49
+ `;
@@ -0,0 +1,98 @@
1
+ import { describe, expect, test } from 'vitest';
2
+ import { analyzeTitle, analyzeMetaDescription, applyDuplicateStatuses, analyzeH1 } from '../src/analysis/seo.js';
3
+ import { analyzeContent, calculateThinContentScore } from '../src/analysis/content.js';
4
+ import { analyzeStructuredData } from '../src/analysis/structuredData.js';
5
+ import { analyzeLinks } from '../src/analysis/links.js';
6
+ import { analyzeImageAlts } from '../src/analysis/images.js';
7
+
// Unit tests for the title, meta description, duplicate and H1 analysers.
describe('SEO module', () => {
  test('analyze title edge cases', () => {
    // [html, expected status]
    const cases: Array<[string, string]> = [
      ['<html></html>', 'missing'],
      ['<title>short</title>', 'too_short'],
      [`<title>${'a'.repeat(61)}</title>`, 'too_long'],
      [`<title>${'a'.repeat(55)}</title>`, 'ok'],
    ];
    for (const [html, expected] of cases) {
      expect(analyzeTitle(html).status).toBe(expected);
    }
  });

  test('duplicate detection', () => {
    // The first two values differ only by case; the third has no value.
    const [first, second, third] = applyDuplicateStatuses([
      { value: 'Same', length: 4, status: 'ok' as const },
      { value: 'same', length: 4, status: 'ok' as const },
      { value: null, length: 0, status: 'missing' as const }
    ]);
    expect(first.status).toBe('duplicate');
    expect(second.status).toBe('duplicate');
    expect(third.status).toBe('missing');
  });

  test('meta description boundaries', () => {
    // [html, expected status]
    const cases: Array<[string, string]> = [
      ['<meta name="description" content="">', 'missing'],
      ['<html></html>', 'missing'],
      ['<meta name="description" content="short">', 'too_short'],
      [`<meta name="description" content="${'x'.repeat(150)}">`, 'ok'],
      [`<meta name="description" content="${'x'.repeat(170)}">`, 'too_long'],
    ];
    for (const [html, expected] of cases) {
      expect(analyzeMetaDescription(html).status).toBe(expected);
    }
  });

  test('h1 variations', () => {
    const single = analyzeH1('<h1>One</h1>', 'Title');
    expect(single.status).toBe('ok');

    const double = analyzeH1('<h1>One</h1><h1>Two</h1>', 'Title');
    expect(double.status).toBe('warning');

    const absent = analyzeH1('<p>none</p>', 'Title');
    expect(absent.status).toBe('critical');

    // The H1 and the title differ only by case here.
    const sameAsTitle = analyzeH1('<h1>same</h1>', 'Same');
    expect(sameAsTitle.matchesTitle).toBe(true);
  });
});
44
+
// Unit tests for word counting and the thin-content score.
describe('content module', () => {
  test('word count strips nav/footer/script/style', () => {
    const html = '<body><nav>skip me</nav><p>keep words here</p><footer>skip</footer><script>var x</script><style>.x{}</style></body>';
    const { wordCount, uniqueSentenceCount, textHtmlRatio } = analyzeContent(html);
    // Only the three words inside <p> are expected to be counted.
    expect(wordCount).toBe(3);
    expect(uniqueSentenceCount).toBe(1);
    expect(textHtmlRatio).toBeGreaterThan(0);
  });

  test('thin score boundaries', () => {
    const richPage = { wordCount: 600, textHtmlRatio: 0.5, uniqueSentenceCount: 4 };
    const emptyPage = { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 1 };
    expect(calculateThinContentScore(richPage, 0)).toBe(0);
    expect(calculateThinContentScore(emptyPage, 100)).toBe(100);
  });

  test('content handles malformed/empty html', () => {
    const fromEmpty = analyzeContent('');
    expect(fromEmpty.wordCount).toBe(0);

    // Unclosed tags must still yield the text they contain.
    const fromBroken = analyzeContent('<div><span>broken');
    expect(fromBroken.wordCount).toBeGreaterThanOrEqual(1);
  });
});
64
+
// Unit tests for JSON-LD detection and validation.
describe('structured data', () => {
  test('valid and invalid JSON-LD parsing', () => {
    // Well-formed JSON-LD: present, valid, and its @type is reported.
    const wellFormed = analyzeStructuredData('<script type="application/ld+json">{"@type":"Article"}</script>');
    expect(wellFormed.present).toBe(true);
    expect(wellFormed.valid).toBe(true);
    expect(wellFormed.types).toContain('Article');

    // Unparseable JSON-LD: the block is present but not valid.
    const malformed = analyzeStructuredData('<script type="application/ld+json">{invalid}</script>');
    expect(malformed.present).toBe(true);
    expect(malformed.valid).toBe(false);

    // No JSON-LD block at all.
    const absent = analyzeStructuredData('<p>none</p>');
    expect(absent.present).toBe(false);
  });
});
80
+
// Unit tests for link classification and image alt-text checks.
describe('links and images', () => {
  test('link ratio calculation', () => {
    // One internal link and two external links, one of them nofollow.
    const html = '<a href="/a">A</a><a href="https://other.com">B</a><a href="https://other.com" rel="nofollow">C</a>';
    const { internalLinks, externalLinks, nofollowCount, externalRatio } =
      analyzeLinks(html, 'https://example.com/page', 'https://example.com');
    expect(internalLinks).toBe(1);
    expect(externalLinks).toBe(2);
    expect(nofollowCount).toBe(1);
    expect(externalRatio).toBeCloseTo(2 / 3);
  });

  test('image alt detection', () => {
    // One image without alt, one with an empty alt, one with a real alt.
    const html = '<img src="a"><img src="b" alt=""><img src="c" alt="ok">';
    const { totalImages, missingAlt, emptyAlt } = analyzeImageAlts(html);
    expect(totalImages).toBe(3);
    expect(missingAlt).toBe(1);
    expect(emptyAlt).toBe(1);
  });
});
@@ -0,0 +1,98 @@
1
+ import { describe, expect, test } from 'vitest';
2
+ import path from 'node:path';
3
+ import fs from 'node:fs/promises';
4
+ import { analyzeSite, renderAnalysisHtml } from '../src/analysis/analyze.js';
5
+
// Integration tests for analyzeSite, driven by a crawl fixture on disk and by
// an in-memory database.
describe('analyze integration', () => {
  const fixturePath = path.resolve(import.meta.dirname, 'fixtures/analyze-crawl.json');

  test('analyzes full crawl fixture and schema', async () => {
    const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });

    expect(result.site_summary.pages_analyzed).toBe(3);
    expect(result.site_summary.duplicate_titles).toBe(2);
    expect(result.site_summary.avg_seo_score).toBeGreaterThanOrEqual(0);
    expect(result.pages[0]).toHaveProperty('title');
    expect(result.pages[0]).toHaveProperty('content');
    expect(result.pages[0]).toHaveProperty('links');
    expect(result.site_scores.overallScore).toBeGreaterThanOrEqual(0);
    expect(result.site_scores.overallScore).toBeLessThanOrEqual(100);
  });

  test('module filter flags behavior', async () => {
    // With a single module selected, the other modules report empty results.
    const seoOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, seo: true });
    expect(seoOnly.pages[0].content.wordCount).toBe(0);
    expect(seoOnly.pages[0].images.totalImages).toBe(0);

    const contentOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, content: true });
    expect(contentOnly.pages[0].title.status).toBe('missing');
    expect(contentOnly.pages[0].thinScore).toBeGreaterThanOrEqual(0);

    const accessibilityOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, accessibility: true });
    expect(accessibilityOnly.pages[0].images.totalImages).toBeGreaterThan(0);
    expect(accessibilityOnly.pages[0].title.status).toBe('missing');
  });

  test('html report generation', async () => {
    const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
    const html = renderAnalysisHtml(result);
    expect(html).toContain('<table');
    expect(html).toContain('Analysis');
  });

  test('default database loading', async () => {
    // Force an in-memory DB for this test and remember any value that was
    // already set, so it can be restored even when setup fails.
    const previousDbPath = process.env.CRAWLITH_DB_PATH;
    process.env.CRAWLITH_DB_PATH = ':memory:';

    try {
      const { getDb, closeDb } = await import('../src/db/index.js');
      // Close any existing connection to ensure a fresh start.
      closeDb();

      try {
        // Set up repositories.
        const { SiteRepository } = await import('../src/db/repositories/SiteRepository.js');
        const { SnapshotRepository } = await import('../src/db/repositories/SnapshotRepository.js');
        const { PageRepository } = await import('../src/db/repositories/PageRepository.js');

        const db = getDb();
        const siteRepo = new SiteRepository(db);
        const snapshotRepo = new SnapshotRepository(db);
        const pageRepo = new PageRepository(db);

        // Create the site and a snapshot to attach pages to.
        const siteId = siteRepo.createSite('example.com');
        const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');

        // Load the pages of the JSON fixture into the database.
        const fixtureJson = await fs.readFile(fixturePath, 'utf-8');
        const fixture = JSON.parse(fixtureJson);
        (fixture.pages || fixture.nodes).forEach((p: any) => {
          pageRepo.upsertPage({
            site_id: siteId,
            normalized_url: p.url,
            last_seen_snapshot_id: snapshotId,
            http_status: p.status || 200,
            html: p.html || '',
            depth: p.depth || 0,
          });
        });

        snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 3, edge_count: 0 });

        // No fromCrawl option: analyzeSite has to read from the database.
        const result = await analyzeSite('https://example.com', {});
        expect(result.site_summary.pages_analyzed).toBe(3);
      } finally {
        closeDb();
      }
    } finally {
      if (previousDbPath === undefined) {
        delete process.env.CRAWLITH_DB_PATH;
      } else {
        process.env.CRAWLITH_DB_PATH = previousDbPath;
      }
    }
  });

  test('handles large html and js-only content', async () => {
    const hugeText = '<html><body><script>document.write("x")</script>' + '<p>word </p>'.repeat(1000) + '</body></html>';
    const tmpFile = path.resolve(import.meta.dirname, 'fixtures/large-analyze.json');

    try {
      await fs.writeFile(tmpFile, JSON.stringify({ pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] }));
      const result = await analyzeSite('https://example.com', { fromCrawl: tmpFile });
      expect(result.pages[0].content.wordCount).toBe(1000);
    } finally {
      // Always remove the generated fixture, even when an assertion fails;
      // `force` keeps cleanup quiet if the file was never written.
      await fs.rm(tmpFile, { force: true });
    }
  });
});
@@ -0,0 +1,31 @@
1
+ import { describe, it, expect, vi } from 'vitest';
2
+ import { resolveDns } from '../../src/audit/dns.js';
3
+ import dns from 'node:dns/promises';
4
+
5
+ vi.mock('node:dns/promises');
6
+
// Tests for resolveDns with every node:dns/promises lookup stubbed out.
describe('DNS Diagnostics', () => {
  it('should resolve all records', async () => {
    // A and AAAA succeed, CNAME has no data, reverse lookup succeeds.
    vi.spyOn(dns, 'resolve4').mockResolvedValue(['1.1.1.1']);
    vi.spyOn(dns, 'resolve6').mockResolvedValue(['2606::1']);
    vi.spyOn(dns, 'resolveCname').mockRejectedValue(new Error('ENODATA'));
    vi.spyOn(dns, 'reverse').mockResolvedValue(['one.one.one.one']);

    const { a, aaaa, ipv6Support, reverse, resolutionTime } = await resolveDns('example.com');

    expect(a).toEqual(['1.1.1.1']);
    expect(aaaa).toEqual(['2606::1']);
    expect(ipv6Support).toBe(true);
    expect(reverse).toEqual(['one.one.one.one']);
    expect(resolutionTime).toBeGreaterThanOrEqual(0);
  });

  it('should handle failures gracefully', async () => {
    // Every forward lookup fails; resolveDns must still return a result.
    vi.spyOn(dns, 'resolve4').mockRejectedValue(new Error('ENOTFOUND'));
    vi.spyOn(dns, 'resolve6').mockRejectedValue(new Error('ENOTFOUND'));
    vi.spyOn(dns, 'resolveCname').mockRejectedValue(new Error('ENOTFOUND'));

    const { a, ipCount } = await resolveDns('invalid.com');

    expect(a).toEqual([]);
    expect(ipCount).toBe(0);
  });
});
@@ -0,0 +1,45 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { analyzeHeaders } from '../../src/audit/headers.js';
3
+
// Tests for the security-header analyser.
describe('Headers Analysis', () => {
  it('should detect all secure headers', () => {
    const headers = {
      'strict-transport-security': 'max-age=31536000; includeSubDomains',
      'content-security-policy': "default-src 'self'",
      'x-frame-options': 'DENY',
      'x-content-type-options': 'nosniff',
      'referrer-policy': 'strict-origin-when-cross-origin',
      'permissions-policy': 'geolocation=()'
    };
    const { score, strictTransportSecurity } = analyzeHeaders(headers);
    expect(score).toBe(100);
    expect(strictTransportSecurity.valid).toBe(true);
  });

  it('should handle missing headers', () => {
    const headers = {};
    const { score, strictTransportSecurity } = analyzeHeaders(headers);
    expect(score).toBe(0);
    expect(strictTransportSecurity.present).toBe(false);
  });

  it('should validate HSTS properly', () => {
    const headers = {
      'strict-transport-security': 'max-age=0'
    };
    const { strictTransportSecurity } = analyzeHeaders(headers);
    // A header that carries max-age counts as valid, whatever its value.
    expect(strictTransportSecurity.valid).toBe(true);
    // Leaving out includeSubDomains is reported as an issue, not as invalid.
    expect(strictTransportSecurity.issues).toContain('Missing includeSubDomains');
  });

  it('should validate invalid HSTS', () => {
    const headers = {
      'strict-transport-security': 'invalid'
    };
    const { strictTransportSecurity } = analyzeHeaders(headers);
    expect(strictTransportSecurity.valid).toBe(false);
  });
});