@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Supported heading levels within HTML content.
3
+ */
4
+ export type HeadingLevel = 1 | 2 | 3 | 4 | 5 | 6;
5
+ /**
6
+ * Represents a normalized heading node extracted from the DOM.
7
+ */
8
+ export interface HeadingNode {
9
+ level: HeadingLevel;
10
+ text: string;
11
+ index: number;
12
+ parentIndex?: number;
13
+ }
14
+ /**
15
+ * Represents content statistics for a section under a heading.
16
+ */
17
+ export interface SectionMetrics {
18
+ headingIndex: number;
19
+ headingText: string;
20
+ words: number;
21
+ keywordConcentration: number;
22
+ thin: boolean;
23
+ duplicateRisk: number;
24
+ }
25
+ /**
26
+ * Raw heading analysis generated for a single URL.
27
+ */
28
+ export interface LocalPageAnalysis {
29
+ url: string;
30
+ headingNodes: HeadingNode[];
31
+ sections: SectionMetrics[];
32
+ h1Norm: string;
33
+ h2SetHash: string;
34
+ patternHash: string;
35
+ issues: string[];
36
+ metrics: {
37
+ entropy: number;
38
+ maxDepth: number;
39
+ avgDepth: number;
40
+ headingDensity: number;
41
+ fragmentation: number;
42
+ levelVolatility: number;
43
+ hierarchySkips: number;
44
+ reverseJumps: number;
45
+ missingH1: number;
46
+ multipleH1: number;
47
+ };
48
+ }
49
+ /**
50
+ * Final heading-health payload attached to a page node.
51
+ */
52
+ export interface HeadingHealthPayload {
53
+ score: number;
54
+ status: 'Healthy' | 'Moderate' | 'Poor';
55
+ issues: string[];
56
+ map: HeadingNode[];
57
+ missing_h1: number;
58
+ multiple_h1: number;
59
+ entropy: number;
60
+ max_depth: number;
61
+ avg_depth: number;
62
+ heading_density: number;
63
+ fragmentation: number;
64
+ volatility: number;
65
+ hierarchy_skips: number;
66
+ reverse_jumps: number;
67
+ thin_sections: number;
68
+ duplicate_h1_group: number;
69
+ similar_h1_group: number;
70
+ identical_h2_set_group: number;
71
+ duplicate_pattern_group: number;
72
+ template_risk: number;
73
+ }
74
+ /**
75
+ * Snapshot-level summary emitted by the plugin.
76
+ */
77
+ export interface HeadingHealthSummary {
78
+ avgScore: number;
79
+ evaluatedPages: number;
80
+ totalMissing: number;
81
+ totalMultiple: number;
82
+ totalSkips: number;
83
+ totalReverseJumps: number;
84
+ totalThinSections: number;
85
+ avgEntropy: number;
86
+ poorPages: number;
87
+ }
88
+ /**
89
+ * Calculates token-level Jaccard similarity between two text values.
90
+ */
91
+ export declare function jaccardSimilarity(a: string, b: string): number;
92
+ /**
93
+ * Performs per-page heading extraction and structural scoring signal generation.
94
+ */
95
+ export declare function analyzeHeadingHealth(html: string, fallbackTitle?: string): LocalPageAnalysis;
96
+ /**
97
+ * Enriches section-level duplicate risk by comparing normalized section signatures across pages.
98
+ */
99
+ export declare function enrichDuplicateRisk(pages: LocalPageAnalysis[]): void;
100
+ /**
101
+ * Coordinates heading analysis across graph nodes and builds report-safe payloads.
102
+ */
103
+ export declare class HeadingHealthService {
104
+ /**
105
+ * Builds page-level payloads plus a snapshot summary for every eligible node.
106
+ */
107
+ evaluateNodes(nodes: Array<Record<string, any>>): {
108
+ payloadsByUrl: Map<string, HeadingHealthPayload>;
109
+ summary: HeadingHealthSummary;
110
+ };
111
+ private collectAnalyses;
112
+ private buildBuckets;
113
+ private computeSimilarH1GroupSizes;
114
+ private scoreHealth;
115
+ private computeTemplateRisk;
116
+ }
@@ -0,0 +1,356 @@
1
+ import { createHash } from 'node:crypto';
2
+ const STOPWORDS = new Set(['the', 'and', 'for', 'with', 'from', 'that', 'this', 'your', 'about', 'into', 'over', 'under', 'are', 'was', 'were', 'can', 'has', 'have', 'had', 'you', 'our', 'out', 'all']);
3
+ const THIN_SECTION_WORDS = 80;
4
+ const HEADING_PATTERN = /<h([1-6])\b[^<>]*>([\s\S]*?)<\/h\1>/gi;
5
+ const TITLE_PATTERN = /<title\b[^<>]*>([\s\S]*?)<\/title>/i;
6
+ const normalizeText = (input) => input.replace(/<[^<>]*>/g, ' ').replace(/&nbsp;/g, ' ').replace(/&amp;/g, '&').replace(/\s+/g, ' ').trim();
7
+ const normalizeComparable = (input) => normalizeText(input).toLowerCase();
8
+ const tokenize = (input) => normalizeComparable(input).replace(/[^a-z0-9\s]/g, ' ').split(/\s+/).filter((token) => token.length > 2 && !STOPWORDS.has(token));
9
+ const stableHash = (input) => createHash('sha1').update(input).digest('hex').slice(0, 16);
10
+ /**
11
+ * Calculates token-level Jaccard similarity between two text values.
12
+ */
13
+ export function jaccardSimilarity(a, b) {
14
+ const aSet = new Set(tokenize(a));
15
+ const bSet = new Set(tokenize(b));
16
+ if (!aSet.size || !bSet.size) {
17
+ return 0;
18
+ }
19
+ let intersection = 0;
20
+ for (const token of aSet) {
21
+ if (bSet.has(token)) {
22
+ intersection += 1;
23
+ }
24
+ }
25
+ return intersection / (aSet.size + bSet.size - intersection);
26
+ }
27
+ /**
28
+ * Performs per-page heading extraction and structural scoring signal generation.
29
+ */
30
+ export function analyzeHeadingHealth(html, fallbackTitle) {
31
+ const segments = [];
32
+ for (const match of html.matchAll(HEADING_PATTERN)) {
33
+ const level = Number(match[1]);
34
+ segments.push({
35
+ level,
36
+ text: normalizeText(match[2] || ''),
37
+ start: match.index || 0,
38
+ end: (match.index || 0) + match[0].length
39
+ });
40
+ }
41
+ const headingNodes = [];
42
+ const stack = [];
43
+ segments.forEach((segment, index) => {
44
+ const node = { level: segment.level, text: segment.text, index };
45
+ while (stack.length > 0 && stack[stack.length - 1].level >= node.level) {
46
+ stack.pop();
47
+ }
48
+ if (stack.length > 0) {
49
+ node.parentIndex = stack[stack.length - 1].index;
50
+ }
51
+ stack.push(node);
52
+ headingNodes.push(node);
53
+ });
54
+ const sections = [];
55
+ const pageWords = tokenize(html);
56
+ const frequency = new Map();
57
+ for (const word of pageWords) {
58
+ frequency.set(word, (frequency.get(word) || 0) + 1);
59
+ }
60
+ for (let i = 0; i < segments.length; i += 1) {
61
+ const current = segments[i];
62
+ const next = segments[i + 1];
63
+ const textChunk = html.slice(current.end, next ? next.start : html.length);
64
+ const words = tokenize(textChunk);
65
+ const headingTokens = tokenize(current.text);
66
+ const concentration = headingTokens.reduce((sum, token) => sum + (frequency.get(token) || 0), 0);
67
+ sections.push({
68
+ headingIndex: headingNodes[i]?.index ?? i,
69
+ headingText: current.text,
70
+ words: words.length,
71
+ keywordConcentration: words.length > 0 ? Number((concentration / words.length).toFixed(3)) : 0,
72
+ thin: words.length > 0 && words.length < THIN_SECTION_WORDS,
73
+ duplicateRisk: 0
74
+ });
75
+ }
76
+ const levelCounts = [0, 0, 0, 0, 0, 0];
77
+ headingNodes.forEach((node) => {
78
+ levelCounts[node.level - 1] += 1;
79
+ });
80
+ const entropyScore = Number(calculateEntropy(levelCounts).toFixed(3));
81
+ const missingH1 = levelCounts[0] === 0 ? 1 : 0;
82
+ const multipleH1 = levelCounts[0] > 1 ? 1 : 0;
83
+ let hierarchySkips = 0;
84
+ let reverseJumps = 0;
85
+ let volatilitySum = 0;
86
+ for (let i = 1; i < headingNodes.length; i += 1) {
87
+ const delta = headingNodes[i].level - headingNodes[i - 1].level;
88
+ volatilitySum += Math.abs(delta);
89
+ if (delta > 1) {
90
+ hierarchySkips += 1;
91
+ }
92
+ if (delta < -1) {
93
+ reverseJumps += 1;
94
+ }
95
+ }
96
+ const maxDepth = headingNodes.length ? Math.max(...headingNodes.map((node) => node.level)) : 0;
97
+ const avgDepth = headingNodes.length ? Number((headingNodes.reduce((sum, node) => sum + node.level, 0) / headingNodes.length).toFixed(2)) : 0;
98
+ const headingDensity = pageWords.length ? Number((headingNodes.length / pageWords.length).toFixed(4)) : 0;
99
+ const fragmentation = headingNodes.length ? Number((headingNodes.filter((node) => node.level <= 2).length / headingNodes.length).toFixed(3)) : 0;
100
+ const levelVolatility = headingNodes.length > 1 ? Number((volatilitySum / (headingNodes.length - 1)).toFixed(3)) : 0;
101
+ const h1Nodes = headingNodes.filter((node) => node.level === 1);
102
+ const issues = [];
103
+ if (missingH1)
104
+ issues.push('Missing H1');
105
+ if (multipleH1)
106
+ issues.push('Multiple H1 found');
107
+ if (h1Nodes.some((node) => node.text.length < 6))
108
+ issues.push('Empty or near-empty H1');
109
+ const title = fallbackTitle || getTitleFromHtml(html);
110
+ if (title && h1Nodes[0] && jaccardSimilarity(title, h1Nodes[0].text) < 0.3) {
111
+ issues.push('H1 diverges from <title>');
112
+ }
113
+ if (hierarchySkips > 0)
114
+ issues.push(`${hierarchySkips} hierarchy skips detected`);
115
+ if (reverseJumps > 0)
116
+ issues.push(`${reverseJumps} reverse hierarchy jumps detected`);
117
+ for (const thin of sections.filter((section) => section.thin).slice(0, 2)) {
118
+ issues.push(`Thin section under "${thin.headingText || 'Untitled heading'}"`);
119
+ }
120
+ if (entropyScore > 2.1)
121
+ issues.push('High structural entropy');
122
+ if (fragmentation > 0.65)
123
+ issues.push('Section fragmentation is high');
124
+ const h1Norm = normalizeComparable(h1Nodes[0]?.text || '');
125
+ const h2SetHash = stableHash(headingNodes
126
+ .filter((node) => node.level === 2)
127
+ .map((node) => normalizeComparable(node.text))
128
+ .filter(Boolean)
129
+ .sort()
130
+ .join('|'));
131
+ const patternHash = stableHash(headingNodes.map((node) => node.level).join('>'));
132
+ return {
133
+ url: '',
134
+ headingNodes,
135
+ sections,
136
+ h1Norm,
137
+ h2SetHash,
138
+ patternHash,
139
+ issues,
140
+ metrics: {
141
+ entropy: entropyScore,
142
+ maxDepth,
143
+ avgDepth,
144
+ headingDensity,
145
+ fragmentation,
146
+ levelVolatility,
147
+ hierarchySkips,
148
+ reverseJumps,
149
+ missingH1,
150
+ multipleH1
151
+ }
152
+ };
153
+ }
154
+ /**
155
+ * Enriches section-level duplicate risk by comparing normalized section signatures across pages.
156
+ */
157
+ export function enrichDuplicateRisk(pages) {
158
+ const buckets = new Map();
159
+ for (const page of pages) {
160
+ for (const section of page.sections) {
161
+ const key = stableHash(`${normalizeComparable(section.headingText)}:${section.words}`);
162
+ const bucket = buckets.get(key) || [];
163
+ bucket.push(page.url);
164
+ buckets.set(key, bucket);
165
+ }
166
+ }
167
+ for (const page of pages) {
168
+ for (const section of page.sections) {
169
+ const key = stableHash(`${normalizeComparable(section.headingText)}:${section.words}`);
170
+ const size = (buckets.get(key) || []).length;
171
+ section.duplicateRisk = Number(Math.min(1, (size - 1) / 5).toFixed(3));
172
+ }
173
+ }
174
+ }
175
+ function calculateEntropy(values) {
176
+ const total = values.reduce((a, b) => a + b, 0);
177
+ if (!total) {
178
+ return 0;
179
+ }
180
+ return values.reduce((sum, value) => {
181
+ if (value === 0) {
182
+ return sum;
183
+ }
184
+ const probability = value / total;
185
+ return sum - probability * Math.log2(probability);
186
+ }, 0);
187
+ }
188
+ function getTitleFromHtml(html) {
189
+ const match = html.match(TITLE_PATTERN);
190
+ return match ? normalizeText(match[1]) : '';
191
+ }
192
+ /**
193
+ * Coordinates heading analysis across graph nodes and builds report-safe payloads.
194
+ */
195
+ export class HeadingHealthService {
196
+ /**
197
+ * Builds page-level payloads plus a snapshot summary for every eligible node.
198
+ */
199
+ evaluateNodes(nodes) {
200
+ const analyzedPages = this.collectAnalyses(nodes);
201
+ enrichDuplicateRisk(analyzedPages);
202
+ const exactH1Buckets = this.buildBuckets(analyzedPages, (page) => page.h1Norm);
203
+ const h2SetBuckets = this.buildBuckets(analyzedPages, (page) => page.h2SetHash);
204
+ const patternBuckets = this.buildBuckets(analyzedPages, (page) => page.patternHash);
205
+ const similarH1GroupSizes = this.computeSimilarH1GroupSizes(analyzedPages);
206
+ const payloadsByUrl = new Map();
207
+ let totalScore = 0;
208
+ let totalMissing = 0;
209
+ let totalMultiple = 0;
210
+ let totalSkips = 0;
211
+ let totalReverseJumps = 0;
212
+ let totalThinSections = 0;
213
+ let totalEntropy = 0;
214
+ let poorPages = 0;
215
+ for (const page of analyzedPages) {
216
+ const duplicateH1GroupSize = page.h1Norm ? exactH1Buckets.get(page.h1Norm)?.length || 1 : 0;
217
+ const similarH1GroupSize = similarH1GroupSizes.get(page.url) || 0;
218
+ const identicalH2SetGroupSize = h2SetBuckets.get(page.h2SetHash)?.length || 1;
219
+ const duplicatePatternGroupSize = patternBuckets.get(page.patternHash)?.length || 1;
220
+ const templateRisk = this.computeTemplateRisk(similarH1GroupSize, identicalH2SetGroupSize, duplicatePatternGroupSize);
221
+ const thinSectionCount = page.sections.filter((section) => section.thin).length;
222
+ const health = this.scoreHealth({
223
+ metrics: page.metrics,
224
+ thinSectionCount,
225
+ duplicateH1GroupSize,
226
+ similarH1GroupSize,
227
+ identicalH2SetGroupSize,
228
+ duplicatePatternGroupSize,
229
+ templateRisk,
230
+ issues: page.issues
231
+ });
232
+ if (health.status === 'Poor') {
233
+ poorPages += 1;
234
+ }
235
+ totalScore += health.score;
236
+ totalMissing += page.metrics.missingH1;
237
+ totalMultiple += page.metrics.multipleH1;
238
+ totalSkips += page.metrics.hierarchySkips;
239
+ totalReverseJumps += page.metrics.reverseJumps;
240
+ totalThinSections += thinSectionCount;
241
+ totalEntropy += page.metrics.entropy;
242
+ payloadsByUrl.set(page.url, {
243
+ score: health.score,
244
+ status: health.status,
245
+ issues: health.issues,
246
+ map: page.headingNodes,
247
+ missing_h1: page.metrics.missingH1,
248
+ multiple_h1: page.metrics.multipleH1,
249
+ entropy: page.metrics.entropy,
250
+ max_depth: page.metrics.maxDepth,
251
+ avg_depth: page.metrics.avgDepth,
252
+ heading_density: page.metrics.headingDensity,
253
+ fragmentation: page.metrics.fragmentation,
254
+ volatility: page.metrics.levelVolatility,
255
+ hierarchy_skips: page.metrics.hierarchySkips,
256
+ reverse_jumps: page.metrics.reverseJumps,
257
+ thin_sections: thinSectionCount,
258
+ duplicate_h1_group: duplicateH1GroupSize,
259
+ similar_h1_group: similarH1GroupSize,
260
+ identical_h2_set_group: identicalH2SetGroupSize,
261
+ duplicate_pattern_group: duplicatePatternGroupSize,
262
+ template_risk: templateRisk
263
+ });
264
+ }
265
+ const evaluatedPages = analyzedPages.length;
266
+ const summary = {
267
+ avgScore: evaluatedPages ? Math.round(totalScore / evaluatedPages) : 0,
268
+ evaluatedPages,
269
+ totalMissing,
270
+ totalMultiple,
271
+ totalSkips,
272
+ totalReverseJumps,
273
+ totalThinSections,
274
+ avgEntropy: evaluatedPages ? Number((totalEntropy / evaluatedPages).toFixed(3)) : 0,
275
+ poorPages
276
+ };
277
+ return { payloadsByUrl, summary };
278
+ }
279
+ collectAnalyses(nodes) {
280
+ const analyses = [];
281
+ for (const node of nodes) {
282
+ if (node.status < 200 || node.status >= 300 || !node.html || !node.url) {
283
+ continue;
284
+ }
285
+ const analysis = analyzeHeadingHealth(node.html, node.title || node.rawTitle);
286
+ analysis.url = node.url;
287
+ analyses.push(analysis);
288
+ }
289
+ return analyses;
290
+ }
291
+ buildBuckets(pages, selector) {
292
+ const buckets = new Map();
293
+ for (const page of pages) {
294
+ const key = selector(page);
295
+ if (!key) {
296
+ continue;
297
+ }
298
+ buckets.set(key, [...(buckets.get(key) || []), page.url]);
299
+ }
300
+ return buckets;
301
+ }
302
+ computeSimilarH1GroupSizes(pages) {
303
+ const uniqueH1 = Array.from(new Set(pages.map((page) => page.h1Norm).filter(Boolean)));
304
+ const similarBuckets = new Map();
305
+ for (const h1 of uniqueH1) {
306
+ similarBuckets.set(h1, new Set([h1]));
307
+ }
308
+ for (let i = 0; i < uniqueH1.length; i += 1) {
309
+ for (let j = i + 1; j < uniqueH1.length; j += 1) {
310
+ const a = uniqueH1[i];
311
+ const b = uniqueH1[j];
312
+ if (jaccardSimilarity(a, b) >= 0.7) {
313
+ similarBuckets.get(a)?.add(b);
314
+ similarBuckets.get(b)?.add(a);
315
+ }
316
+ }
317
+ }
318
+ const groupSizes = new Map();
319
+ for (const page of pages) {
320
+ groupSizes.set(page.url, similarBuckets.get(page.h1Norm)?.size || (page.h1Norm ? 1 : 0));
321
+ }
322
+ return groupSizes;
323
+ }
324
+ scoreHealth(input) {
325
+ let score = 100;
326
+ const metrics = input.metrics;
327
+ if (metrics.missingH1)
328
+ score -= 20;
329
+ if (metrics.multipleH1)
330
+ score -= 6;
331
+ score -= metrics.hierarchySkips * 8;
332
+ score -= metrics.reverseJumps * 6;
333
+ score -= Math.round(metrics.entropy * 7);
334
+ score -= Math.round(metrics.fragmentation * 20);
335
+ score -= Math.round(metrics.levelVolatility * 6);
336
+ score -= input.thinSectionCount * 4;
337
+ if (input.duplicateH1GroupSize > 1)
338
+ score -= Math.min(16, (input.duplicateH1GroupSize - 1) * 3);
339
+ if (input.similarH1GroupSize > 1)
340
+ score -= Math.min(8, (input.similarH1GroupSize - 1) * 2);
341
+ if (input.identicalH2SetGroupSize > 1)
342
+ score -= Math.min(10, (input.identicalH2SetGroupSize - 1) * 2);
343
+ if (input.duplicatePatternGroupSize > 1)
344
+ score -= Math.min(12, (input.duplicatePatternGroupSize - 1) * 2);
345
+ score -= Math.round(input.templateRisk * 12);
346
+ score = Math.max(0, Math.min(100, score));
347
+ return {
348
+ score,
349
+ status: score >= 80 ? 'Healthy' : score >= 55 ? 'Moderate' : 'Poor',
350
+ issues: input.issues
351
+ };
352
+ }
353
+ computeTemplateRisk(similar, h2set, pattern) {
354
+ return Number(Math.max(0, Math.min(1, ((similar - 1) * 0.15) + ((h2set - 1) * 0.2) + ((pattern - 1) * 0.2))).toFixed(3));
355
+ }
356
+ }
@@ -3,4 +3,4 @@ export interface ImageAltAnalysis {
3
3
  missingAlt: number;
4
4
  emptyAlt: number;
5
5
  }
6
- export declare function analyzeImageAlts(html: string): ImageAltAnalysis;
6
+ export declare function analyzeImageAlts($: any): ImageAltAnalysis;
@@ -1,10 +1,11 @@
1
1
  import { load } from 'cheerio';
2
- export function analyzeImageAlts(html) {
3
- const $ = load(html);
2
+ export function analyzeImageAlts($) {
3
+ const isString = typeof $ === 'string';
4
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
4
5
  let missingAlt = 0;
5
6
  let emptyAlt = 0;
6
- $('img').each((_idx, el) => {
7
- const alt = $(el).attr('alt');
7
+ cheerioObj('img').each((_idx, el) => {
8
+ const alt = cheerioObj(el).attr('alt');
8
9
  if (alt === undefined) {
9
10
  missingAlt += 1;
10
11
  return;
@@ -13,6 +14,6 @@ export function analyzeImageAlts(html) {
13
14
  emptyAlt += 1;
14
15
  }
15
16
  });
16
- const totalImages = $('img').length;
17
+ const totalImages = cheerioObj('img').length;
17
18
  return { totalImages, missingAlt, emptyAlt };
18
19
  }
@@ -4,4 +4,4 @@ export interface LinkRatioAnalysis {
4
4
  nofollowCount: number;
5
5
  externalRatio: number;
6
6
  }
7
- export declare function analyzeLinks(html: string, pageUrl: string, rootUrl: string): LinkRatioAnalysis;
7
+ export declare function analyzeLinks($: any, pageUrl: string, rootUrl: string): LinkRatioAnalysis;
@@ -1,23 +1,23 @@
1
1
  import { load } from 'cheerio';
2
- import { normalizeUrl } from '../crawler/normalize.js';
3
- export function analyzeLinks(html, pageUrl, rootUrl) {
4
- const $ = load(html);
5
- const rootOrigin = new URL(rootUrl).origin;
2
+ import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
3
+ export function analyzeLinks($, pageUrl, rootUrl) {
4
+ const isString = typeof $ === 'string';
5
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
6
6
  let internalLinks = 0;
7
7
  let externalLinks = 0;
8
8
  let nofollowCount = 0;
9
- $('a[href]').each((_idx, el) => {
10
- const href = $(el).attr('href');
9
+ cheerioObj('a[href]').each((_idx, el) => {
10
+ const href = cheerioObj(el).attr('href');
11
11
  if (!href)
12
12
  return;
13
13
  const normalized = normalizeUrl(href, pageUrl, { stripQuery: false });
14
14
  if (!normalized)
15
15
  return;
16
- const rel = ($(el).attr('rel') || '').toLowerCase();
16
+ const rel = (cheerioObj(el).attr('rel') || '').toLowerCase();
17
17
  if (rel.includes('nofollow')) {
18
18
  nofollowCount += 1;
19
19
  }
20
- if (new URL(normalized).origin === rootOrigin) {
20
+ if (UrlUtil.isInternal(normalized, rootUrl)) {
21
21
  internalLinks += 1;
22
22
  }
23
23
  else {
@@ -1,26 +1,15 @@
1
- export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
2
- export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
3
- export interface SitegraphNode {
4
- url: string;
5
- depth: number;
6
- inLinks: number;
7
- outLinks: number;
8
- status: number;
9
- discoveredViaSitemap?: boolean;
10
- robotsExcluded?: boolean;
11
- canonicalUrl?: string;
12
- isHomepage?: boolean;
13
- wordCount?: number;
14
- hasStructuredData?: boolean;
1
+ import type { GraphNode, GraphEdge } from '../graph/graph.js';
2
+ export interface ExtendedGraphNode extends GraphNode {
15
3
  pageType?: string;
16
- noindex?: boolean;
17
- duplicateContent?: boolean;
4
+ hasStructuredData?: boolean;
18
5
  isProductOrCommercial?: boolean;
6
+ duplicateContent?: boolean;
7
+ isHomepage?: boolean;
8
+ robotsExcluded?: boolean;
9
+ discoveredViaSitemap?: boolean;
19
10
  }
20
- export interface SitegraphEdge {
21
- source: string;
22
- target: string;
23
- }
11
+ export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
12
+ export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
24
13
  export interface OrphanScoringOptions {
25
14
  enabled: boolean;
26
15
  severityEnabled: boolean;
@@ -28,12 +17,12 @@ export interface OrphanScoringOptions {
28
17
  minInbound: number;
29
18
  rootUrl?: string;
30
19
  }
31
- export type AnnotatedNode = SitegraphNode & {
20
+ export type AnnotatedNode = GraphNode & {
32
21
  orphan: boolean;
33
22
  orphanType?: OrphanType;
34
23
  orphanSeverity?: number;
35
24
  impactLevel?: ImpactLevel;
36
25
  };
37
26
  export declare function mapImpactLevel(score: number): ImpactLevel;
38
- export declare function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number;
39
- export declare function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
27
+ export declare function calculateOrphanSeverity(orphanType: OrphanType, node: ExtendedGraphNode): number;
28
+ export declare function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
@@ -33,7 +33,8 @@ export function calculateOrphanSeverity(orphanType, node) {
33
33
  score = 90;
34
34
  break;
35
35
  case 'crawl-only':
36
- score = 80;
36
+ // Sitemap-only URLs are less severe if we haven't even tried to crawl them yet.
37
+ score = node.status === 0 ? 50 : 70;
37
38
  break;
38
39
  case 'near':
39
40
  score = node.inLinks <= 1 ? 70 : 60;
@@ -64,12 +65,17 @@ export function calculateOrphanSeverity(orphanType, node) {
64
65
  negativeModifier = Math.min(20, negativeModifier);
65
66
  score += positiveModifier;
66
67
  score -= negativeModifier;
68
+ // Safety: unvisited nodes should not be flagged as high-severity orphans
69
+ // because we haven't confirmed they are real pages or fully explored their context.
70
+ if (node.status === 0) {
71
+ score = Math.min(score, 60);
72
+ }
67
73
  return clampScore(score);
68
74
  }
69
75
  function consolidateInboundByCanonical(nodes) {
70
76
  const canonicalInbound = new Map();
71
77
  for (const node of nodes) {
72
- const canonical = node.canonicalUrl || node.url;
78
+ const canonical = node.canonical || node.url;
73
79
  canonicalInbound.set(canonical, (canonicalInbound.get(canonical) || 0) + node.inLinks);
74
80
  }
75
81
  return canonicalInbound;
@@ -85,7 +91,7 @@ export function annotateOrphans(nodes, edges, options) {
85
91
  if (isHomepage || node.robotsExcluded) {
86
92
  return { ...node, orphan: false };
87
93
  }
88
- const canonical = node.canonicalUrl || node.url;
94
+ const canonical = node.canonical || node.url;
89
95
  const inbound = canonicalInbound.get(canonical) || 0;
90
96
  let orphanType;
91
97
  if (inbound === 0) {
@@ -1,6 +1,12 @@
1
1
  export function scorePageSeo(page) {
2
+ if (page.meta.crawlStatus === 'blocked_by_robots') {
3
+ return 0;
4
+ }
2
5
  const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
3
- const h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
6
+ let h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
7
+ if (page.headingScore !== undefined && page.headingScore !== null) {
8
+ h1 = page.headingScore;
9
+ }
4
10
  const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
5
11
  const thin = 100 - page.thinScore;
6
12
  const imageDen = Math.max(1, page.images.totalImages);
@@ -33,7 +39,10 @@ export function aggregateSiteScore(metrics, pages) {
33
39
  const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);
34
40
  const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
35
41
  const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
36
- const overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
42
+ let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
43
+ if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
44
+ overallScore = 0;
45
+ }
37
46
  return {
38
47
  seoHealthScore: Number(seoHealthScore.toFixed(2)),
39
48
  authorityEntropyOrphanScore: Number(authorityEntropyOrphanScore.toFixed(2)),
@@ -8,8 +8,12 @@ export interface H1Analysis {
8
8
  count: number;
9
9
  status: 'ok' | 'critical' | 'warning';
10
10
  matchesTitle: boolean;
11
+ value: string | null;
11
12
  }
12
- export declare function analyzeTitle(html: string): TextFieldAnalysis;
13
- export declare function analyzeMetaDescription(html: string): TextFieldAnalysis;
14
- export declare function applyDuplicateStatuses<T extends TextFieldAnalysis>(fields: T[]): T[];
15
- export declare function analyzeH1(html: string, titleValue: string | null): H1Analysis;
13
+ export declare function analyzeTitle($: any): TextFieldAnalysis;
14
+ export declare function analyzeMetaDescription($: any): TextFieldAnalysis;
15
+ export declare function analyzeH1($: any, titleValue: string | null): H1Analysis;
16
+ export declare function applyDuplicateStatuses<T extends {
17
+ value: string | null;
18
+ status: string;
19
+ }>(items: T[]): T[];